// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
652 "%s(%s): failed to complete request\n", 653 __func__, 654 engine->name); 655 656 GEM_TRACE("%s(%s): failed to complete request\n", 657 __func__, 658 engine->name); 659 GEM_TRACE_DUMP(); 660 661 err = -EIO; 662 } 663 i915_request_put(last); 664 } 665 count++; 666 } while (err == 0 && time_before(jiffies, end_time)); 667 out: 668 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 669 skip: 670 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 671 st_engine_heartbeat_enable(engine); 672 intel_context_put(ce); 673 674 if (igt_flush_test(gt->i915)) 675 err = -EIO; 676 if (err) 677 return err; 678 } 679 680 return 0; 681 } 682 683 static int __igt_reset_engine(struct intel_gt *gt, bool active) 684 { 685 struct i915_gpu_error *global = >->i915->gpu_error; 686 struct intel_engine_cs *engine; 687 enum intel_engine_id id; 688 struct hang h; 689 int err = 0; 690 691 /* Check that we can issue an engine reset on an idle engine (no-op) */ 692 693 if (!intel_has_reset_engine(gt)) 694 return 0; 695 696 if (active) { 697 err = hang_init(&h, gt); 698 if (err) 699 return err; 700 } 701 702 for_each_engine(engine, gt, id) { 703 unsigned int reset_count, reset_engine_count; 704 unsigned long count; 705 bool using_guc = intel_engine_uses_guc(engine); 706 IGT_TIMEOUT(end_time); 707 708 if (using_guc && !active) 709 continue; 710 711 if (active && !intel_engine_can_store_dword(engine)) 712 continue; 713 714 if (!wait_for_idle(engine)) { 715 pr_err("%s failed to idle before reset\n", 716 engine->name); 717 err = -EIO; 718 break; 719 } 720 721 reset_count = i915_reset_count(global); 722 reset_engine_count = i915_reset_engine_count(global, engine); 723 724 st_engine_heartbeat_disable(engine); 725 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 726 >->reset.flags)); 727 count = 0; 728 do { 729 struct i915_request *rq = NULL; 730 struct intel_selftest_saved_policy saved; 731 int err2; 732 733 err = intel_selftest_modify_policy(engine, &saved, 734 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 735 if (err) { 736 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 737 break; 738 } 739 740 if (active) { 741 rq = hang_create_request(&h, engine); 742 if (IS_ERR(rq)) { 743 err = PTR_ERR(rq); 744 pr_err("[%s] Create hang request failed: %d!\n", 745 engine->name, err); 746 goto restore; 747 } 748 749 i915_request_get(rq); 750 i915_request_add(rq); 751 752 if (!wait_until_running(&h, rq)) { 753 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 754 755 pr_err("%s: Failed to start request %llx, at %x\n", 756 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 757 intel_engine_dump(engine, &p, 758 "%s\n", engine->name); 759 760 i915_request_put(rq); 761 err = -EIO; 762 goto restore; 763 } 764 } 765 766 if (!using_guc) { 767 err = intel_engine_reset(engine, NULL); 768 if (err) { 769 pr_err("intel_engine_reset(%s) failed, err:%d\n", 770 engine->name, err); 771 goto skip; 772 } 773 } 774 775 if (rq) { 776 /* Ensure the reset happens and kills the engine */ 777 err = intel_selftest_wait_for_rq(rq); 778 if (err) 779 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 780 engine->name, rq->fence.context, 781 rq->fence.seqno, rq->context->guc_id.id, err); 782 } 783 784 skip: 785 if (rq) 786 i915_request_put(rq); 787 788 if (i915_reset_count(global) != reset_count) { 789 pr_err("Full GPU reset recorded! 
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
	bool stop;
	int result;
};

#define TEST_ACTIVE BIT(0)
#define TEST_OTHERS BIT(1)
#define TEST_SELF BIT(2)
#define TEST_PRIORITY BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static void active_engine(struct kthread_work *work)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = container_of(work, typeof(*arg), work);
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			arg->result = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n",
			       engine->name, count, arg->result);
			while (--count)
				intel_context_put(ce[count]);
			return;
		}
	}

	count = 0;
	while (!READ_ONCE(arg->stop)) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	arg->result = err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	struct active_engine *threads;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	for_each_engine(engine, gt, id) {
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
		for_each_engine(other, gt, tmp) {
			struct kthread_worker *worker;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			worker = kthread_create_worker(0, "igt/%s",
						       other->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				pr_err("[%s] Worker create failed: %d!\n",
				       engine->name, err);
				goto unwind;
			}

			threads[tmp].worker = worker;

			kthread_init_work(&threads[tmp].work, active_engine);
			kthread_queue_work(threads[tmp].worker,
					   &threads[tmp].work);
		}

		st_engine_heartbeat_disable_no_pm(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].worker)
				continue;

			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}

			kthread_destroy_worker(threads[tmp].worker);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}
	kfree(threads);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine;
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
	if (err)
		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = to_gt(i915);
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}