// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}
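	/*
	 * Swap in the freshly allocated batch object; the previous one may
	 * still be busy spinning for an earlier hanging request, and is
	 * only released once that request retires.
	 */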
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
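/*
 * The batch emitted above writes rq->fence.seqno into this request's slot
 * in the HWS page and then spins in place: a stretch of MI_NOOPs followed
 * by a MI_BATCH_BUFFER_START that jumps back to the start of the batch.
 * The trailing MI_BATCH_BUFFER_END is only reached once someone rewrites
 * the first dword of the batch (see hang_fini() and igt_hang_sanitycheck()).
 * Each context uses a distinct dword of the HWS page (see hws_address()),
 * so hws_seqno() below can poll whether a given hanging request has
 * actually started executing.
 */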
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}
		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
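			/*
			 * Reset the engine while the nop requests are in
			 * flight: only the per-engine reset count may
			 * advance, a full GPU reset here is a test failure.
			 */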
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);
					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

skip:
			if (rq)
				i915_request_put(rq);
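			/*
			 * An engine reset must not be promoted to a full GPU
			 * reset: the global reset count has to stay the same,
			 * and only the per-engine count may advance (GuC
			 * based resets are not logged per engine at all).
			 */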
			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
			while (--count)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}
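	/*
	 * Drain whatever is still in flight: wait for (or fail) each
	 * outstanding request, keeping the first error seen, and drop
	 * the contexts.
	 */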
	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable_no_pm(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}
		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0) {
		err = i915_vma_move_to_active(arg.vma, rq, flags);
		if (err)
			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
	} else {
		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
	}

	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);
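	/*
	 * Once the hang is confirmed running, report it via
	 * intel_gt_handle_error() with error capture suppressed and check
	 * that the guilty request is marked with -EIO.
	 */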
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;
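	/*
	 * igt_atomic_phases provides the critical sections to test under
	 * (e.g. with preemption, softirqs or hardirqs disabled); exercise
	 * an engine reset inside each of them.
	 */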
	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = to_gt(i915);
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}