1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 10 #include "intel_gt.h" 11 #include "intel_engine_heartbeat.h" 12 #include "intel_engine_pm.h" 13 #include "selftest_engine_heartbeat.h" 14 15 #include "i915_selftest.h" 16 #include "selftests/i915_random.h" 17 #include "selftests/igt_flush_test.h" 18 #include "selftests/igt_reset.h" 19 #include "selftests/igt_atomic.h" 20 21 #include "selftests/mock_drm.h" 22 23 #include "gem/selftests/mock_context.h" 24 #include "gem/selftests/igt_gem_utils.h" 25 26 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 27 28 struct hang { 29 struct intel_gt *gt; 30 struct drm_i915_gem_object *hws; 31 struct drm_i915_gem_object *obj; 32 struct i915_gem_context *ctx; 33 u32 *seqno; 34 u32 *batch; 35 }; 36 37 static int hang_init(struct hang *h, struct intel_gt *gt) 38 { 39 void *vaddr; 40 int err; 41 42 memset(h, 0, sizeof(*h)); 43 h->gt = gt; 44 45 h->ctx = kernel_context(gt->i915, NULL); 46 if (IS_ERR(h->ctx)) 47 return PTR_ERR(h->ctx); 48 49 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 50 51 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 52 if (IS_ERR(h->hws)) { 53 err = PTR_ERR(h->hws); 54 goto err_ctx; 55 } 56 57 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 58 if (IS_ERR(h->obj)) { 59 err = PTR_ERR(h->obj); 60 goto err_hws; 61 } 62 63 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 64 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 65 if (IS_ERR(vaddr)) { 66 err = PTR_ERR(vaddr); 67 goto err_obj; 68 } 69 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 70 71 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 72 i915_coherent_map_type(gt->i915, h->obj, false)); 73 if (IS_ERR(vaddr)) { 74 err = PTR_ERR(vaddr); 75 goto err_unpin_hws; 76 } 77 h->batch = vaddr; 78 79 return 0; 80 81 err_unpin_hws: 82 i915_gem_object_unpin_map(h->hws); 83 err_obj: 84 i915_gem_object_put(h->obj); 85 err_hws: 86 i915_gem_object_put(h->hws); 87 err_ctx: 88 kernel_context_close(h->ctx); 89 return err; 90 } 91 92 static u64 hws_address(const struct i915_vma *hws, 93 const struct i915_request *rq) 94 { 95 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 96 } 97 98 static int move_to_active(struct i915_vma *vma, 99 struct i915_request *rq, 100 unsigned int flags) 101 { 102 int err; 103 104 i915_vma_lock(vma); 105 err = i915_request_await_object(rq, vma->obj, 106 flags & EXEC_OBJECT_WRITE); 107 if (err == 0) 108 err = i915_vma_move_to_active(vma, rq, flags); 109 i915_vma_unlock(vma); 110 111 return err; 112 } 113 114 static struct i915_request * 115 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 116 { 117 struct intel_gt *gt = h->gt; 118 struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx); 119 struct drm_i915_gem_object *obj; 120 struct i915_request *rq = NULL; 121 struct i915_vma *hws, *vma; 122 unsigned int flags; 123 void *vaddr; 124 u32 *batch; 125 int err; 126 127 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 128 if (IS_ERR(obj)) { 129 i915_vm_put(vm); 130 return ERR_CAST(obj); 131 } 132 133 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 134 if (IS_ERR(vaddr)) { 135 i915_gem_object_put(obj); 136 i915_vm_put(vm); 137 return ERR_CAST(vaddr); 138 } 139 140 i915_gem_object_unpin_map(h->obj); 141 i915_gem_object_put(h->obj); 142 143 h->obj = obj; 144 h->batch = vaddr; 145 146 vma = i915_vma_instance(h->obj, vm, NULL); 147 if (IS_ERR(vma)) { 148 i915_vm_put(vm); 149 return ERR_CAST(vma); 150 } 151 152 hws = i915_vma_instance(h->hws, vm, NULL); 153 if (IS_ERR(hws)) { 154 i915_vm_put(vm); 155 return ERR_CAST(hws); 156 } 157 158 err = i915_vma_pin(vma, 0, 0, PIN_USER); 159 if (err) { 160 i915_vm_put(vm); 161 return ERR_PTR(err); 162 } 163 164 err = i915_vma_pin(hws, 0, 0, PIN_USER); 165 if (err) 166 goto unpin_vma; 167 168 rq = igt_request_alloc(h->ctx, engine); 169 if (IS_ERR(rq)) { 170 err = PTR_ERR(rq); 171 goto unpin_hws; 172 } 173 174 err = move_to_active(vma, rq, 0); 175 if (err) 176 goto cancel_rq; 177 178 err = move_to_active(hws, rq, 0); 179 if (err) 180 goto cancel_rq; 181 182 batch = h->batch; 183 if (GRAPHICS_VER(gt->i915) >= 8) { 184 *batch++ = MI_STORE_DWORD_IMM_GEN4; 185 *batch++ = lower_32_bits(hws_address(hws, rq)); 186 *batch++ = upper_32_bits(hws_address(hws, rq)); 187 *batch++ = rq->fence.seqno; 188 *batch++ = MI_NOOP; 189 190 memset(batch, 0, 1024); 191 batch += 1024 / sizeof(*batch); 192 193 *batch++ = MI_NOOP; 194 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 195 *batch++ = lower_32_bits(vma->node.start); 196 *batch++ = upper_32_bits(vma->node.start); 197 } else if (GRAPHICS_VER(gt->i915) >= 6) { 198 *batch++ = MI_STORE_DWORD_IMM_GEN4; 199 *batch++ = 0; 200 *batch++ = lower_32_bits(hws_address(hws, rq)); 201 *batch++ = rq->fence.seqno; 202 *batch++ = MI_NOOP; 203 204 memset(batch, 0, 1024); 205 batch += 1024 / sizeof(*batch); 206 207 *batch++ = MI_NOOP; 208 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 209 *batch++ = lower_32_bits(vma->node.start); 210 } else if (GRAPHICS_VER(gt->i915) >= 4) { 211 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 212 *batch++ = 0; 213 *batch++ = lower_32_bits(hws_address(hws, rq)); 214 *batch++ = rq->fence.seqno; 215 *batch++ = MI_NOOP; 216 217 memset(batch, 0, 1024); 218 batch += 1024 / sizeof(*batch); 219 220 *batch++ = MI_NOOP; 221 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 222 *batch++ = lower_32_bits(vma->node.start); 223 } else { 224 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 225 *batch++ = lower_32_bits(hws_address(hws, rq)); 226 *batch++ = rq->fence.seqno; 227 *batch++ = MI_NOOP; 228 229 memset(batch, 0, 1024); 230 batch += 1024 / sizeof(*batch); 231 232 *batch++ = MI_NOOP; 233 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 234 *batch++ = lower_32_bits(vma->node.start); 235 } 236 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 237 intel_gt_chipset_flush(engine->gt); 238 239 if (rq->engine->emit_init_breadcrumb) { 240 err = rq->engine->emit_init_breadcrumb(rq); 241 if (err) 242 goto cancel_rq; 243 } 244 245 flags = 0; 246 if (GRAPHICS_VER(gt->i915) <= 5) 247 flags |= I915_DISPATCH_SECURE; 248 249 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 250 251 cancel_rq: 252 if (err) { 253 i915_request_set_error_once(rq, err); 254 i915_request_add(rq); 255 } 256 unpin_hws: 257 i915_vma_unpin(hws); 258 unpin_vma: 259 i915_vma_unpin(vma); 260 i915_vm_put(vm); 261 return err ? ERR_PTR(err) : rq; 262 } 263 264 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 265 { 266 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 267 } 268 269 static void hang_fini(struct hang *h) 270 { 271 *h->batch = MI_BATCH_BUFFER_END; 272 intel_gt_chipset_flush(h->gt); 273 274 i915_gem_object_unpin_map(h->obj); 275 i915_gem_object_put(h->obj); 276 277 i915_gem_object_unpin_map(h->hws); 278 i915_gem_object_put(h->hws); 279 280 kernel_context_close(h->ctx); 281 282 igt_flush_test(h->gt->i915); 283 } 284 285 static bool wait_until_running(struct hang *h, struct i915_request *rq) 286 { 287 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 288 rq->fence.seqno), 289 10) && 290 wait_for(i915_seqno_passed(hws_seqno(h, rq), 291 rq->fence.seqno), 292 1000)); 293 } 294 295 static int igt_hang_sanitycheck(void *arg) 296 { 297 struct intel_gt *gt = arg; 298 struct i915_request *rq; 299 struct intel_engine_cs *engine; 300 enum intel_engine_id id; 301 struct hang h; 302 int err; 303 304 /* Basic check that we can execute our hanging batch */ 305 306 err = hang_init(&h, gt); 307 if (err) 308 return err; 309 310 for_each_engine(engine, gt, id) { 311 struct intel_wedge_me w; 312 long timeout; 313 314 if (!intel_engine_can_store_dword(engine)) 315 continue; 316 317 rq = hang_create_request(&h, engine); 318 if (IS_ERR(rq)) { 319 err = PTR_ERR(rq); 320 pr_err("Failed to create request for %s, err=%d\n", 321 engine->name, err); 322 goto fini; 323 } 324 325 i915_request_get(rq); 326 327 *h.batch = MI_BATCH_BUFFER_END; 328 intel_gt_chipset_flush(engine->gt); 329 330 i915_request_add(rq); 331 332 timeout = 0; 333 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 334 timeout = i915_request_wait(rq, 0, 335 MAX_SCHEDULE_TIMEOUT); 336 if (intel_gt_is_wedged(gt)) 337 timeout = -EIO; 338 339 i915_request_put(rq); 340 341 if (timeout < 0) { 342 err = timeout; 343 pr_err("Wait for request failed on %s, err=%d\n", 344 engine->name, err); 345 goto fini; 346 } 347 } 348 349 fini: 350 hang_fini(&h); 351 return err; 352 } 353 354 static bool wait_for_idle(struct intel_engine_cs *engine) 355 { 356 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 357 } 358 359 static int igt_reset_nop(void *arg) 360 { 361 struct intel_gt *gt = arg; 362 struct i915_gpu_error *global = >->i915->gpu_error; 363 struct intel_engine_cs *engine; 364 unsigned int reset_count, count; 365 enum intel_engine_id id; 366 IGT_TIMEOUT(end_time); 367 int err = 0; 368 369 /* Check that we can reset during non-user portions of requests */ 370 371 reset_count = i915_reset_count(global); 372 count = 0; 373 do { 374 for_each_engine(engine, gt, id) { 375 struct intel_context *ce; 376 int i; 377 378 ce = intel_context_create(engine); 379 if (IS_ERR(ce)) { 380 err = PTR_ERR(ce); 381 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 382 break; 383 } 384 385 for (i = 0; i < 16; i++) { 386 struct i915_request *rq; 387 388 rq = intel_context_create_request(ce); 389 if (IS_ERR(rq)) { 390 err = PTR_ERR(rq); 391 pr_err("[%s] Create request failed: %d!\n", 392 engine->name, err); 393 break; 394 } 395 396 i915_request_add(rq); 397 } 398 399 intel_context_put(ce); 400 } 401 402 igt_global_reset_lock(gt); 403 intel_gt_reset(gt, ALL_ENGINES, NULL); 404 igt_global_reset_unlock(gt); 405 406 if (intel_gt_is_wedged(gt)) { 407 pr_err("[%s] GT is wedged!\n", engine->name); 408 err = -EIO; 409 break; 410 } 411 412 if (i915_reset_count(global) != reset_count + ++count) { 413 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 414 engine->name, i915_reset_count(global), reset_count, count); 415 err = -EINVAL; 416 break; 417 } 418 419 err = igt_flush_test(gt->i915); 420 if (err) { 421 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 422 break; 423 } 424 } while (time_before(jiffies, end_time)); 425 pr_info("%s: %d resets\n", __func__, count); 426 427 if (igt_flush_test(gt->i915)) { 428 pr_err("Post flush failed: %d!\n", err); 429 err = -EIO; 430 } 431 432 return err; 433 } 434 435 static int igt_reset_nop_engine(void *arg) 436 { 437 struct intel_gt *gt = arg; 438 struct i915_gpu_error *global = >->i915->gpu_error; 439 struct intel_engine_cs *engine; 440 enum intel_engine_id id; 441 442 /* Check that we can engine-reset during non-user portions */ 443 444 if (!intel_has_reset_engine(gt)) 445 return 0; 446 447 for_each_engine(engine, gt, id) { 448 unsigned int reset_count, reset_engine_count, count; 449 struct intel_context *ce; 450 IGT_TIMEOUT(end_time); 451 int err; 452 453 ce = intel_context_create(engine); 454 if (IS_ERR(ce)) { 455 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 456 return PTR_ERR(ce); 457 } 458 459 reset_count = i915_reset_count(global); 460 reset_engine_count = i915_reset_engine_count(global, engine); 461 count = 0; 462 463 st_engine_heartbeat_disable(engine); 464 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 465 do { 466 int i; 467 468 if (!wait_for_idle(engine)) { 469 pr_err("%s failed to idle before reset\n", 470 engine->name); 471 err = -EIO; 472 break; 473 } 474 475 for (i = 0; i < 16; i++) { 476 struct i915_request *rq; 477 478 rq = intel_context_create_request(ce); 479 if (IS_ERR(rq)) { 480 struct drm_printer p = 481 drm_info_printer(gt->i915->drm.dev); 482 intel_engine_dump(engine, &p, 483 "%s(%s): failed to submit request\n", 484 __func__, 485 engine->name); 486 487 GEM_TRACE("%s(%s): failed to submit request\n", 488 __func__, 489 engine->name); 490 GEM_TRACE_DUMP(); 491 492 intel_gt_set_wedged(gt); 493 494 err = PTR_ERR(rq); 495 break; 496 } 497 498 i915_request_add(rq); 499 } 500 err = intel_engine_reset(engine, NULL); 501 if (err) { 502 pr_err("intel_engine_reset(%s) failed, err:%d\n", 503 engine->name, err); 504 break; 505 } 506 507 if (i915_reset_count(global) != reset_count) { 508 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 509 err = -EINVAL; 510 break; 511 } 512 513 if (i915_reset_engine_count(global, engine) != 514 reset_engine_count + ++count) { 515 pr_err("%s engine reset not recorded!\n", 516 engine->name); 517 err = -EINVAL; 518 break; 519 } 520 } while (time_before(jiffies, end_time)); 521 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 522 st_engine_heartbeat_enable(engine); 523 524 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 525 526 intel_context_put(ce); 527 if (igt_flush_test(gt->i915)) 528 err = -EIO; 529 if (err) 530 return err; 531 } 532 533 return 0; 534 } 535 536 static void force_reset_timeout(struct intel_engine_cs *engine) 537 { 538 engine->reset_timeout.probability = 999; 539 atomic_set(&engine->reset_timeout.times, -1); 540 } 541 542 static void cancel_reset_timeout(struct intel_engine_cs *engine) 543 { 544 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 545 } 546 547 static int igt_reset_fail_engine(void *arg) 548 { 549 struct intel_gt *gt = arg; 550 struct intel_engine_cs *engine; 551 enum intel_engine_id id; 552 553 /* Check that we can recover from engine-reset failues */ 554 555 if (!intel_has_reset_engine(gt)) 556 return 0; 557 558 for_each_engine(engine, gt, id) { 559 unsigned int count; 560 struct intel_context *ce; 561 IGT_TIMEOUT(end_time); 562 int err; 563 564 ce = intel_context_create(engine); 565 if (IS_ERR(ce)) { 566 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 567 return PTR_ERR(ce); 568 } 569 570 st_engine_heartbeat_disable(engine); 571 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 572 573 force_reset_timeout(engine); 574 err = intel_engine_reset(engine, NULL); 575 cancel_reset_timeout(engine); 576 if (err == 0) /* timeouts only generated on gen8+ */ 577 goto skip; 578 579 count = 0; 580 do { 581 struct i915_request *last = NULL; 582 int i; 583 584 if (!wait_for_idle(engine)) { 585 pr_err("%s failed to idle before reset\n", 586 engine->name); 587 err = -EIO; 588 break; 589 } 590 591 for (i = 0; i < count % 15; i++) { 592 struct i915_request *rq; 593 594 rq = intel_context_create_request(ce); 595 if (IS_ERR(rq)) { 596 struct drm_printer p = 597 drm_info_printer(gt->i915->drm.dev); 598 intel_engine_dump(engine, &p, 599 "%s(%s): failed to submit request\n", 600 __func__, 601 engine->name); 602 603 GEM_TRACE("%s(%s): failed to submit request\n", 604 __func__, 605 engine->name); 606 GEM_TRACE_DUMP(); 607 608 intel_gt_set_wedged(gt); 609 if (last) 610 i915_request_put(last); 611 612 err = PTR_ERR(rq); 613 goto out; 614 } 615 616 if (last) 617 i915_request_put(last); 618 last = i915_request_get(rq); 619 i915_request_add(rq); 620 } 621 622 if (count & 1) { 623 err = intel_engine_reset(engine, NULL); 624 if (err) { 625 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 626 engine->name, err); 627 GEM_TRACE_DUMP(); 628 i915_request_put(last); 629 break; 630 } 631 } else { 632 force_reset_timeout(engine); 633 err = intel_engine_reset(engine, NULL); 634 cancel_reset_timeout(engine); 635 if (err != -ETIMEDOUT) { 636 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 637 engine->name, err); 638 i915_request_put(last); 639 break; 640 } 641 } 642 643 err = 0; 644 if (last) { 645 if (i915_request_wait(last, 0, HZ / 2) < 0) { 646 struct drm_printer p = 647 drm_info_printer(gt->i915->drm.dev); 648 649 intel_engine_dump(engine, &p, 650 "%s(%s): failed to complete request\n", 651 __func__, 652 engine->name); 653 654 GEM_TRACE("%s(%s): failed to complete request\n", 655 __func__, 656 engine->name); 657 GEM_TRACE_DUMP(); 658 659 err = -EIO; 660 } 661 i915_request_put(last); 662 } 663 count++; 664 } while (err == 0 && time_before(jiffies, end_time)); 665 out: 666 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 667 skip: 668 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 669 st_engine_heartbeat_enable(engine); 670 intel_context_put(ce); 671 672 if (igt_flush_test(gt->i915)) 673 err = -EIO; 674 if (err) 675 return err; 676 } 677 678 return 0; 679 } 680 681 static int __igt_reset_engine(struct intel_gt *gt, bool active) 682 { 683 struct i915_gpu_error *global = >->i915->gpu_error; 684 struct intel_engine_cs *engine; 685 enum intel_engine_id id; 686 struct hang h; 687 int err = 0; 688 689 /* Check that we can issue an engine reset on an idle engine (no-op) */ 690 691 if (!intel_has_reset_engine(gt)) 692 return 0; 693 694 if (active) { 695 err = hang_init(&h, gt); 696 if (err) 697 return err; 698 } 699 700 for_each_engine(engine, gt, id) { 701 unsigned int reset_count, reset_engine_count; 702 unsigned long count; 703 IGT_TIMEOUT(end_time); 704 705 if (active && !intel_engine_can_store_dword(engine)) 706 continue; 707 708 if (!wait_for_idle(engine)) { 709 pr_err("%s failed to idle before reset\n", 710 engine->name); 711 err = -EIO; 712 break; 713 } 714 715 reset_count = i915_reset_count(global); 716 reset_engine_count = i915_reset_engine_count(global, engine); 717 718 st_engine_heartbeat_disable(engine); 719 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 720 count = 0; 721 do { 722 if (active) { 723 struct i915_request *rq; 724 725 rq = hang_create_request(&h, engine); 726 if (IS_ERR(rq)) { 727 err = PTR_ERR(rq); 728 pr_err("[%s] Create hang request failed: %d!\n", 729 engine->name, err); 730 break; 731 } 732 733 i915_request_get(rq); 734 i915_request_add(rq); 735 736 if (!wait_until_running(&h, rq)) { 737 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 738 739 pr_err("%s: Failed to start request %llx, at %x\n", 740 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 741 intel_engine_dump(engine, &p, 742 "%s\n", engine->name); 743 744 i915_request_put(rq); 745 err = -EIO; 746 break; 747 } 748 749 i915_request_put(rq); 750 } 751 752 err = intel_engine_reset(engine, NULL); 753 if (err) { 754 pr_err("intel_engine_reset(%s) failed, err:%d\n", 755 engine->name, err); 756 break; 757 } 758 759 if (i915_reset_count(global) != reset_count) { 760 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 761 err = -EINVAL; 762 break; 763 } 764 765 if (i915_reset_engine_count(global, engine) != 766 ++reset_engine_count) { 767 pr_err("%s engine reset not recorded!\n", 768 engine->name); 769 err = -EINVAL; 770 break; 771 } 772 773 count++; 774 } while (time_before(jiffies, end_time)); 775 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 776 st_engine_heartbeat_enable(engine); 777 pr_info("%s: Completed %lu %s resets\n", 778 engine->name, count, active ? "active" : "idle"); 779 780 if (err) 781 break; 782 783 err = igt_flush_test(gt->i915); 784 if (err) { 785 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 786 break; 787 } 788 } 789 790 if (intel_gt_is_wedged(gt)) { 791 pr_err("GT is wedged!\n"); 792 err = -EIO; 793 } 794 795 if (active) 796 hang_fini(&h); 797 798 return err; 799 } 800 801 static int igt_reset_idle_engine(void *arg) 802 { 803 return __igt_reset_engine(arg, false); 804 } 805 806 static int igt_reset_active_engine(void *arg) 807 { 808 return __igt_reset_engine(arg, true); 809 } 810 811 struct active_engine { 812 struct task_struct *task; 813 struct intel_engine_cs *engine; 814 unsigned long resets; 815 unsigned int flags; 816 }; 817 818 #define TEST_ACTIVE BIT(0) 819 #define TEST_OTHERS BIT(1) 820 #define TEST_SELF BIT(2) 821 #define TEST_PRIORITY BIT(3) 822 823 static int active_request_put(struct i915_request *rq) 824 { 825 int err = 0; 826 827 if (!rq) 828 return 0; 829 830 if (i915_request_wait(rq, 0, 5 * HZ) < 0) { 831 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 832 rq->engine->name, 833 rq->fence.context, 834 rq->fence.seqno); 835 GEM_TRACE_DUMP(); 836 837 intel_gt_set_wedged(rq->engine->gt); 838 err = -EIO; 839 } 840 841 i915_request_put(rq); 842 843 return err; 844 } 845 846 static int active_engine(void *data) 847 { 848 I915_RND_STATE(prng); 849 struct active_engine *arg = data; 850 struct intel_engine_cs *engine = arg->engine; 851 struct i915_request *rq[8] = {}; 852 struct intel_context *ce[ARRAY_SIZE(rq)]; 853 unsigned long count; 854 int err = 0; 855 856 for (count = 0; count < ARRAY_SIZE(ce); count++) { 857 ce[count] = intel_context_create(engine); 858 if (IS_ERR(ce[count])) { 859 err = PTR_ERR(ce[count]); 860 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err); 861 while (--count) 862 intel_context_put(ce[count]); 863 return err; 864 } 865 } 866 867 count = 0; 868 while (!kthread_should_stop()) { 869 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 870 struct i915_request *old = rq[idx]; 871 struct i915_request *new; 872 873 new = intel_context_create_request(ce[idx]); 874 if (IS_ERR(new)) { 875 err = PTR_ERR(new); 876 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 877 break; 878 } 879 880 rq[idx] = i915_request_get(new); 881 i915_request_add(new); 882 883 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 884 struct i915_sched_attr attr = { 885 .priority = 886 i915_prandom_u32_max_state(512, &prng), 887 }; 888 engine->sched_engine->schedule(rq[idx], &attr); 889 } 890 891 err = active_request_put(old); 892 if (err) { 893 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 894 break; 895 } 896 897 cond_resched(); 898 } 899 900 for (count = 0; count < ARRAY_SIZE(rq); count++) { 901 int err__ = active_request_put(rq[count]); 902 903 if (err) 904 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 905 906 /* Keep the first error */ 907 if (!err) 908 err = err__; 909 910 intel_context_put(ce[count]); 911 } 912 913 return err; 914 } 915 916 static int __igt_reset_engines(struct intel_gt *gt, 917 const char *test_name, 918 unsigned int flags) 919 { 920 struct i915_gpu_error *global = >->i915->gpu_error; 921 struct intel_engine_cs *engine, *other; 922 enum intel_engine_id id, tmp; 923 struct hang h; 924 int err = 0; 925 926 /* Check that issuing a reset on one engine does not interfere 927 * with any other engine. 928 */ 929 930 if (!intel_has_reset_engine(gt)) 931 return 0; 932 933 if (flags & TEST_ACTIVE) { 934 err = hang_init(&h, gt); 935 if (err) 936 return err; 937 938 if (flags & TEST_PRIORITY) 939 h.ctx->sched.priority = 1024; 940 } 941 942 for_each_engine(engine, gt, id) { 943 struct active_engine threads[I915_NUM_ENGINES] = {}; 944 unsigned long device = i915_reset_count(global); 945 unsigned long count = 0, reported; 946 IGT_TIMEOUT(end_time); 947 948 if (flags & TEST_ACTIVE && 949 !intel_engine_can_store_dword(engine)) 950 continue; 951 952 if (!wait_for_idle(engine)) { 953 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 954 engine->name, test_name); 955 err = -EIO; 956 break; 957 } 958 959 memset(threads, 0, sizeof(threads)); 960 for_each_engine(other, gt, tmp) { 961 struct task_struct *tsk; 962 963 threads[tmp].resets = 964 i915_reset_engine_count(global, other); 965 966 if (other == engine && !(flags & TEST_SELF)) 967 continue; 968 969 if (other != engine && !(flags & TEST_OTHERS)) 970 continue; 971 972 threads[tmp].engine = other; 973 threads[tmp].flags = flags; 974 975 tsk = kthread_run(active_engine, &threads[tmp], 976 "igt/%s", other->name); 977 if (IS_ERR(tsk)) { 978 err = PTR_ERR(tsk); 979 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 980 goto unwind; 981 } 982 983 threads[tmp].task = tsk; 984 get_task_struct(tsk); 985 } 986 987 yield(); /* start all threads before we begin */ 988 989 st_engine_heartbeat_disable(engine); 990 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 991 do { 992 struct i915_request *rq = NULL; 993 994 if (flags & TEST_ACTIVE) { 995 rq = hang_create_request(&h, engine); 996 if (IS_ERR(rq)) { 997 err = PTR_ERR(rq); 998 pr_err("[%s] Create hang request failed: %d!\n", 999 engine->name, err); 1000 break; 1001 } 1002 1003 i915_request_get(rq); 1004 i915_request_add(rq); 1005 1006 if (!wait_until_running(&h, rq)) { 1007 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1008 1009 pr_err("%s: Failed to start request %llx, at %x\n", 1010 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1011 intel_engine_dump(engine, &p, 1012 "%s\n", engine->name); 1013 1014 i915_request_put(rq); 1015 err = -EIO; 1016 break; 1017 } 1018 } 1019 1020 err = intel_engine_reset(engine, NULL); 1021 if (err) { 1022 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 1023 engine->name, test_name, err); 1024 break; 1025 } 1026 1027 count++; 1028 1029 if (rq) { 1030 if (rq->fence.error != -EIO) { 1031 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n", 1032 engine->name, test_name, 1033 rq->fence.context, 1034 rq->fence.seqno, rq->context->guc_id); 1035 i915_request_put(rq); 1036 1037 GEM_TRACE_DUMP(); 1038 intel_gt_set_wedged(gt); 1039 err = -EIO; 1040 break; 1041 } 1042 1043 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1044 struct drm_printer p = 1045 drm_info_printer(gt->i915->drm.dev); 1046 1047 pr_err("i915_reset_engine(%s:%s):" 1048 " failed to complete request %llx:%lld after reset\n", 1049 engine->name, test_name, 1050 rq->fence.context, 1051 rq->fence.seqno); 1052 intel_engine_dump(engine, &p, 1053 "%s\n", engine->name); 1054 i915_request_put(rq); 1055 1056 GEM_TRACE_DUMP(); 1057 intel_gt_set_wedged(gt); 1058 err = -EIO; 1059 break; 1060 } 1061 1062 i915_request_put(rq); 1063 } 1064 1065 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1066 struct drm_printer p = 1067 drm_info_printer(gt->i915->drm.dev); 1068 1069 pr_err("i915_reset_engine(%s:%s):" 1070 " failed to idle after reset\n", 1071 engine->name, test_name); 1072 intel_engine_dump(engine, &p, 1073 "%s\n", engine->name); 1074 1075 err = -EIO; 1076 break; 1077 } 1078 } while (time_before(jiffies, end_time)); 1079 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 1080 st_engine_heartbeat_enable(engine); 1081 1082 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1083 engine->name, test_name, count); 1084 1085 reported = i915_reset_engine_count(global, engine); 1086 reported -= threads[engine->id].resets; 1087 if (reported != count) { 1088 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1089 engine->name, test_name, count, reported); 1090 if (!err) 1091 err = -EINVAL; 1092 } 1093 1094 unwind: 1095 for_each_engine(other, gt, tmp) { 1096 int ret; 1097 1098 if (!threads[tmp].task) 1099 continue; 1100 1101 ret = kthread_stop(threads[tmp].task); 1102 if (ret) { 1103 pr_err("kthread for other engine %s failed, err=%d\n", 1104 other->name, ret); 1105 if (!err) 1106 err = ret; 1107 } 1108 put_task_struct(threads[tmp].task); 1109 1110 if (other->uabi_class != engine->uabi_class && 1111 threads[tmp].resets != 1112 i915_reset_engine_count(global, other)) { 1113 pr_err("Innocent engine %s was reset (count=%ld)\n", 1114 other->name, 1115 i915_reset_engine_count(global, other) - 1116 threads[tmp].resets); 1117 if (!err) 1118 err = -EINVAL; 1119 } 1120 } 1121 1122 if (device != i915_reset_count(global)) { 1123 pr_err("Global reset (count=%ld)!\n", 1124 i915_reset_count(global) - device); 1125 if (!err) 1126 err = -EINVAL; 1127 } 1128 1129 if (err) 1130 break; 1131 1132 err = igt_flush_test(gt->i915); 1133 if (err) { 1134 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1135 break; 1136 } 1137 } 1138 1139 if (intel_gt_is_wedged(gt)) 1140 err = -EIO; 1141 1142 if (flags & TEST_ACTIVE) 1143 hang_fini(&h); 1144 1145 return err; 1146 } 1147 1148 static int igt_reset_engines(void *arg) 1149 { 1150 static const struct { 1151 const char *name; 1152 unsigned int flags; 1153 } phases[] = { 1154 { "idle", 0 }, 1155 { "active", TEST_ACTIVE }, 1156 { "others-idle", TEST_OTHERS }, 1157 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1158 { 1159 "others-priority", 1160 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1161 }, 1162 { 1163 "self-priority", 1164 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1165 }, 1166 { } 1167 }; 1168 struct intel_gt *gt = arg; 1169 typeof(*phases) *p; 1170 int err; 1171 1172 for (p = phases; p->name; p++) { 1173 if (p->flags & TEST_PRIORITY) { 1174 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1175 continue; 1176 } 1177 1178 err = __igt_reset_engines(arg, p->name, p->flags); 1179 if (err) 1180 return err; 1181 } 1182 1183 return 0; 1184 } 1185 1186 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1187 { 1188 u32 count = i915_reset_count(>->i915->gpu_error); 1189 1190 intel_gt_reset(gt, mask, NULL); 1191 1192 return count; 1193 } 1194 1195 static int igt_reset_wait(void *arg) 1196 { 1197 struct intel_gt *gt = arg; 1198 struct i915_gpu_error *global = >->i915->gpu_error; 1199 struct intel_engine_cs *engine = gt->engine[RCS0]; 1200 struct i915_request *rq; 1201 unsigned int reset_count; 1202 struct hang h; 1203 long timeout; 1204 int err; 1205 1206 if (!engine || !intel_engine_can_store_dword(engine)) 1207 return 0; 1208 1209 /* Check that we detect a stuck waiter and issue a reset */ 1210 1211 igt_global_reset_lock(gt); 1212 1213 err = hang_init(&h, gt); 1214 if (err) { 1215 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1216 goto unlock; 1217 } 1218 1219 rq = hang_create_request(&h, engine); 1220 if (IS_ERR(rq)) { 1221 err = PTR_ERR(rq); 1222 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1223 goto fini; 1224 } 1225 1226 i915_request_get(rq); 1227 i915_request_add(rq); 1228 1229 if (!wait_until_running(&h, rq)) { 1230 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1231 1232 pr_err("%s: Failed to start request %llx, at %x\n", 1233 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1234 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1235 1236 intel_gt_set_wedged(gt); 1237 1238 err = -EIO; 1239 goto out_rq; 1240 } 1241 1242 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1243 1244 timeout = i915_request_wait(rq, 0, 10); 1245 if (timeout < 0) { 1246 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1247 timeout); 1248 err = timeout; 1249 goto out_rq; 1250 } 1251 1252 if (i915_reset_count(global) == reset_count) { 1253 pr_err("No GPU reset recorded!\n"); 1254 err = -EINVAL; 1255 goto out_rq; 1256 } 1257 1258 out_rq: 1259 i915_request_put(rq); 1260 fini: 1261 hang_fini(&h); 1262 unlock: 1263 igt_global_reset_unlock(gt); 1264 1265 if (intel_gt_is_wedged(gt)) 1266 return -EIO; 1267 1268 return err; 1269 } 1270 1271 struct evict_vma { 1272 struct completion completion; 1273 struct i915_vma *vma; 1274 }; 1275 1276 static int evict_vma(void *data) 1277 { 1278 struct evict_vma *arg = data; 1279 struct i915_address_space *vm = arg->vma->vm; 1280 struct drm_mm_node evict = arg->vma->node; 1281 int err; 1282 1283 complete(&arg->completion); 1284 1285 mutex_lock(&vm->mutex); 1286 err = i915_gem_evict_for_node(vm, &evict, 0); 1287 mutex_unlock(&vm->mutex); 1288 1289 return err; 1290 } 1291 1292 static int evict_fence(void *data) 1293 { 1294 struct evict_vma *arg = data; 1295 int err; 1296 1297 complete(&arg->completion); 1298 1299 /* Mark the fence register as dirty to force the mmio update. */ 1300 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1301 if (err) { 1302 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1303 return err; 1304 } 1305 1306 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1307 if (err) { 1308 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1309 return err; 1310 } 1311 1312 err = i915_vma_pin_fence(arg->vma); 1313 i915_vma_unpin(arg->vma); 1314 if (err) { 1315 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1316 return err; 1317 } 1318 1319 i915_vma_unpin_fence(arg->vma); 1320 1321 return 0; 1322 } 1323 1324 static int __igt_reset_evict_vma(struct intel_gt *gt, 1325 struct i915_address_space *vm, 1326 int (*fn)(void *), 1327 unsigned int flags) 1328 { 1329 struct intel_engine_cs *engine = gt->engine[RCS0]; 1330 struct drm_i915_gem_object *obj; 1331 struct task_struct *tsk = NULL; 1332 struct i915_request *rq; 1333 struct evict_vma arg; 1334 struct hang h; 1335 unsigned int pin_flags; 1336 int err; 1337 1338 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1339 return 0; 1340 1341 if (!engine || !intel_engine_can_store_dword(engine)) 1342 return 0; 1343 1344 /* Check that we can recover an unbind stuck on a hanging request */ 1345 1346 err = hang_init(&h, gt); 1347 if (err) { 1348 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1349 return err; 1350 } 1351 1352 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1353 if (IS_ERR(obj)) { 1354 err = PTR_ERR(obj); 1355 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1356 goto fini; 1357 } 1358 1359 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1360 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1361 if (err) { 1362 pr_err("Invalid X-tiling settings; err:%d\n", err); 1363 goto out_obj; 1364 } 1365 } 1366 1367 arg.vma = i915_vma_instance(obj, vm, NULL); 1368 if (IS_ERR(arg.vma)) { 1369 err = PTR_ERR(arg.vma); 1370 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1371 goto out_obj; 1372 } 1373 1374 rq = hang_create_request(&h, engine); 1375 if (IS_ERR(rq)) { 1376 err = PTR_ERR(rq); 1377 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1378 goto out_obj; 1379 } 1380 1381 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1382 1383 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1384 pin_flags |= PIN_MAPPABLE; 1385 1386 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1387 if (err) { 1388 i915_request_add(rq); 1389 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1390 goto out_obj; 1391 } 1392 1393 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1394 err = i915_vma_pin_fence(arg.vma); 1395 if (err) { 1396 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1397 i915_vma_unpin(arg.vma); 1398 i915_request_add(rq); 1399 goto out_obj; 1400 } 1401 } 1402 1403 i915_vma_lock(arg.vma); 1404 err = i915_request_await_object(rq, arg.vma->obj, 1405 flags & EXEC_OBJECT_WRITE); 1406 if (err == 0) { 1407 err = i915_vma_move_to_active(arg.vma, rq, flags); 1408 if (err) 1409 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1410 } else { 1411 pr_err("[%s] Request await failed: %d!\n", engine->name, err); 1412 } 1413 1414 i915_vma_unlock(arg.vma); 1415 1416 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1417 i915_vma_unpin_fence(arg.vma); 1418 i915_vma_unpin(arg.vma); 1419 1420 i915_request_get(rq); 1421 i915_request_add(rq); 1422 if (err) 1423 goto out_rq; 1424 1425 if (!wait_until_running(&h, rq)) { 1426 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1427 1428 pr_err("%s: Failed to start request %llx, at %x\n", 1429 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1430 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1431 1432 intel_gt_set_wedged(gt); 1433 goto out_reset; 1434 } 1435 1436 init_completion(&arg.completion); 1437 1438 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1439 if (IS_ERR(tsk)) { 1440 err = PTR_ERR(tsk); 1441 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1442 tsk = NULL; 1443 goto out_reset; 1444 } 1445 get_task_struct(tsk); 1446 1447 wait_for_completion(&arg.completion); 1448 1449 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1450 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1451 1452 pr_err("igt/evict_vma kthread did not wait\n"); 1453 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1454 1455 intel_gt_set_wedged(gt); 1456 goto out_reset; 1457 } 1458 1459 out_reset: 1460 igt_global_reset_lock(gt); 1461 fake_hangcheck(gt, rq->engine->mask); 1462 igt_global_reset_unlock(gt); 1463 1464 if (tsk) { 1465 struct intel_wedge_me w; 1466 1467 /* The reset, even indirectly, should take less than 10ms. */ 1468 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1469 err = kthread_stop(tsk); 1470 1471 put_task_struct(tsk); 1472 } 1473 1474 out_rq: 1475 i915_request_put(rq); 1476 out_obj: 1477 i915_gem_object_put(obj); 1478 fini: 1479 hang_fini(&h); 1480 if (intel_gt_is_wedged(gt)) 1481 return -EIO; 1482 1483 return err; 1484 } 1485 1486 static int igt_reset_evict_ggtt(void *arg) 1487 { 1488 struct intel_gt *gt = arg; 1489 1490 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1491 evict_vma, EXEC_OBJECT_WRITE); 1492 } 1493 1494 static int igt_reset_evict_ppgtt(void *arg) 1495 { 1496 struct intel_gt *gt = arg; 1497 struct i915_ppgtt *ppgtt; 1498 int err; 1499 1500 /* aliasing == global gtt locking, covered above */ 1501 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1502 return 0; 1503 1504 ppgtt = i915_ppgtt_create(gt); 1505 if (IS_ERR(ppgtt)) 1506 return PTR_ERR(ppgtt); 1507 1508 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1509 evict_vma, EXEC_OBJECT_WRITE); 1510 i915_vm_put(&ppgtt->vm); 1511 1512 return err; 1513 } 1514 1515 static int igt_reset_evict_fence(void *arg) 1516 { 1517 struct intel_gt *gt = arg; 1518 1519 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1520 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1521 } 1522 1523 static int wait_for_others(struct intel_gt *gt, 1524 struct intel_engine_cs *exclude) 1525 { 1526 struct intel_engine_cs *engine; 1527 enum intel_engine_id id; 1528 1529 for_each_engine(engine, gt, id) { 1530 if (engine == exclude) 1531 continue; 1532 1533 if (!wait_for_idle(engine)) 1534 return -EIO; 1535 } 1536 1537 return 0; 1538 } 1539 1540 static int igt_reset_queue(void *arg) 1541 { 1542 struct intel_gt *gt = arg; 1543 struct i915_gpu_error *global = >->i915->gpu_error; 1544 struct intel_engine_cs *engine; 1545 enum intel_engine_id id; 1546 struct hang h; 1547 int err; 1548 1549 /* Check that we replay pending requests following a hang */ 1550 1551 igt_global_reset_lock(gt); 1552 1553 err = hang_init(&h, gt); 1554 if (err) 1555 goto unlock; 1556 1557 for_each_engine(engine, gt, id) { 1558 struct i915_request *prev; 1559 IGT_TIMEOUT(end_time); 1560 unsigned int count; 1561 1562 if (!intel_engine_can_store_dword(engine)) 1563 continue; 1564 1565 prev = hang_create_request(&h, engine); 1566 if (IS_ERR(prev)) { 1567 err = PTR_ERR(prev); 1568 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1569 goto fini; 1570 } 1571 1572 i915_request_get(prev); 1573 i915_request_add(prev); 1574 1575 count = 0; 1576 do { 1577 struct i915_request *rq; 1578 unsigned int reset_count; 1579 1580 rq = hang_create_request(&h, engine); 1581 if (IS_ERR(rq)) { 1582 err = PTR_ERR(rq); 1583 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1584 goto fini; 1585 } 1586 1587 i915_request_get(rq); 1588 i915_request_add(rq); 1589 1590 /* 1591 * XXX We don't handle resetting the kernel context 1592 * very well. If we trigger a device reset twice in 1593 * quick succession while the kernel context is 1594 * executing, we may end up skipping the breadcrumb. 1595 * This is really only a problem for the selftest as 1596 * normally there is a large interlude between resets 1597 * (hangcheck), or we focus on resetting just one 1598 * engine and so avoid repeatedly resetting innocents. 1599 */ 1600 err = wait_for_others(gt, engine); 1601 if (err) { 1602 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1603 __func__, engine->name); 1604 i915_request_put(rq); 1605 i915_request_put(prev); 1606 1607 GEM_TRACE_DUMP(); 1608 intel_gt_set_wedged(gt); 1609 goto fini; 1610 } 1611 1612 if (!wait_until_running(&h, prev)) { 1613 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1614 1615 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1616 __func__, engine->name, 1617 prev->fence.seqno, hws_seqno(&h, prev)); 1618 intel_engine_dump(engine, &p, 1619 "%s\n", engine->name); 1620 1621 i915_request_put(rq); 1622 i915_request_put(prev); 1623 1624 intel_gt_set_wedged(gt); 1625 1626 err = -EIO; 1627 goto fini; 1628 } 1629 1630 reset_count = fake_hangcheck(gt, BIT(id)); 1631 1632 if (prev->fence.error != -EIO) { 1633 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1634 prev->fence.error); 1635 i915_request_put(rq); 1636 i915_request_put(prev); 1637 err = -EINVAL; 1638 goto fini; 1639 } 1640 1641 if (rq->fence.error) { 1642 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1643 rq->fence.error); 1644 i915_request_put(rq); 1645 i915_request_put(prev); 1646 err = -EINVAL; 1647 goto fini; 1648 } 1649 1650 if (i915_reset_count(global) == reset_count) { 1651 pr_err("No GPU reset recorded!\n"); 1652 i915_request_put(rq); 1653 i915_request_put(prev); 1654 err = -EINVAL; 1655 goto fini; 1656 } 1657 1658 i915_request_put(prev); 1659 prev = rq; 1660 count++; 1661 } while (time_before(jiffies, end_time)); 1662 pr_info("%s: Completed %d queued resets\n", 1663 engine->name, count); 1664 1665 *h.batch = MI_BATCH_BUFFER_END; 1666 intel_gt_chipset_flush(engine->gt); 1667 1668 i915_request_put(prev); 1669 1670 err = igt_flush_test(gt->i915); 1671 if (err) { 1672 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1673 break; 1674 } 1675 } 1676 1677 fini: 1678 hang_fini(&h); 1679 unlock: 1680 igt_global_reset_unlock(gt); 1681 1682 if (intel_gt_is_wedged(gt)) 1683 return -EIO; 1684 1685 return err; 1686 } 1687 1688 static int igt_handle_error(void *arg) 1689 { 1690 struct intel_gt *gt = arg; 1691 struct i915_gpu_error *global = >->i915->gpu_error; 1692 struct intel_engine_cs *engine = gt->engine[RCS0]; 1693 struct hang h; 1694 struct i915_request *rq; 1695 struct i915_gpu_coredump *error; 1696 int err; 1697 1698 /* Check that we can issue a global GPU and engine reset */ 1699 1700 if (!intel_has_reset_engine(gt)) 1701 return 0; 1702 1703 if (!engine || !intel_engine_can_store_dword(engine)) 1704 return 0; 1705 1706 err = hang_init(&h, gt); 1707 if (err) { 1708 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1709 return err; 1710 } 1711 1712 rq = hang_create_request(&h, engine); 1713 if (IS_ERR(rq)) { 1714 err = PTR_ERR(rq); 1715 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1716 goto err_fini; 1717 } 1718 1719 i915_request_get(rq); 1720 i915_request_add(rq); 1721 1722 if (!wait_until_running(&h, rq)) { 1723 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1724 1725 pr_err("%s: Failed to start request %llx, at %x\n", 1726 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1727 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1728 1729 intel_gt_set_wedged(gt); 1730 1731 err = -EIO; 1732 goto err_request; 1733 } 1734 1735 /* Temporarily disable error capture */ 1736 error = xchg(&global->first_error, (void *)-1); 1737 1738 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1739 1740 xchg(&global->first_error, error); 1741 1742 if (rq->fence.error != -EIO) { 1743 pr_err("Guilty request not identified!\n"); 1744 err = -EINVAL; 1745 goto err_request; 1746 } 1747 1748 err_request: 1749 i915_request_put(rq); 1750 err_fini: 1751 hang_fini(&h); 1752 return err; 1753 } 1754 1755 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1756 const struct igt_atomic_section *p, 1757 const char *mode) 1758 { 1759 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1760 int err; 1761 1762 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1763 engine->name, mode, p->name); 1764 1765 if (t->func) 1766 tasklet_disable(t); 1767 if (strcmp(p->name, "softirq")) 1768 local_bh_disable(); 1769 p->critical_section_begin(); 1770 1771 err = __intel_engine_reset_bh(engine, NULL); 1772 1773 p->critical_section_end(); 1774 if (strcmp(p->name, "softirq")) 1775 local_bh_enable(); 1776 if (t->func) { 1777 tasklet_enable(t); 1778 tasklet_hi_schedule(t); 1779 } 1780 1781 if (err) 1782 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1783 engine->name, mode, p->name); 1784 1785 return err; 1786 } 1787 1788 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1789 const struct igt_atomic_section *p) 1790 { 1791 struct i915_request *rq; 1792 struct hang h; 1793 int err; 1794 1795 err = __igt_atomic_reset_engine(engine, p, "idle"); 1796 if (err) 1797 return err; 1798 1799 err = hang_init(&h, engine->gt); 1800 if (err) { 1801 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1802 return err; 1803 } 1804 1805 rq = hang_create_request(&h, engine); 1806 if (IS_ERR(rq)) { 1807 err = PTR_ERR(rq); 1808 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1809 goto out; 1810 } 1811 1812 i915_request_get(rq); 1813 i915_request_add(rq); 1814 1815 if (wait_until_running(&h, rq)) { 1816 err = __igt_atomic_reset_engine(engine, p, "active"); 1817 } else { 1818 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1819 __func__, engine->name, 1820 rq->fence.seqno, hws_seqno(&h, rq)); 1821 intel_gt_set_wedged(engine->gt); 1822 err = -EIO; 1823 } 1824 1825 if (err == 0) { 1826 struct intel_wedge_me w; 1827 1828 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1829 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1830 if (intel_gt_is_wedged(engine->gt)) 1831 err = -EIO; 1832 } 1833 1834 i915_request_put(rq); 1835 out: 1836 hang_fini(&h); 1837 return err; 1838 } 1839 1840 static int igt_reset_engines_atomic(void *arg) 1841 { 1842 struct intel_gt *gt = arg; 1843 const typeof(*igt_atomic_phases) *p; 1844 int err = 0; 1845 1846 /* Check that the engines resets are usable from atomic context */ 1847 1848 if (!intel_has_reset_engine(gt)) 1849 return 0; 1850 1851 if (intel_uc_uses_guc_submission(>->uc)) 1852 return 0; 1853 1854 igt_global_reset_lock(gt); 1855 1856 /* Flush any requests before we get started and check basics */ 1857 if (!igt_force_reset(gt)) 1858 goto unlock; 1859 1860 for (p = igt_atomic_phases; p->name; p++) { 1861 struct intel_engine_cs *engine; 1862 enum intel_engine_id id; 1863 1864 for_each_engine(engine, gt, id) { 1865 err = igt_atomic_reset_engine(engine, p); 1866 if (err) 1867 goto out; 1868 } 1869 } 1870 1871 out: 1872 /* As we poke around the guts, do a full reset before continuing. */ 1873 igt_force_reset(gt); 1874 unlock: 1875 igt_global_reset_unlock(gt); 1876 1877 return err; 1878 } 1879 1880 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 1881 { 1882 static const struct i915_subtest tests[] = { 1883 SUBTEST(igt_hang_sanitycheck), 1884 SUBTEST(igt_reset_nop), 1885 SUBTEST(igt_reset_nop_engine), 1886 SUBTEST(igt_reset_idle_engine), 1887 SUBTEST(igt_reset_active_engine), 1888 SUBTEST(igt_reset_fail_engine), 1889 SUBTEST(igt_reset_engines), 1890 SUBTEST(igt_reset_engines_atomic), 1891 SUBTEST(igt_reset_queue), 1892 SUBTEST(igt_reset_wait), 1893 SUBTEST(igt_reset_evict_ggtt), 1894 SUBTEST(igt_reset_evict_ppgtt), 1895 SUBTEST(igt_reset_evict_fence), 1896 SUBTEST(igt_handle_error), 1897 }; 1898 struct intel_gt *gt = &i915->gt; 1899 intel_wakeref_t wakeref; 1900 int err; 1901 1902 if (!intel_has_gpu_reset(gt)) 1903 return 0; 1904 1905 if (intel_gt_is_wedged(gt)) 1906 return -EIO; /* we're long past hope of a successful reset */ 1907 1908 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1909 1910 err = intel_gt_live_subtests(tests, gt); 1911 1912 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1913 1914 return err; 1915 } 1916