1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 10 #include "intel_gt.h" 11 #include "intel_engine_heartbeat.h" 12 #include "intel_engine_pm.h" 13 #include "selftest_engine_heartbeat.h" 14 15 #include "i915_selftest.h" 16 #include "selftests/i915_random.h" 17 #include "selftests/igt_flush_test.h" 18 #include "selftests/igt_reset.h" 19 #include "selftests/igt_atomic.h" 20 21 #include "selftests/mock_drm.h" 22 23 #include "gem/selftests/mock_context.h" 24 #include "gem/selftests/igt_gem_utils.h" 25 26 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 27 28 struct hang { 29 struct intel_gt *gt; 30 struct drm_i915_gem_object *hws; 31 struct drm_i915_gem_object *obj; 32 struct i915_gem_context *ctx; 33 u32 *seqno; 34 u32 *batch; 35 }; 36 37 static int hang_init(struct hang *h, struct intel_gt *gt) 38 { 39 void *vaddr; 40 int err; 41 42 memset(h, 0, sizeof(*h)); 43 h->gt = gt; 44 45 h->ctx = kernel_context(gt->i915); 46 if (IS_ERR(h->ctx)) 47 return PTR_ERR(h->ctx); 48 49 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 50 51 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 52 if (IS_ERR(h->hws)) { 53 err = PTR_ERR(h->hws); 54 goto err_ctx; 55 } 56 57 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 58 if (IS_ERR(h->obj)) { 59 err = PTR_ERR(h->obj); 60 goto err_hws; 61 } 62 63 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 64 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 65 if (IS_ERR(vaddr)) { 66 err = PTR_ERR(vaddr); 67 goto err_obj; 68 } 69 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 70 71 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 72 i915_coherent_map_type(gt->i915, h->obj, false)); 73 if (IS_ERR(vaddr)) { 74 err = PTR_ERR(vaddr); 75 goto err_unpin_hws; 76 } 77 h->batch = vaddr; 78 79 return 0; 80 81 err_unpin_hws: 82 i915_gem_object_unpin_map(h->hws); 83 err_obj: 84 i915_gem_object_put(h->obj); 85 err_hws: 86 i915_gem_object_put(h->hws); 87 err_ctx: 88 kernel_context_close(h->ctx); 89 return err; 90 } 91 92 static u64 hws_address(const struct i915_vma *hws, 93 const struct i915_request *rq) 94 { 95 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 96 } 97 98 static int move_to_active(struct i915_vma *vma, 99 struct i915_request *rq, 100 unsigned int flags) 101 { 102 int err; 103 104 i915_vma_lock(vma); 105 err = i915_request_await_object(rq, vma->obj, 106 flags & EXEC_OBJECT_WRITE); 107 if (err == 0) 108 err = i915_vma_move_to_active(vma, rq, flags); 109 i915_vma_unlock(vma); 110 111 return err; 112 } 113 114 static struct i915_request * 115 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 116 { 117 struct intel_gt *gt = h->gt; 118 struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx); 119 struct drm_i915_gem_object *obj; 120 struct i915_request *rq = NULL; 121 struct i915_vma *hws, *vma; 122 unsigned int flags; 123 void *vaddr; 124 u32 *batch; 125 int err; 126 127 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 128 if (IS_ERR(obj)) { 129 i915_vm_put(vm); 130 return ERR_CAST(obj); 131 } 132 133 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 134 if (IS_ERR(vaddr)) { 135 i915_gem_object_put(obj); 136 i915_vm_put(vm); 137 return ERR_CAST(vaddr); 138 } 139 140 i915_gem_object_unpin_map(h->obj); 141 i915_gem_object_put(h->obj); 142 143 h->obj = obj; 144 h->batch = vaddr; 145 146 vma = i915_vma_instance(h->obj, vm, NULL); 147 if (IS_ERR(vma)) { 148 i915_vm_put(vm); 149 return ERR_CAST(vma); 150 } 151 152 hws = i915_vma_instance(h->hws, vm, NULL); 153 if (IS_ERR(hws)) { 154 i915_vm_put(vm); 155 return ERR_CAST(hws); 156 } 157 158 err = i915_vma_pin(vma, 0, 0, PIN_USER); 159 if (err) { 160 i915_vm_put(vm); 161 return ERR_PTR(err); 162 } 163 164 err = i915_vma_pin(hws, 0, 0, PIN_USER); 165 if (err) 166 goto unpin_vma; 167 168 rq = igt_request_alloc(h->ctx, engine); 169 if (IS_ERR(rq)) { 170 err = PTR_ERR(rq); 171 goto unpin_hws; 172 } 173 174 err = move_to_active(vma, rq, 0); 175 if (err) 176 goto cancel_rq; 177 178 err = move_to_active(hws, rq, 0); 179 if (err) 180 goto cancel_rq; 181 182 batch = h->batch; 183 if (GRAPHICS_VER(gt->i915) >= 8) { 184 *batch++ = MI_STORE_DWORD_IMM_GEN4; 185 *batch++ = lower_32_bits(hws_address(hws, rq)); 186 *batch++ = upper_32_bits(hws_address(hws, rq)); 187 *batch++ = rq->fence.seqno; 188 *batch++ = MI_NOOP; 189 190 memset(batch, 0, 1024); 191 batch += 1024 / sizeof(*batch); 192 193 *batch++ = MI_NOOP; 194 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 195 *batch++ = lower_32_bits(vma->node.start); 196 *batch++ = upper_32_bits(vma->node.start); 197 } else if (GRAPHICS_VER(gt->i915) >= 6) { 198 *batch++ = MI_STORE_DWORD_IMM_GEN4; 199 *batch++ = 0; 200 *batch++ = lower_32_bits(hws_address(hws, rq)); 201 *batch++ = rq->fence.seqno; 202 *batch++ = MI_NOOP; 203 204 memset(batch, 0, 1024); 205 batch += 1024 / sizeof(*batch); 206 207 *batch++ = MI_NOOP; 208 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 209 *batch++ = lower_32_bits(vma->node.start); 210 } else if (GRAPHICS_VER(gt->i915) >= 4) { 211 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 212 *batch++ = 0; 213 *batch++ = lower_32_bits(hws_address(hws, rq)); 214 *batch++ = rq->fence.seqno; 215 *batch++ = MI_NOOP; 216 217 memset(batch, 0, 1024); 218 batch += 1024 / sizeof(*batch); 219 220 *batch++ = MI_NOOP; 221 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 222 *batch++ = lower_32_bits(vma->node.start); 223 } else { 224 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 225 *batch++ = lower_32_bits(hws_address(hws, rq)); 226 *batch++ = rq->fence.seqno; 227 *batch++ = MI_NOOP; 228 229 memset(batch, 0, 1024); 230 batch += 1024 / sizeof(*batch); 231 232 *batch++ = MI_NOOP; 233 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 234 *batch++ = lower_32_bits(vma->node.start); 235 } 236 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 237 intel_gt_chipset_flush(engine->gt); 238 239 if (rq->engine->emit_init_breadcrumb) { 240 err = rq->engine->emit_init_breadcrumb(rq); 241 if (err) 242 goto cancel_rq; 243 } 244 245 flags = 0; 246 if (GRAPHICS_VER(gt->i915) <= 5) 247 flags |= I915_DISPATCH_SECURE; 248 249 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 250 251 cancel_rq: 252 if (err) { 253 i915_request_set_error_once(rq, err); 254 i915_request_add(rq); 255 } 256 unpin_hws: 257 i915_vma_unpin(hws); 258 unpin_vma: 259 i915_vma_unpin(vma); 260 i915_vm_put(vm); 261 return err ? ERR_PTR(err) : rq; 262 } 263 264 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 265 { 266 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 267 } 268 269 static void hang_fini(struct hang *h) 270 { 271 *h->batch = MI_BATCH_BUFFER_END; 272 intel_gt_chipset_flush(h->gt); 273 274 i915_gem_object_unpin_map(h->obj); 275 i915_gem_object_put(h->obj); 276 277 i915_gem_object_unpin_map(h->hws); 278 i915_gem_object_put(h->hws); 279 280 kernel_context_close(h->ctx); 281 282 igt_flush_test(h->gt->i915); 283 } 284 285 static bool wait_until_running(struct hang *h, struct i915_request *rq) 286 { 287 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 288 rq->fence.seqno), 289 10) && 290 wait_for(i915_seqno_passed(hws_seqno(h, rq), 291 rq->fence.seqno), 292 1000)); 293 } 294 295 static int igt_hang_sanitycheck(void *arg) 296 { 297 struct intel_gt *gt = arg; 298 struct i915_request *rq; 299 struct intel_engine_cs *engine; 300 enum intel_engine_id id; 301 struct hang h; 302 int err; 303 304 /* Basic check that we can execute our hanging batch */ 305 306 err = hang_init(&h, gt); 307 if (err) 308 return err; 309 310 for_each_engine(engine, gt, id) { 311 struct intel_wedge_me w; 312 long timeout; 313 314 if (!intel_engine_can_store_dword(engine)) 315 continue; 316 317 rq = hang_create_request(&h, engine); 318 if (IS_ERR(rq)) { 319 err = PTR_ERR(rq); 320 pr_err("Failed to create request for %s, err=%d\n", 321 engine->name, err); 322 goto fini; 323 } 324 325 i915_request_get(rq); 326 327 *h.batch = MI_BATCH_BUFFER_END; 328 intel_gt_chipset_flush(engine->gt); 329 330 i915_request_add(rq); 331 332 timeout = 0; 333 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 334 timeout = i915_request_wait(rq, 0, 335 MAX_SCHEDULE_TIMEOUT); 336 if (intel_gt_is_wedged(gt)) 337 timeout = -EIO; 338 339 i915_request_put(rq); 340 341 if (timeout < 0) { 342 err = timeout; 343 pr_err("Wait for request failed on %s, err=%d\n", 344 engine->name, err); 345 goto fini; 346 } 347 } 348 349 fini: 350 hang_fini(&h); 351 return err; 352 } 353 354 static bool wait_for_idle(struct intel_engine_cs *engine) 355 { 356 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 357 } 358 359 static int igt_reset_nop(void *arg) 360 { 361 struct intel_gt *gt = arg; 362 struct i915_gpu_error *global = >->i915->gpu_error; 363 struct intel_engine_cs *engine; 364 unsigned int reset_count, count; 365 enum intel_engine_id id; 366 IGT_TIMEOUT(end_time); 367 int err = 0; 368 369 /* Check that we can reset during non-user portions of requests */ 370 371 reset_count = i915_reset_count(global); 372 count = 0; 373 do { 374 for_each_engine(engine, gt, id) { 375 struct intel_context *ce; 376 int i; 377 378 ce = intel_context_create(engine); 379 if (IS_ERR(ce)) { 380 err = PTR_ERR(ce); 381 break; 382 } 383 384 for (i = 0; i < 16; i++) { 385 struct i915_request *rq; 386 387 rq = intel_context_create_request(ce); 388 if (IS_ERR(rq)) { 389 err = PTR_ERR(rq); 390 break; 391 } 392 393 i915_request_add(rq); 394 } 395 396 intel_context_put(ce); 397 } 398 399 igt_global_reset_lock(gt); 400 intel_gt_reset(gt, ALL_ENGINES, NULL); 401 igt_global_reset_unlock(gt); 402 403 if (intel_gt_is_wedged(gt)) { 404 err = -EIO; 405 break; 406 } 407 408 if (i915_reset_count(global) != reset_count + ++count) { 409 pr_err("Full GPU reset not recorded!\n"); 410 err = -EINVAL; 411 break; 412 } 413 414 err = igt_flush_test(gt->i915); 415 if (err) 416 break; 417 } while (time_before(jiffies, end_time)); 418 pr_info("%s: %d resets\n", __func__, count); 419 420 if (igt_flush_test(gt->i915)) 421 err = -EIO; 422 return err; 423 } 424 425 static int igt_reset_nop_engine(void *arg) 426 { 427 struct intel_gt *gt = arg; 428 struct i915_gpu_error *global = >->i915->gpu_error; 429 struct intel_engine_cs *engine; 430 enum intel_engine_id id; 431 432 /* Check that we can engine-reset during non-user portions */ 433 434 if (!intel_has_reset_engine(gt)) 435 return 0; 436 437 for_each_engine(engine, gt, id) { 438 unsigned int reset_count, reset_engine_count, count; 439 struct intel_context *ce; 440 IGT_TIMEOUT(end_time); 441 int err; 442 443 ce = intel_context_create(engine); 444 if (IS_ERR(ce)) 445 return PTR_ERR(ce); 446 447 reset_count = i915_reset_count(global); 448 reset_engine_count = i915_reset_engine_count(global, engine); 449 count = 0; 450 451 st_engine_heartbeat_disable(engine); 452 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 453 do { 454 int i; 455 456 if (!wait_for_idle(engine)) { 457 pr_err("%s failed to idle before reset\n", 458 engine->name); 459 err = -EIO; 460 break; 461 } 462 463 for (i = 0; i < 16; i++) { 464 struct i915_request *rq; 465 466 rq = intel_context_create_request(ce); 467 if (IS_ERR(rq)) { 468 struct drm_printer p = 469 drm_info_printer(gt->i915->drm.dev); 470 intel_engine_dump(engine, &p, 471 "%s(%s): failed to submit request\n", 472 __func__, 473 engine->name); 474 475 GEM_TRACE("%s(%s): failed to submit request\n", 476 __func__, 477 engine->name); 478 GEM_TRACE_DUMP(); 479 480 intel_gt_set_wedged(gt); 481 482 err = PTR_ERR(rq); 483 break; 484 } 485 486 i915_request_add(rq); 487 } 488 err = intel_engine_reset(engine, NULL); 489 if (err) { 490 pr_err("intel_engine_reset(%s) failed, err:%d\n", 491 engine->name, err); 492 break; 493 } 494 495 if (i915_reset_count(global) != reset_count) { 496 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 497 err = -EINVAL; 498 break; 499 } 500 501 if (i915_reset_engine_count(global, engine) != 502 reset_engine_count + ++count) { 503 pr_err("%s engine reset not recorded!\n", 504 engine->name); 505 err = -EINVAL; 506 break; 507 } 508 } while (time_before(jiffies, end_time)); 509 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 510 st_engine_heartbeat_enable(engine); 511 512 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 513 514 intel_context_put(ce); 515 if (igt_flush_test(gt->i915)) 516 err = -EIO; 517 if (err) 518 return err; 519 } 520 521 return 0; 522 } 523 524 static void force_reset_timeout(struct intel_engine_cs *engine) 525 { 526 engine->reset_timeout.probability = 999; 527 atomic_set(&engine->reset_timeout.times, -1); 528 } 529 530 static void cancel_reset_timeout(struct intel_engine_cs *engine) 531 { 532 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 533 } 534 535 static int igt_reset_fail_engine(void *arg) 536 { 537 struct intel_gt *gt = arg; 538 struct intel_engine_cs *engine; 539 enum intel_engine_id id; 540 541 /* Check that we can recover from engine-reset failues */ 542 543 if (!intel_has_reset_engine(gt)) 544 return 0; 545 546 for_each_engine(engine, gt, id) { 547 unsigned int count; 548 struct intel_context *ce; 549 IGT_TIMEOUT(end_time); 550 int err; 551 552 ce = intel_context_create(engine); 553 if (IS_ERR(ce)) 554 return PTR_ERR(ce); 555 556 st_engine_heartbeat_disable(engine); 557 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 558 559 force_reset_timeout(engine); 560 err = intel_engine_reset(engine, NULL); 561 cancel_reset_timeout(engine); 562 if (err == 0) /* timeouts only generated on gen8+ */ 563 goto skip; 564 565 count = 0; 566 do { 567 struct i915_request *last = NULL; 568 int i; 569 570 if (!wait_for_idle(engine)) { 571 pr_err("%s failed to idle before reset\n", 572 engine->name); 573 err = -EIO; 574 break; 575 } 576 577 for (i = 0; i < count % 15; i++) { 578 struct i915_request *rq; 579 580 rq = intel_context_create_request(ce); 581 if (IS_ERR(rq)) { 582 struct drm_printer p = 583 drm_info_printer(gt->i915->drm.dev); 584 intel_engine_dump(engine, &p, 585 "%s(%s): failed to submit request\n", 586 __func__, 587 engine->name); 588 589 GEM_TRACE("%s(%s): failed to submit request\n", 590 __func__, 591 engine->name); 592 GEM_TRACE_DUMP(); 593 594 intel_gt_set_wedged(gt); 595 if (last) 596 i915_request_put(last); 597 598 err = PTR_ERR(rq); 599 goto out; 600 } 601 602 if (last) 603 i915_request_put(last); 604 last = i915_request_get(rq); 605 i915_request_add(rq); 606 } 607 608 if (count & 1) { 609 err = intel_engine_reset(engine, NULL); 610 if (err) { 611 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 612 engine->name, err); 613 GEM_TRACE_DUMP(); 614 i915_request_put(last); 615 break; 616 } 617 } else { 618 force_reset_timeout(engine); 619 err = intel_engine_reset(engine, NULL); 620 cancel_reset_timeout(engine); 621 if (err != -ETIMEDOUT) { 622 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 623 engine->name, err); 624 i915_request_put(last); 625 break; 626 } 627 } 628 629 err = 0; 630 if (last) { 631 if (i915_request_wait(last, 0, HZ / 2) < 0) { 632 struct drm_printer p = 633 drm_info_printer(gt->i915->drm.dev); 634 635 intel_engine_dump(engine, &p, 636 "%s(%s): failed to complete request\n", 637 __func__, 638 engine->name); 639 640 GEM_TRACE("%s(%s): failed to complete request\n", 641 __func__, 642 engine->name); 643 GEM_TRACE_DUMP(); 644 645 err = -EIO; 646 } 647 i915_request_put(last); 648 } 649 count++; 650 } while (err == 0 && time_before(jiffies, end_time)); 651 out: 652 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 653 skip: 654 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 655 st_engine_heartbeat_enable(engine); 656 intel_context_put(ce); 657 658 if (igt_flush_test(gt->i915)) 659 err = -EIO; 660 if (err) 661 return err; 662 } 663 664 return 0; 665 } 666 667 static int __igt_reset_engine(struct intel_gt *gt, bool active) 668 { 669 struct i915_gpu_error *global = >->i915->gpu_error; 670 struct intel_engine_cs *engine; 671 enum intel_engine_id id; 672 struct hang h; 673 int err = 0; 674 675 /* Check that we can issue an engine reset on an idle engine (no-op) */ 676 677 if (!intel_has_reset_engine(gt)) 678 return 0; 679 680 if (active) { 681 err = hang_init(&h, gt); 682 if (err) 683 return err; 684 } 685 686 for_each_engine(engine, gt, id) { 687 unsigned int reset_count, reset_engine_count; 688 unsigned long count; 689 IGT_TIMEOUT(end_time); 690 691 if (active && !intel_engine_can_store_dword(engine)) 692 continue; 693 694 if (!wait_for_idle(engine)) { 695 pr_err("%s failed to idle before reset\n", 696 engine->name); 697 err = -EIO; 698 break; 699 } 700 701 reset_count = i915_reset_count(global); 702 reset_engine_count = i915_reset_engine_count(global, engine); 703 704 st_engine_heartbeat_disable(engine); 705 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 706 count = 0; 707 do { 708 if (active) { 709 struct i915_request *rq; 710 711 rq = hang_create_request(&h, engine); 712 if (IS_ERR(rq)) { 713 err = PTR_ERR(rq); 714 break; 715 } 716 717 i915_request_get(rq); 718 i915_request_add(rq); 719 720 if (!wait_until_running(&h, rq)) { 721 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 722 723 pr_err("%s: Failed to start request %llx, at %x\n", 724 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 725 intel_engine_dump(engine, &p, 726 "%s\n", engine->name); 727 728 i915_request_put(rq); 729 err = -EIO; 730 break; 731 } 732 733 i915_request_put(rq); 734 } 735 736 err = intel_engine_reset(engine, NULL); 737 if (err) { 738 pr_err("intel_engine_reset(%s) failed, err:%d\n", 739 engine->name, err); 740 break; 741 } 742 743 if (i915_reset_count(global) != reset_count) { 744 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 745 err = -EINVAL; 746 break; 747 } 748 749 if (i915_reset_engine_count(global, engine) != 750 ++reset_engine_count) { 751 pr_err("%s engine reset not recorded!\n", 752 engine->name); 753 err = -EINVAL; 754 break; 755 } 756 757 count++; 758 } while (time_before(jiffies, end_time)); 759 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 760 st_engine_heartbeat_enable(engine); 761 pr_info("%s: Completed %lu %s resets\n", 762 engine->name, count, active ? "active" : "idle"); 763 764 if (err) 765 break; 766 767 err = igt_flush_test(gt->i915); 768 if (err) 769 break; 770 } 771 772 if (intel_gt_is_wedged(gt)) 773 err = -EIO; 774 775 if (active) 776 hang_fini(&h); 777 778 return err; 779 } 780 781 static int igt_reset_idle_engine(void *arg) 782 { 783 return __igt_reset_engine(arg, false); 784 } 785 786 static int igt_reset_active_engine(void *arg) 787 { 788 return __igt_reset_engine(arg, true); 789 } 790 791 struct active_engine { 792 struct task_struct *task; 793 struct intel_engine_cs *engine; 794 unsigned long resets; 795 unsigned int flags; 796 }; 797 798 #define TEST_ACTIVE BIT(0) 799 #define TEST_OTHERS BIT(1) 800 #define TEST_SELF BIT(2) 801 #define TEST_PRIORITY BIT(3) 802 803 static int active_request_put(struct i915_request *rq) 804 { 805 int err = 0; 806 807 if (!rq) 808 return 0; 809 810 if (i915_request_wait(rq, 0, 5 * HZ) < 0) { 811 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 812 rq->engine->name, 813 rq->fence.context, 814 rq->fence.seqno); 815 GEM_TRACE_DUMP(); 816 817 intel_gt_set_wedged(rq->engine->gt); 818 err = -EIO; 819 } 820 821 i915_request_put(rq); 822 823 return err; 824 } 825 826 static int active_engine(void *data) 827 { 828 I915_RND_STATE(prng); 829 struct active_engine *arg = data; 830 struct intel_engine_cs *engine = arg->engine; 831 struct i915_request *rq[8] = {}; 832 struct intel_context *ce[ARRAY_SIZE(rq)]; 833 unsigned long count; 834 int err = 0; 835 836 for (count = 0; count < ARRAY_SIZE(ce); count++) { 837 ce[count] = intel_context_create(engine); 838 if (IS_ERR(ce[count])) { 839 err = PTR_ERR(ce[count]); 840 while (--count) 841 intel_context_put(ce[count]); 842 return err; 843 } 844 } 845 846 count = 0; 847 while (!kthread_should_stop()) { 848 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 849 struct i915_request *old = rq[idx]; 850 struct i915_request *new; 851 852 new = intel_context_create_request(ce[idx]); 853 if (IS_ERR(new)) { 854 err = PTR_ERR(new); 855 break; 856 } 857 858 rq[idx] = i915_request_get(new); 859 i915_request_add(new); 860 861 if (engine->schedule && arg->flags & TEST_PRIORITY) { 862 struct i915_sched_attr attr = { 863 .priority = 864 i915_prandom_u32_max_state(512, &prng), 865 }; 866 engine->schedule(rq[idx], &attr); 867 } 868 869 err = active_request_put(old); 870 if (err) 871 break; 872 873 cond_resched(); 874 } 875 876 for (count = 0; count < ARRAY_SIZE(rq); count++) { 877 int err__ = active_request_put(rq[count]); 878 879 /* Keep the first error */ 880 if (!err) 881 err = err__; 882 883 intel_context_put(ce[count]); 884 } 885 886 return err; 887 } 888 889 static int __igt_reset_engines(struct intel_gt *gt, 890 const char *test_name, 891 unsigned int flags) 892 { 893 struct i915_gpu_error *global = >->i915->gpu_error; 894 struct intel_engine_cs *engine, *other; 895 enum intel_engine_id id, tmp; 896 struct hang h; 897 int err = 0; 898 899 /* Check that issuing a reset on one engine does not interfere 900 * with any other engine. 901 */ 902 903 if (!intel_has_reset_engine(gt)) 904 return 0; 905 906 if (flags & TEST_ACTIVE) { 907 err = hang_init(&h, gt); 908 if (err) 909 return err; 910 911 if (flags & TEST_PRIORITY) 912 h.ctx->sched.priority = 1024; 913 } 914 915 for_each_engine(engine, gt, id) { 916 struct active_engine threads[I915_NUM_ENGINES] = {}; 917 unsigned long device = i915_reset_count(global); 918 unsigned long count = 0, reported; 919 IGT_TIMEOUT(end_time); 920 921 if (flags & TEST_ACTIVE && 922 !intel_engine_can_store_dword(engine)) 923 continue; 924 925 if (!wait_for_idle(engine)) { 926 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 927 engine->name, test_name); 928 err = -EIO; 929 break; 930 } 931 932 memset(threads, 0, sizeof(threads)); 933 for_each_engine(other, gt, tmp) { 934 struct task_struct *tsk; 935 936 threads[tmp].resets = 937 i915_reset_engine_count(global, other); 938 939 if (other == engine && !(flags & TEST_SELF)) 940 continue; 941 942 if (other != engine && !(flags & TEST_OTHERS)) 943 continue; 944 945 threads[tmp].engine = other; 946 threads[tmp].flags = flags; 947 948 tsk = kthread_run(active_engine, &threads[tmp], 949 "igt/%s", other->name); 950 if (IS_ERR(tsk)) { 951 err = PTR_ERR(tsk); 952 goto unwind; 953 } 954 955 threads[tmp].task = tsk; 956 get_task_struct(tsk); 957 } 958 959 yield(); /* start all threads before we begin */ 960 961 st_engine_heartbeat_disable(engine); 962 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 963 do { 964 struct i915_request *rq = NULL; 965 966 if (flags & TEST_ACTIVE) { 967 rq = hang_create_request(&h, engine); 968 if (IS_ERR(rq)) { 969 err = PTR_ERR(rq); 970 break; 971 } 972 973 i915_request_get(rq); 974 i915_request_add(rq); 975 976 if (!wait_until_running(&h, rq)) { 977 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 978 979 pr_err("%s: Failed to start request %llx, at %x\n", 980 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 981 intel_engine_dump(engine, &p, 982 "%s\n", engine->name); 983 984 i915_request_put(rq); 985 err = -EIO; 986 break; 987 } 988 } 989 990 err = intel_engine_reset(engine, NULL); 991 if (err) { 992 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 993 engine->name, test_name, err); 994 break; 995 } 996 997 count++; 998 999 if (rq) { 1000 if (rq->fence.error != -EIO) { 1001 pr_err("i915_reset_engine(%s:%s):" 1002 " failed to reset request %llx:%lld\n", 1003 engine->name, test_name, 1004 rq->fence.context, 1005 rq->fence.seqno); 1006 i915_request_put(rq); 1007 1008 GEM_TRACE_DUMP(); 1009 intel_gt_set_wedged(gt); 1010 err = -EIO; 1011 break; 1012 } 1013 1014 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1015 struct drm_printer p = 1016 drm_info_printer(gt->i915->drm.dev); 1017 1018 pr_err("i915_reset_engine(%s:%s):" 1019 " failed to complete request %llx:%lld after reset\n", 1020 engine->name, test_name, 1021 rq->fence.context, 1022 rq->fence.seqno); 1023 intel_engine_dump(engine, &p, 1024 "%s\n", engine->name); 1025 i915_request_put(rq); 1026 1027 GEM_TRACE_DUMP(); 1028 intel_gt_set_wedged(gt); 1029 err = -EIO; 1030 break; 1031 } 1032 1033 i915_request_put(rq); 1034 } 1035 1036 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1037 struct drm_printer p = 1038 drm_info_printer(gt->i915->drm.dev); 1039 1040 pr_err("i915_reset_engine(%s:%s):" 1041 " failed to idle after reset\n", 1042 engine->name, test_name); 1043 intel_engine_dump(engine, &p, 1044 "%s\n", engine->name); 1045 1046 err = -EIO; 1047 break; 1048 } 1049 } while (time_before(jiffies, end_time)); 1050 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 1051 st_engine_heartbeat_enable(engine); 1052 1053 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1054 engine->name, test_name, count); 1055 1056 reported = i915_reset_engine_count(global, engine); 1057 reported -= threads[engine->id].resets; 1058 if (reported != count) { 1059 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1060 engine->name, test_name, count, reported); 1061 if (!err) 1062 err = -EINVAL; 1063 } 1064 1065 unwind: 1066 for_each_engine(other, gt, tmp) { 1067 int ret; 1068 1069 if (!threads[tmp].task) 1070 continue; 1071 1072 ret = kthread_stop(threads[tmp].task); 1073 if (ret) { 1074 pr_err("kthread for other engine %s failed, err=%d\n", 1075 other->name, ret); 1076 if (!err) 1077 err = ret; 1078 } 1079 put_task_struct(threads[tmp].task); 1080 1081 if (other->uabi_class != engine->uabi_class && 1082 threads[tmp].resets != 1083 i915_reset_engine_count(global, other)) { 1084 pr_err("Innocent engine %s was reset (count=%ld)\n", 1085 other->name, 1086 i915_reset_engine_count(global, other) - 1087 threads[tmp].resets); 1088 if (!err) 1089 err = -EINVAL; 1090 } 1091 } 1092 1093 if (device != i915_reset_count(global)) { 1094 pr_err("Global reset (count=%ld)!\n", 1095 i915_reset_count(global) - device); 1096 if (!err) 1097 err = -EINVAL; 1098 } 1099 1100 if (err) 1101 break; 1102 1103 err = igt_flush_test(gt->i915); 1104 if (err) 1105 break; 1106 } 1107 1108 if (intel_gt_is_wedged(gt)) 1109 err = -EIO; 1110 1111 if (flags & TEST_ACTIVE) 1112 hang_fini(&h); 1113 1114 return err; 1115 } 1116 1117 static int igt_reset_engines(void *arg) 1118 { 1119 static const struct { 1120 const char *name; 1121 unsigned int flags; 1122 } phases[] = { 1123 { "idle", 0 }, 1124 { "active", TEST_ACTIVE }, 1125 { "others-idle", TEST_OTHERS }, 1126 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1127 { 1128 "others-priority", 1129 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1130 }, 1131 { 1132 "self-priority", 1133 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1134 }, 1135 { } 1136 }; 1137 struct intel_gt *gt = arg; 1138 typeof(*phases) *p; 1139 int err; 1140 1141 for (p = phases; p->name; p++) { 1142 if (p->flags & TEST_PRIORITY) { 1143 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1144 continue; 1145 } 1146 1147 err = __igt_reset_engines(arg, p->name, p->flags); 1148 if (err) 1149 return err; 1150 } 1151 1152 return 0; 1153 } 1154 1155 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1156 { 1157 u32 count = i915_reset_count(>->i915->gpu_error); 1158 1159 intel_gt_reset(gt, mask, NULL); 1160 1161 return count; 1162 } 1163 1164 static int igt_reset_wait(void *arg) 1165 { 1166 struct intel_gt *gt = arg; 1167 struct i915_gpu_error *global = >->i915->gpu_error; 1168 struct intel_engine_cs *engine = gt->engine[RCS0]; 1169 struct i915_request *rq; 1170 unsigned int reset_count; 1171 struct hang h; 1172 long timeout; 1173 int err; 1174 1175 if (!engine || !intel_engine_can_store_dword(engine)) 1176 return 0; 1177 1178 /* Check that we detect a stuck waiter and issue a reset */ 1179 1180 igt_global_reset_lock(gt); 1181 1182 err = hang_init(&h, gt); 1183 if (err) 1184 goto unlock; 1185 1186 rq = hang_create_request(&h, engine); 1187 if (IS_ERR(rq)) { 1188 err = PTR_ERR(rq); 1189 goto fini; 1190 } 1191 1192 i915_request_get(rq); 1193 i915_request_add(rq); 1194 1195 if (!wait_until_running(&h, rq)) { 1196 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1197 1198 pr_err("%s: Failed to start request %llx, at %x\n", 1199 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1200 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1201 1202 intel_gt_set_wedged(gt); 1203 1204 err = -EIO; 1205 goto out_rq; 1206 } 1207 1208 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1209 1210 timeout = i915_request_wait(rq, 0, 10); 1211 if (timeout < 0) { 1212 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1213 timeout); 1214 err = timeout; 1215 goto out_rq; 1216 } 1217 1218 if (i915_reset_count(global) == reset_count) { 1219 pr_err("No GPU reset recorded!\n"); 1220 err = -EINVAL; 1221 goto out_rq; 1222 } 1223 1224 out_rq: 1225 i915_request_put(rq); 1226 fini: 1227 hang_fini(&h); 1228 unlock: 1229 igt_global_reset_unlock(gt); 1230 1231 if (intel_gt_is_wedged(gt)) 1232 return -EIO; 1233 1234 return err; 1235 } 1236 1237 struct evict_vma { 1238 struct completion completion; 1239 struct i915_vma *vma; 1240 }; 1241 1242 static int evict_vma(void *data) 1243 { 1244 struct evict_vma *arg = data; 1245 struct i915_address_space *vm = arg->vma->vm; 1246 struct drm_mm_node evict = arg->vma->node; 1247 int err; 1248 1249 complete(&arg->completion); 1250 1251 mutex_lock(&vm->mutex); 1252 err = i915_gem_evict_for_node(vm, &evict, 0); 1253 mutex_unlock(&vm->mutex); 1254 1255 return err; 1256 } 1257 1258 static int evict_fence(void *data) 1259 { 1260 struct evict_vma *arg = data; 1261 int err; 1262 1263 complete(&arg->completion); 1264 1265 /* Mark the fence register as dirty to force the mmio update. */ 1266 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1267 if (err) { 1268 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1269 return err; 1270 } 1271 1272 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1273 if (err) { 1274 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1275 return err; 1276 } 1277 1278 err = i915_vma_pin_fence(arg->vma); 1279 i915_vma_unpin(arg->vma); 1280 if (err) { 1281 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1282 return err; 1283 } 1284 1285 i915_vma_unpin_fence(arg->vma); 1286 1287 return 0; 1288 } 1289 1290 static int __igt_reset_evict_vma(struct intel_gt *gt, 1291 struct i915_address_space *vm, 1292 int (*fn)(void *), 1293 unsigned int flags) 1294 { 1295 struct intel_engine_cs *engine = gt->engine[RCS0]; 1296 struct drm_i915_gem_object *obj; 1297 struct task_struct *tsk = NULL; 1298 struct i915_request *rq; 1299 struct evict_vma arg; 1300 struct hang h; 1301 unsigned int pin_flags; 1302 int err; 1303 1304 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1305 return 0; 1306 1307 if (!engine || !intel_engine_can_store_dword(engine)) 1308 return 0; 1309 1310 /* Check that we can recover an unbind stuck on a hanging request */ 1311 1312 err = hang_init(&h, gt); 1313 if (err) 1314 return err; 1315 1316 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1317 if (IS_ERR(obj)) { 1318 err = PTR_ERR(obj); 1319 goto fini; 1320 } 1321 1322 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1323 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1324 if (err) { 1325 pr_err("Invalid X-tiling settings; err:%d\n", err); 1326 goto out_obj; 1327 } 1328 } 1329 1330 arg.vma = i915_vma_instance(obj, vm, NULL); 1331 if (IS_ERR(arg.vma)) { 1332 err = PTR_ERR(arg.vma); 1333 goto out_obj; 1334 } 1335 1336 rq = hang_create_request(&h, engine); 1337 if (IS_ERR(rq)) { 1338 err = PTR_ERR(rq); 1339 goto out_obj; 1340 } 1341 1342 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1343 1344 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1345 pin_flags |= PIN_MAPPABLE; 1346 1347 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1348 if (err) { 1349 i915_request_add(rq); 1350 goto out_obj; 1351 } 1352 1353 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1354 err = i915_vma_pin_fence(arg.vma); 1355 if (err) { 1356 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1357 i915_vma_unpin(arg.vma); 1358 i915_request_add(rq); 1359 goto out_obj; 1360 } 1361 } 1362 1363 i915_vma_lock(arg.vma); 1364 err = i915_request_await_object(rq, arg.vma->obj, 1365 flags & EXEC_OBJECT_WRITE); 1366 if (err == 0) 1367 err = i915_vma_move_to_active(arg.vma, rq, flags); 1368 i915_vma_unlock(arg.vma); 1369 1370 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1371 i915_vma_unpin_fence(arg.vma); 1372 i915_vma_unpin(arg.vma); 1373 1374 i915_request_get(rq); 1375 i915_request_add(rq); 1376 if (err) 1377 goto out_rq; 1378 1379 if (!wait_until_running(&h, rq)) { 1380 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1381 1382 pr_err("%s: Failed to start request %llx, at %x\n", 1383 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1384 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1385 1386 intel_gt_set_wedged(gt); 1387 goto out_reset; 1388 } 1389 1390 init_completion(&arg.completion); 1391 1392 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1393 if (IS_ERR(tsk)) { 1394 err = PTR_ERR(tsk); 1395 tsk = NULL; 1396 goto out_reset; 1397 } 1398 get_task_struct(tsk); 1399 1400 wait_for_completion(&arg.completion); 1401 1402 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1403 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1404 1405 pr_err("igt/evict_vma kthread did not wait\n"); 1406 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1407 1408 intel_gt_set_wedged(gt); 1409 goto out_reset; 1410 } 1411 1412 out_reset: 1413 igt_global_reset_lock(gt); 1414 fake_hangcheck(gt, rq->engine->mask); 1415 igt_global_reset_unlock(gt); 1416 1417 if (tsk) { 1418 struct intel_wedge_me w; 1419 1420 /* The reset, even indirectly, should take less than 10ms. */ 1421 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1422 err = kthread_stop(tsk); 1423 1424 put_task_struct(tsk); 1425 } 1426 1427 out_rq: 1428 i915_request_put(rq); 1429 out_obj: 1430 i915_gem_object_put(obj); 1431 fini: 1432 hang_fini(&h); 1433 if (intel_gt_is_wedged(gt)) 1434 return -EIO; 1435 1436 return err; 1437 } 1438 1439 static int igt_reset_evict_ggtt(void *arg) 1440 { 1441 struct intel_gt *gt = arg; 1442 1443 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1444 evict_vma, EXEC_OBJECT_WRITE); 1445 } 1446 1447 static int igt_reset_evict_ppgtt(void *arg) 1448 { 1449 struct intel_gt *gt = arg; 1450 struct i915_ppgtt *ppgtt; 1451 int err; 1452 1453 /* aliasing == global gtt locking, covered above */ 1454 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1455 return 0; 1456 1457 ppgtt = i915_ppgtt_create(gt); 1458 if (IS_ERR(ppgtt)) 1459 return PTR_ERR(ppgtt); 1460 1461 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1462 evict_vma, EXEC_OBJECT_WRITE); 1463 i915_vm_put(&ppgtt->vm); 1464 1465 return err; 1466 } 1467 1468 static int igt_reset_evict_fence(void *arg) 1469 { 1470 struct intel_gt *gt = arg; 1471 1472 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1473 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1474 } 1475 1476 static int wait_for_others(struct intel_gt *gt, 1477 struct intel_engine_cs *exclude) 1478 { 1479 struct intel_engine_cs *engine; 1480 enum intel_engine_id id; 1481 1482 for_each_engine(engine, gt, id) { 1483 if (engine == exclude) 1484 continue; 1485 1486 if (!wait_for_idle(engine)) 1487 return -EIO; 1488 } 1489 1490 return 0; 1491 } 1492 1493 static int igt_reset_queue(void *arg) 1494 { 1495 struct intel_gt *gt = arg; 1496 struct i915_gpu_error *global = >->i915->gpu_error; 1497 struct intel_engine_cs *engine; 1498 enum intel_engine_id id; 1499 struct hang h; 1500 int err; 1501 1502 /* Check that we replay pending requests following a hang */ 1503 1504 igt_global_reset_lock(gt); 1505 1506 err = hang_init(&h, gt); 1507 if (err) 1508 goto unlock; 1509 1510 for_each_engine(engine, gt, id) { 1511 struct i915_request *prev; 1512 IGT_TIMEOUT(end_time); 1513 unsigned int count; 1514 1515 if (!intel_engine_can_store_dword(engine)) 1516 continue; 1517 1518 prev = hang_create_request(&h, engine); 1519 if (IS_ERR(prev)) { 1520 err = PTR_ERR(prev); 1521 goto fini; 1522 } 1523 1524 i915_request_get(prev); 1525 i915_request_add(prev); 1526 1527 count = 0; 1528 do { 1529 struct i915_request *rq; 1530 unsigned int reset_count; 1531 1532 rq = hang_create_request(&h, engine); 1533 if (IS_ERR(rq)) { 1534 err = PTR_ERR(rq); 1535 goto fini; 1536 } 1537 1538 i915_request_get(rq); 1539 i915_request_add(rq); 1540 1541 /* 1542 * XXX We don't handle resetting the kernel context 1543 * very well. If we trigger a device reset twice in 1544 * quick succession while the kernel context is 1545 * executing, we may end up skipping the breadcrumb. 1546 * This is really only a problem for the selftest as 1547 * normally there is a large interlude between resets 1548 * (hangcheck), or we focus on resetting just one 1549 * engine and so avoid repeatedly resetting innocents. 1550 */ 1551 err = wait_for_others(gt, engine); 1552 if (err) { 1553 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1554 __func__, engine->name); 1555 i915_request_put(rq); 1556 i915_request_put(prev); 1557 1558 GEM_TRACE_DUMP(); 1559 intel_gt_set_wedged(gt); 1560 goto fini; 1561 } 1562 1563 if (!wait_until_running(&h, prev)) { 1564 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1565 1566 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1567 __func__, engine->name, 1568 prev->fence.seqno, hws_seqno(&h, prev)); 1569 intel_engine_dump(engine, &p, 1570 "%s\n", engine->name); 1571 1572 i915_request_put(rq); 1573 i915_request_put(prev); 1574 1575 intel_gt_set_wedged(gt); 1576 1577 err = -EIO; 1578 goto fini; 1579 } 1580 1581 reset_count = fake_hangcheck(gt, BIT(id)); 1582 1583 if (prev->fence.error != -EIO) { 1584 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1585 prev->fence.error); 1586 i915_request_put(rq); 1587 i915_request_put(prev); 1588 err = -EINVAL; 1589 goto fini; 1590 } 1591 1592 if (rq->fence.error) { 1593 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1594 rq->fence.error); 1595 i915_request_put(rq); 1596 i915_request_put(prev); 1597 err = -EINVAL; 1598 goto fini; 1599 } 1600 1601 if (i915_reset_count(global) == reset_count) { 1602 pr_err("No GPU reset recorded!\n"); 1603 i915_request_put(rq); 1604 i915_request_put(prev); 1605 err = -EINVAL; 1606 goto fini; 1607 } 1608 1609 i915_request_put(prev); 1610 prev = rq; 1611 count++; 1612 } while (time_before(jiffies, end_time)); 1613 pr_info("%s: Completed %d queued resets\n", 1614 engine->name, count); 1615 1616 *h.batch = MI_BATCH_BUFFER_END; 1617 intel_gt_chipset_flush(engine->gt); 1618 1619 i915_request_put(prev); 1620 1621 err = igt_flush_test(gt->i915); 1622 if (err) 1623 break; 1624 } 1625 1626 fini: 1627 hang_fini(&h); 1628 unlock: 1629 igt_global_reset_unlock(gt); 1630 1631 if (intel_gt_is_wedged(gt)) 1632 return -EIO; 1633 1634 return err; 1635 } 1636 1637 static int igt_handle_error(void *arg) 1638 { 1639 struct intel_gt *gt = arg; 1640 struct i915_gpu_error *global = >->i915->gpu_error; 1641 struct intel_engine_cs *engine = gt->engine[RCS0]; 1642 struct hang h; 1643 struct i915_request *rq; 1644 struct i915_gpu_coredump *error; 1645 int err; 1646 1647 /* Check that we can issue a global GPU and engine reset */ 1648 1649 if (!intel_has_reset_engine(gt)) 1650 return 0; 1651 1652 if (!engine || !intel_engine_can_store_dword(engine)) 1653 return 0; 1654 1655 err = hang_init(&h, gt); 1656 if (err) 1657 return err; 1658 1659 rq = hang_create_request(&h, engine); 1660 if (IS_ERR(rq)) { 1661 err = PTR_ERR(rq); 1662 goto err_fini; 1663 } 1664 1665 i915_request_get(rq); 1666 i915_request_add(rq); 1667 1668 if (!wait_until_running(&h, rq)) { 1669 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1670 1671 pr_err("%s: Failed to start request %llx, at %x\n", 1672 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1673 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1674 1675 intel_gt_set_wedged(gt); 1676 1677 err = -EIO; 1678 goto err_request; 1679 } 1680 1681 /* Temporarily disable error capture */ 1682 error = xchg(&global->first_error, (void *)-1); 1683 1684 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1685 1686 xchg(&global->first_error, error); 1687 1688 if (rq->fence.error != -EIO) { 1689 pr_err("Guilty request not identified!\n"); 1690 err = -EINVAL; 1691 goto err_request; 1692 } 1693 1694 err_request: 1695 i915_request_put(rq); 1696 err_fini: 1697 hang_fini(&h); 1698 return err; 1699 } 1700 1701 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1702 const struct igt_atomic_section *p, 1703 const char *mode) 1704 { 1705 struct tasklet_struct * const t = &engine->execlists.tasklet; 1706 int err; 1707 1708 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1709 engine->name, mode, p->name); 1710 1711 if (t->func) 1712 tasklet_disable(t); 1713 if (strcmp(p->name, "softirq")) 1714 local_bh_disable(); 1715 p->critical_section_begin(); 1716 1717 err = __intel_engine_reset_bh(engine, NULL); 1718 1719 p->critical_section_end(); 1720 if (strcmp(p->name, "softirq")) 1721 local_bh_enable(); 1722 if (t->func) { 1723 tasklet_enable(t); 1724 tasklet_hi_schedule(t); 1725 } 1726 1727 if (err) 1728 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1729 engine->name, mode, p->name); 1730 1731 return err; 1732 } 1733 1734 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1735 const struct igt_atomic_section *p) 1736 { 1737 struct i915_request *rq; 1738 struct hang h; 1739 int err; 1740 1741 err = __igt_atomic_reset_engine(engine, p, "idle"); 1742 if (err) 1743 return err; 1744 1745 err = hang_init(&h, engine->gt); 1746 if (err) 1747 return err; 1748 1749 rq = hang_create_request(&h, engine); 1750 if (IS_ERR(rq)) { 1751 err = PTR_ERR(rq); 1752 goto out; 1753 } 1754 1755 i915_request_get(rq); 1756 i915_request_add(rq); 1757 1758 if (wait_until_running(&h, rq)) { 1759 err = __igt_atomic_reset_engine(engine, p, "active"); 1760 } else { 1761 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1762 __func__, engine->name, 1763 rq->fence.seqno, hws_seqno(&h, rq)); 1764 intel_gt_set_wedged(engine->gt); 1765 err = -EIO; 1766 } 1767 1768 if (err == 0) { 1769 struct intel_wedge_me w; 1770 1771 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1772 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1773 if (intel_gt_is_wedged(engine->gt)) 1774 err = -EIO; 1775 } 1776 1777 i915_request_put(rq); 1778 out: 1779 hang_fini(&h); 1780 return err; 1781 } 1782 1783 static int igt_reset_engines_atomic(void *arg) 1784 { 1785 struct intel_gt *gt = arg; 1786 const typeof(*igt_atomic_phases) *p; 1787 int err = 0; 1788 1789 /* Check that the engines resets are usable from atomic context */ 1790 1791 if (!intel_has_reset_engine(gt)) 1792 return 0; 1793 1794 if (intel_uc_uses_guc_submission(>->uc)) 1795 return 0; 1796 1797 igt_global_reset_lock(gt); 1798 1799 /* Flush any requests before we get started and check basics */ 1800 if (!igt_force_reset(gt)) 1801 goto unlock; 1802 1803 for (p = igt_atomic_phases; p->name; p++) { 1804 struct intel_engine_cs *engine; 1805 enum intel_engine_id id; 1806 1807 for_each_engine(engine, gt, id) { 1808 err = igt_atomic_reset_engine(engine, p); 1809 if (err) 1810 goto out; 1811 } 1812 } 1813 1814 out: 1815 /* As we poke around the guts, do a full reset before continuing. */ 1816 igt_force_reset(gt); 1817 unlock: 1818 igt_global_reset_unlock(gt); 1819 1820 return err; 1821 } 1822 1823 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 1824 { 1825 static const struct i915_subtest tests[] = { 1826 SUBTEST(igt_hang_sanitycheck), 1827 SUBTEST(igt_reset_nop), 1828 SUBTEST(igt_reset_nop_engine), 1829 SUBTEST(igt_reset_idle_engine), 1830 SUBTEST(igt_reset_active_engine), 1831 SUBTEST(igt_reset_fail_engine), 1832 SUBTEST(igt_reset_engines), 1833 SUBTEST(igt_reset_engines_atomic), 1834 SUBTEST(igt_reset_queue), 1835 SUBTEST(igt_reset_wait), 1836 SUBTEST(igt_reset_evict_ggtt), 1837 SUBTEST(igt_reset_evict_ppgtt), 1838 SUBTEST(igt_reset_evict_fence), 1839 SUBTEST(igt_handle_error), 1840 }; 1841 struct intel_gt *gt = &i915->gt; 1842 intel_wakeref_t wakeref; 1843 int err; 1844 1845 if (!intel_has_gpu_reset(gt)) 1846 return 0; 1847 1848 if (intel_gt_is_wedged(gt)) 1849 return -EIO; /* we're long past hope of a successful reset */ 1850 1851 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1852 1853 err = intel_gt_live_subtests(tests, gt); 1854 1855 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1856 1857 return err; 1858 } 1859