/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

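	/*
	 * Serialise against any prior users of the object (respecting the
	 * write flag) and then mark the vma as active for this request so
	 * that it cannot be unbound until the request is retired.
	 */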
	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
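	/*
	 * Flush the CPU writes to the batch out to memory before asking the
	 * GPU to execute it, so the engine sees the self-referencing loop we
	 * just wrote.
	 */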
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
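				/* Drop the reference we took on the most recent request. */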
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");
"active" : "idle"); 782 783 if (err) 784 break; 785 786 err = igt_flush_test(gt->i915); 787 if (err) 788 break; 789 } 790 791 if (intel_gt_is_wedged(gt)) 792 err = -EIO; 793 794 if (active) 795 hang_fini(&h); 796 797 return err; 798 } 799 800 static int igt_reset_idle_engine(void *arg) 801 { 802 return __igt_reset_engine(arg, false); 803 } 804 805 static int igt_reset_active_engine(void *arg) 806 { 807 return __igt_reset_engine(arg, true); 808 } 809 810 struct active_engine { 811 struct task_struct *task; 812 struct intel_engine_cs *engine; 813 unsigned long resets; 814 unsigned int flags; 815 }; 816 817 #define TEST_ACTIVE BIT(0) 818 #define TEST_OTHERS BIT(1) 819 #define TEST_SELF BIT(2) 820 #define TEST_PRIORITY BIT(3) 821 822 static int active_request_put(struct i915_request *rq) 823 { 824 int err = 0; 825 826 if (!rq) 827 return 0; 828 829 if (i915_request_wait(rq, 0, 5 * HZ) < 0) { 830 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 831 rq->engine->name, 832 rq->fence.context, 833 rq->fence.seqno); 834 GEM_TRACE_DUMP(); 835 836 intel_gt_set_wedged(rq->engine->gt); 837 err = -EIO; 838 } 839 840 i915_request_put(rq); 841 842 return err; 843 } 844 845 static int active_engine(void *data) 846 { 847 I915_RND_STATE(prng); 848 struct active_engine *arg = data; 849 struct intel_engine_cs *engine = arg->engine; 850 struct i915_request *rq[8] = {}; 851 struct intel_context *ce[ARRAY_SIZE(rq)]; 852 unsigned long count; 853 int err = 0; 854 855 for (count = 0; count < ARRAY_SIZE(ce); count++) { 856 ce[count] = intel_context_create(engine); 857 if (IS_ERR(ce[count])) { 858 err = PTR_ERR(ce[count]); 859 while (--count) 860 intel_context_put(ce[count]); 861 return err; 862 } 863 } 864 865 count = 0; 866 while (!kthread_should_stop()) { 867 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 868 struct i915_request *old = rq[idx]; 869 struct i915_request *new; 870 871 new = intel_context_create_request(ce[idx]); 872 if (IS_ERR(new)) { 873 err = PTR_ERR(new); 874 break; 875 } 876 877 rq[idx] = i915_request_get(new); 878 i915_request_add(new); 879 880 if (engine->schedule && arg->flags & TEST_PRIORITY) { 881 struct i915_sched_attr attr = { 882 .priority = 883 i915_prandom_u32_max_state(512, &prng), 884 }; 885 engine->schedule(rq[idx], &attr); 886 } 887 888 err = active_request_put(old); 889 if (err) 890 break; 891 892 cond_resched(); 893 } 894 895 for (count = 0; count < ARRAY_SIZE(rq); count++) { 896 int err__ = active_request_put(rq[count]); 897 898 /* Keep the first error */ 899 if (!err) 900 err = err__; 901 902 intel_context_put(ce[count]); 903 } 904 905 return err; 906 } 907 908 static int __igt_reset_engines(struct intel_gt *gt, 909 const char *test_name, 910 unsigned int flags) 911 { 912 struct i915_gpu_error *global = >->i915->gpu_error; 913 struct intel_engine_cs *engine, *other; 914 enum intel_engine_id id, tmp; 915 struct hang h; 916 int err = 0; 917 918 /* Check that issuing a reset on one engine does not interfere 919 * with any other engine. 
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
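		/*
		 * Drop our claim on the manual engine-reset flag and let the
		 * heartbeat run again before checking the reset bookkeeping.
		 */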
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
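		/*
		 * If kthread_stop() never returns because the reset failed to
		 * unblock the eviction, intel_wedge_me declares the GT wedged
		 * rather than letting the selftest hang forever.
		 */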
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}