/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

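	/*
	 * Serialise the request against any prior users of the object and
	 * then track the vma as active: the vma (and its backing store)
	 * must not be unbound until this request has been retired.
	 */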
	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
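	/*
	 * The MI_BATCH_BUFFER_START above jumps back to the start of the
	 * batch, so the request spins until it is reset or the test
	 * overwrites the first dword with MI_BATCH_BUFFER_END (see
	 * hang_fini()). Flush the CPU writes so the GPU picks up the
	 * freshly written batch.
	 */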
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			/* Release every context created so far, ce[0] included */
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
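	 * Optionally, background kthreads (see active_engine()) keep the
	 * other engines (and/or this engine) busy while we hammer the target
	 * engine with resets, and afterwards we check that no innocent
	 * engine recorded a reset and that no full GPU reset crept in.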
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
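		/*
		 * Release our claim on engine reset and restart the
		 * heartbeat before comparing the number of resets we
		 * performed against the driver's bookkeeping.
		 */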
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	tasklet_enable(t);
	tasklet_hi_schedule(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}