/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}
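/*
 * hang_create_request() emits a batch that deliberately never completes:
 *
 *   [0] MI_STORE_DWORD_IMM    (write rq->fence.seqno into the HWS slot)
 *       ... a KiB of zero padding ...
 *   [n] MI_BATCH_BUFFER_START -> back to [0]
 *       MI_BATCH_BUFFER_END   (never reached)
 *
 * The seqno write lets wait_until_running() observe that the spinner has
 * actually begun executing; the spin is only broken when a test (or
 * hang_fini()) rewrites the first dword of the batch to
 * MI_BATCH_BUFFER_END and flushes the chipset.
 */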
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
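/*
 * Read back the seqno the spinner wrote. The slot is the request's fence
 * context reduced modulo the number of dwords in a page, matching the
 * address computed by hws_address().
 */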
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}
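/*
 * Park the heartbeat around each test: left running, it could notice the
 * deliberately hung spinner and issue its own resets behind the test's
 * back. The engine_pm reference keeps the engine awake while its
 * heartbeat is disabled.
 */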
static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms =
		engine->defaults.heartbeat_interval_ms;
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}
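/*
 * The per-engine variant: spam engine resets while only kernel-submitted
 * requests are in flight, and check that each reset is accounted to the
 * engine rather than recorded as a full GPU reset.
 */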
static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
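/*
 * __igt_reset_engine() drives intel_engine_reset() in two modes: with
 * active=false the engine is left idle, so each reset should be a no-op;
 * with active=true a hanging spinner is running on the engine when the
 * reset fires. In both cases only the per-engine reset count may advance.
 */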
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}
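/*
 * active_engine() runs as a kthread, keeping a sliding window of eight
 * requests in flight on its engine (optionally at randomised priorities)
 * until kthread_stop(). It provides the "innocent" background load
 * against which __igt_reset_engines() resets a different engine.
 */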
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--) /* unwind all prior contexts, including ce[0] */
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/*
	 * Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s): failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s): failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}
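/*
 * fake_hangcheck() stands in for the hangcheck that would fire in normal
 * use: it samples the global reset count and then forces an immediate
 * reset of the given engines. Callers compare i915_reset_count() against
 * the returned value to check that the reset was recorded.
 */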
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}
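/*
 * __igt_reset_evict_vma() checks that an unbind (or fence update) blocked
 * behind a hanging request is released by a GPU reset: a worker kthread
 * tries to evict (or re-tile) the vma the spinner holds active, and is
 * expected to complete shortly after fake_hangcheck() resets the engine.
 */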
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}
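/*
 * igt_reset_queue() keeps a hanging request at the head of the queue and
 * submits a fresh hanging request behind it before each reset. After the
 * reset, the guilty request must be marked with -EIO while the queued
 * request is replayed with a clean fence status, becoming the next victim.
 */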
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}
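/*
 * igt_reset_engines_atomic() replays the engine-reset check inside each
 * of the igt_atomic_phases critical sections (e.g. with interrupts,
 * softirqs or preemption disabled), proving that intel_engine_reset()
 * never sleeps. It is skipped under GuC submission, whose firmware-
 * mediated resets are not exercised here.
 */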
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}