/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

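/*
 * move_to_active() below performs the usual request-tracking dance under the
 * vma lock: serialise the request against the object's reservation and then
 * mark the vma as active (busy) for the lifetime of the request.
 */
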
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * The "hang" is a self-referencing batch: it writes the request's
	 * seqno to the hws page (so the test can see it running) and then
	 * loops back to its own start forever. The MI_ARB_CHECKs keep the
	 * loop preemptible.
	 */
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

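/*
 * The spinner is stopped by rewriting the first dword of the batch to
 * MI_BATCH_BUFFER_END and flushing, so the infinite loop falls through and
 * the request can complete; hang_fini() below does exactly that before
 * releasing the objects.
 */
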
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

/*
 * Park the heartbeat (while holding an engine-pm wakeref) so that the
 * background heartbeat pulses cannot disturb the engine while a test is
 * prodding it.
 */
static void engine_heartbeat_disable(struct intel_engine_cs *engine,
				     unsigned long *saved)
{
	*saved = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine,
				    unsigned long saved)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms = saved;
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

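/*
 * The igt_reset_* tests below share a common shape: optionally submit a
 * hanging batch, wait for it to start running via its seqno write to the
 * hws object, perform a full or per-engine reset, and then check the reset
 * counters in i915_gpu_error and that the engine recovers afterwards.
 */
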
static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

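/*
 * __igt_reset_engine() is shared by the idle and active flavours below: with
 * active=true each iteration first submits a hanging batch and waits for it
 * to start, so the engine reset has a guilty request to process; with
 * active=false the engine is reset while idle, which should be a no-op as
 * far as everyone else is concerned.
 */
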
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

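/*
 * The TEST_* flags select how __igt_reset_engines() stresses the reset:
 * TEST_ACTIVE hangs the engine under test, TEST_OTHERS spawns an
 * active_engine() kthread per other engine to keep them busy, TEST_SELF also
 * spawns one on the engine being reset, and TEST_PRIORITY sprinkles random
 * priorities over the background requests.
 */
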
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--) /* release every context created so far */
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

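/*
 * igt_reset_engines() walks the table of phases below, exercising
 * __igt_reset_engines() with each combination of TEST_* flags; the priority
 * phases are skipped when the scheduler does not support priorities.
 */
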
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

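/*
 * fake_hangcheck() lets the tests below trigger a reset directly, returning
 * the pre-reset count so callers can verify the reset was recorded, instead
 * of waiting for the real hang detection to fire.
 */
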
static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

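/*
 * __igt_reset_evict_vma() marks a freshly pinned vma as busy with the
 * hanging request and then runs one of the kthreads above against it; the
 * kthread must block on the stuck request, and the subsequent reset must
 * unblock it, proving that an eviction (or fence update) stuck behind a
 * hang can be recovered.
 */
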
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

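/*
 * igt_atomic_reset_engine() exercises one igt_atomic_phases section: an
 * engine reset is attempted first while the engine is idle and then while a
 * hanging batch is running, with __igt_atomic_reset_engine() above wrapping
 * the reset inside the section's critical_section_begin()/end() pair.
 */
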
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}