/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

/* Each fence context uses its own dword slot (modulo page size) in the HWS */
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * Build a batch that writes the request's seqno into the HWS page and
	 * then jumps back to its own start, spinning until it is interrupted
	 * (e.g. by a reset, or by preemption at the MI_ARB_CHECK).
	 */
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;
	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

		for_each_engine(engine, gt->i915, id) {
			int i;
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

/* Try to unbind the target node; expected to block until the hang is reset */
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;
		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);
	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt->i915))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}