/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}
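/*
 * hang_create_request() emits a self-referencing batch: it stores the
 * request's fence seqno into the slot of the HWS page indexed by the
 * request's fence context (so that wait_until_running() can observe that
 * the batch has begun executing), then loops back to its own start with
 * MI_BATCH_BUFFER_START. The batch therefore spins until its first dword
 * is overwritten with MI_BATCH_BUFFER_END, as hang_fini() and
 * igt_hang_sanitycheck() do.
 */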
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

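		/*
		 * Queue a small burst of otherwise empty requests on every
		 * engine so that the full GPU reset below lands while work
		 * is nominally in flight.
		 */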
		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
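		/*
		 * The reset loop for this engine is done; release the engine
		 * wakeref and then check that the recorded engine-reset count
		 * matches what we performed, that no innocent engine was
		 * reset, and that no full GPU reset was triggered.
		 */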
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

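	/*
	 * Make sure the hanging batch has actually started executing before
	 * we fake a hangcheck-triggered reset and wait on the request.
	 */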
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

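	/*
	 * Error capture was parked while intel_gt_handle_error() ran; retake
	 * struct_mutex and confirm the guilty request was marked with -EIO.
	 */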
	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt->i915))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}