/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_gem_utils.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_wedge_me.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_context.h"
#include "selftests/mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	err = i915_vma_move_to_active(vma, rq, flags);
	if (err)
		return err;

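	/*
	 * Keep the object alive while it is busy on the GPU: take an extra
	 * reference here that is only dropped once the object idles again.
	 */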
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	return 0;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						i915_coherent_map_type(h->i915));
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * The batch writes the request's seqno to the HWS (so we can tell it
	 * has started running) and then spins by looping back on itself with
	 * MI_BATCH_BUFFER_START; the MI_ARB_CHECKs leave room for preemption,
	 * and the tests stop it by overwriting the loop with
	 * MI_BATCH_BUFFER_END.
	 */
	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

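	/*
	 * Mark the start of the user payload so that, after a reset, we can
	 * tell whether this request had actually begun executing.
	 */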
	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct igt_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = 0;
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			timeout = i915_request_wait(rq,
						    I915_WAIT_LOCKED,
						    MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
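	/* Keep the device awake for the duration of the reset loop. */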
	wakeref = intel_runtime_pm_get(i915);
	reset_count = i915_reset_count(&i915->gpu_error);
	count = 0;
	do {
		mutex_lock(&i915->drm.struct_mutex);
		for_each_engine(engine, i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}
		mutex_unlock(&i915->drm.struct_mutex);

		igt_global_reset_lock(i915);
		i915_reset(i915, ALL_ENGINES, NULL);
		igt_global_reset_unlock(i915);
		if (i915_reset_failed(i915)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(&i915->gpu_error) !=
		    reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);

out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(i915))
		return 0;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(i915);
	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			mutex_unlock(&i915->drm.struct_mutex);

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

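			/*
			 * An engine reset must not be accounted as (or
			 * escalate to) a full GPU reset.
			 */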
			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);
out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					i915_gem_set_wedged(i915);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);
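
		/* Report and cross-check the per-engine reset bookkeeping. */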
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&i915->drm.struct_mutex);
		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct drm_i915_private *i915,
			  intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

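	/*
	 * The hanging batch writes its seqno to the HWS as soon as it starts
	 * executing, so wait for that before declaring the engine hung.
	 */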
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(i915, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = i915_vma_move_to_active(arg.vma, rq, flags);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(i915);
	fake_hangcheck(rq->i915, rq->engine->mask);
	igt_global_reset_unlock(i915);

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(i915, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, engine->mask, 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = i915_reset_engine(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, i915);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		i915_gem_set_wedged(i915);
		err = -EIO;
	}

	if (err == 0) {
		struct igt_wedge_me w;

		igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/)
			i915_request_wait(rq,
					  I915_WAIT_LOCKED,
					  MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (USES_GUC_SUBMISSION(i915))
		return 0;

	igt_global_reset_lock(i915);
	mutex_lock(&i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(i915))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(i915);

unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(i915))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915, wakeref);

	return err;
}