/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_wedge_me.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

/*
 * Build a hanging batch: it writes the request's seqno into the per-context
 * slot of the HWS page and then spins, with MI_BATCH_BUFFER_START looping
 * back to the start of the batch, until the test overwrites the first dword
 * with MI_BATCH_BUFFER_END.
 */
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = h->ctx->vm ?: &i915->ggtt.vm;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						i915_coherent_map_type(h->i915));
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct igt_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = 0;
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	reset_count = i915_reset_count(&i915->gpu_error);
	count = 0;
	do {
		mutex_lock(&i915->drm.struct_mutex);
		for_each_engine(engine, i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}
		mutex_unlock(&i915->drm.struct_mutex);

		igt_global_reset_lock(i915);
		i915_reset(i915, ALL_ENGINES, NULL);
		igt_global_reset_unlock(i915);
		if (i915_reset_failed(i915)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(&i915->gpu_error) !=
		    reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(i915))
		return 0;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			mutex_unlock(&i915->drm.struct_mutex);

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					i915_gem_set_wedged(i915);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);
pr_info("i915_reset_engine(%s:%s): %lu resets\n", 938 engine->name, test_name, count); 939 940 reported = i915_reset_engine_count(&i915->gpu_error, engine); 941 reported -= threads[engine->id].resets; 942 if (reported != count) { 943 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 944 engine->name, test_name, count, reported); 945 if (!err) 946 err = -EINVAL; 947 } 948 949 unwind: 950 for_each_engine(other, i915, tmp) { 951 int ret; 952 953 if (!threads[tmp].task) 954 continue; 955 956 ret = kthread_stop(threads[tmp].task); 957 if (ret) { 958 pr_err("kthread for other engine %s failed, err=%d\n", 959 other->name, ret); 960 if (!err) 961 err = ret; 962 } 963 put_task_struct(threads[tmp].task); 964 965 if (other != engine && 966 threads[tmp].resets != 967 i915_reset_engine_count(&i915->gpu_error, other)) { 968 pr_err("Innocent engine %s was reset (count=%ld)\n", 969 other->name, 970 i915_reset_engine_count(&i915->gpu_error, 971 other) - 972 threads[tmp].resets); 973 if (!err) 974 err = -EINVAL; 975 } 976 } 977 978 if (global != i915_reset_count(&i915->gpu_error)) { 979 pr_err("Global reset (count=%ld)!\n", 980 i915_reset_count(&i915->gpu_error) - global); 981 if (!err) 982 err = -EINVAL; 983 } 984 985 if (err) 986 break; 987 988 mutex_lock(&i915->drm.struct_mutex); 989 err = igt_flush_test(i915, I915_WAIT_LOCKED); 990 mutex_unlock(&i915->drm.struct_mutex); 991 if (err) 992 break; 993 } 994 995 if (i915_reset_failed(i915)) 996 err = -EIO; 997 998 if (flags & TEST_ACTIVE) { 999 mutex_lock(&i915->drm.struct_mutex); 1000 hang_fini(&h); 1001 mutex_unlock(&i915->drm.struct_mutex); 1002 } 1003 1004 return err; 1005 } 1006 1007 static int igt_reset_engines(void *arg) 1008 { 1009 static const struct { 1010 const char *name; 1011 unsigned int flags; 1012 } phases[] = { 1013 { "idle", 0 }, 1014 { "active", TEST_ACTIVE }, 1015 { "others-idle", TEST_OTHERS }, 1016 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1017 { 1018 "others-priority", 1019 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1020 }, 1021 { 1022 "self-priority", 1023 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1024 }, 1025 { } 1026 }; 1027 struct drm_i915_private *i915 = arg; 1028 typeof(*phases) *p; 1029 int err; 1030 1031 for (p = phases; p->name; p++) { 1032 if (p->flags & TEST_PRIORITY) { 1033 if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1034 continue; 1035 } 1036 1037 err = __igt_reset_engines(arg, p->name, p->flags); 1038 if (err) 1039 return err; 1040 } 1041 1042 return 0; 1043 } 1044 1045 static u32 fake_hangcheck(struct drm_i915_private *i915, 1046 intel_engine_mask_t mask) 1047 { 1048 u32 count = i915_reset_count(&i915->gpu_error); 1049 1050 i915_reset(i915, mask, NULL); 1051 1052 return count; 1053 } 1054 1055 static int igt_reset_wait(void *arg) 1056 { 1057 struct drm_i915_private *i915 = arg; 1058 struct i915_request *rq; 1059 unsigned int reset_count; 1060 struct hang h; 1061 long timeout; 1062 int err; 1063 1064 if (!intel_engine_can_store_dword(i915->engine[RCS0])) 1065 return 0; 1066 1067 /* Check that we detect a stuck waiter and issue a reset */ 1068 1069 igt_global_reset_lock(i915); 1070 1071 mutex_lock(&i915->drm.struct_mutex); 1072 err = hang_init(&h, i915); 1073 if (err) 1074 goto unlock; 1075 1076 rq = hang_create_request(&h, i915->engine[RCS0]); 1077 if (IS_ERR(rq)) { 1078 err = PTR_ERR(rq); 1079 goto fini; 1080 } 1081 1082 i915_request_get(rq); 1083 i915_request_add(rq); 1084 1085 if (!wait_until_running(&h, rq)) { 1086 struct drm_printer p = 
drm_info_printer(i915->drm.dev); 1087 1088 pr_err("%s: Failed to start request %llx, at %x\n", 1089 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1090 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1091 1092 i915_gem_set_wedged(i915); 1093 1094 err = -EIO; 1095 goto out_rq; 1096 } 1097 1098 reset_count = fake_hangcheck(i915, ALL_ENGINES); 1099 1100 timeout = i915_request_wait(rq, 0, 10); 1101 if (timeout < 0) { 1102 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1103 timeout); 1104 err = timeout; 1105 goto out_rq; 1106 } 1107 1108 if (i915_reset_count(&i915->gpu_error) == reset_count) { 1109 pr_err("No GPU reset recorded!\n"); 1110 err = -EINVAL; 1111 goto out_rq; 1112 } 1113 1114 out_rq: 1115 i915_request_put(rq); 1116 fini: 1117 hang_fini(&h); 1118 unlock: 1119 mutex_unlock(&i915->drm.struct_mutex); 1120 igt_global_reset_unlock(i915); 1121 1122 if (i915_reset_failed(i915)) 1123 return -EIO; 1124 1125 return err; 1126 } 1127 1128 struct evict_vma { 1129 struct completion completion; 1130 struct i915_vma *vma; 1131 }; 1132 1133 static int evict_vma(void *data) 1134 { 1135 struct evict_vma *arg = data; 1136 struct i915_address_space *vm = arg->vma->vm; 1137 struct drm_i915_private *i915 = vm->i915; 1138 struct drm_mm_node evict = arg->vma->node; 1139 int err; 1140 1141 complete(&arg->completion); 1142 1143 mutex_lock(&i915->drm.struct_mutex); 1144 err = i915_gem_evict_for_node(vm, &evict, 0); 1145 mutex_unlock(&i915->drm.struct_mutex); 1146 1147 return err; 1148 } 1149 1150 static int evict_fence(void *data) 1151 { 1152 struct evict_vma *arg = data; 1153 struct drm_i915_private *i915 = arg->vma->vm->i915; 1154 int err; 1155 1156 complete(&arg->completion); 1157 1158 mutex_lock(&i915->drm.struct_mutex); 1159 1160 /* Mark the fence register as dirty to force the mmio update. 
*/ 1161 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1162 if (err) { 1163 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1164 goto out_unlock; 1165 } 1166 1167 err = i915_vma_pin_fence(arg->vma); 1168 if (err) { 1169 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1170 goto out_unlock; 1171 } 1172 1173 i915_vma_unpin_fence(arg->vma); 1174 1175 out_unlock: 1176 mutex_unlock(&i915->drm.struct_mutex); 1177 1178 return err; 1179 } 1180 1181 static int __igt_reset_evict_vma(struct drm_i915_private *i915, 1182 struct i915_address_space *vm, 1183 int (*fn)(void *), 1184 unsigned int flags) 1185 { 1186 struct drm_i915_gem_object *obj; 1187 struct task_struct *tsk = NULL; 1188 struct i915_request *rq; 1189 struct evict_vma arg; 1190 struct hang h; 1191 int err; 1192 1193 if (!intel_engine_can_store_dword(i915->engine[RCS0])) 1194 return 0; 1195 1196 /* Check that we can recover an unbind stuck on a hanging request */ 1197 1198 mutex_lock(&i915->drm.struct_mutex); 1199 err = hang_init(&h, i915); 1200 if (err) 1201 goto unlock; 1202 1203 obj = i915_gem_object_create_internal(i915, SZ_1M); 1204 if (IS_ERR(obj)) { 1205 err = PTR_ERR(obj); 1206 goto fini; 1207 } 1208 1209 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1210 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1211 if (err) { 1212 pr_err("Invalid X-tiling settings; err:%d\n", err); 1213 goto out_obj; 1214 } 1215 } 1216 1217 arg.vma = i915_vma_instance(obj, vm, NULL); 1218 if (IS_ERR(arg.vma)) { 1219 err = PTR_ERR(arg.vma); 1220 goto out_obj; 1221 } 1222 1223 rq = hang_create_request(&h, i915->engine[RCS0]); 1224 if (IS_ERR(rq)) { 1225 err = PTR_ERR(rq); 1226 goto out_obj; 1227 } 1228 1229 err = i915_vma_pin(arg.vma, 0, 0, 1230 i915_vma_is_ggtt(arg.vma) ? 1231 PIN_GLOBAL | PIN_MAPPABLE : 1232 PIN_USER); 1233 if (err) { 1234 i915_request_add(rq); 1235 goto out_obj; 1236 } 1237 1238 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1239 err = i915_vma_pin_fence(arg.vma); 1240 if (err) { 1241 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1242 i915_vma_unpin(arg.vma); 1243 i915_request_add(rq); 1244 goto out_obj; 1245 } 1246 } 1247 1248 i915_vma_lock(arg.vma); 1249 err = i915_vma_move_to_active(arg.vma, rq, flags); 1250 i915_vma_unlock(arg.vma); 1251 1252 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1253 i915_vma_unpin_fence(arg.vma); 1254 i915_vma_unpin(arg.vma); 1255 1256 i915_request_get(rq); 1257 i915_request_add(rq); 1258 if (err) 1259 goto out_rq; 1260 1261 mutex_unlock(&i915->drm.struct_mutex); 1262 1263 if (!wait_until_running(&h, rq)) { 1264 struct drm_printer p = drm_info_printer(i915->drm.dev); 1265 1266 pr_err("%s: Failed to start request %llx, at %x\n", 1267 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1268 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1269 1270 i915_gem_set_wedged(i915); 1271 goto out_reset; 1272 } 1273 1274 init_completion(&arg.completion); 1275 1276 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1277 if (IS_ERR(tsk)) { 1278 err = PTR_ERR(tsk); 1279 tsk = NULL; 1280 goto out_reset; 1281 } 1282 get_task_struct(tsk); 1283 1284 wait_for_completion(&arg.completion); 1285 1286 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1287 struct drm_printer p = drm_info_printer(i915->drm.dev); 1288 1289 pr_err("igt/evict_vma kthread did not wait\n"); 1290 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1291 1292 i915_gem_set_wedged(i915); 1293 goto out_reset; 1294 } 1295 1296 out_reset: 1297 igt_global_reset_lock(i915); 1298 fake_hangcheck(rq->i915, rq->engine->mask); 
	igt_global_reset_unlock(i915);

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(i915, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, engine->mask, 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = i915_reset_engine(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, i915);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		i915_gem_set_wedged(i915);
		err = -EIO;
	}

	if (err == 0) {
		struct igt_wedge_me w;

		igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (USES_GUC_SUBMISSION(i915))
		return 0;

	igt_global_reset_lock(i915);
	mutex_lock(&i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(i915))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(i915);

unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(i915))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

	return err;
}