1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2008-2018 Intel Corporation 4 */ 5 6 #include <linux/sched/mm.h> 7 #include <linux/stop_machine.h> 8 9 #include "display/intel_display.h" 10 #include "display/intel_overlay.h" 11 12 #include "gem/i915_gem_context.h" 13 14 #include "gt/intel_gt_regs.h" 15 16 #include "i915_drv.h" 17 #include "i915_file_private.h" 18 #include "i915_gpu_error.h" 19 #include "i915_irq.h" 20 #include "intel_breadcrumbs.h" 21 #include "intel_engine_pm.h" 22 #include "intel_engine_regs.h" 23 #include "intel_gt.h" 24 #include "intel_gt_pm.h" 25 #include "intel_gt_requests.h" 26 #include "intel_mchbar_regs.h" 27 #include "intel_pci_config.h" 28 #include "intel_reset.h" 29 30 #include "uc/intel_guc.h" 31 32 #define RESET_MAX_RETRIES 3 33 34 /* XXX How to handle concurrent GGTT updates using tiling registers? */ 35 #define RESET_UNDER_STOP_MACHINE 0 36 37 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 38 { 39 intel_uncore_rmw_fw(uncore, reg, 0, set); 40 } 41 42 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 43 { 44 intel_uncore_rmw_fw(uncore, reg, clr, 0); 45 } 46 47 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) 48 { 49 struct drm_i915_file_private *file_priv = ctx->file_priv; 50 unsigned long prev_hang; 51 unsigned int score; 52 53 if (IS_ERR_OR_NULL(file_priv)) 54 return; 55 56 score = 0; 57 if (banned) 58 score = I915_CLIENT_SCORE_CONTEXT_BAN; 59 60 prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 61 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 62 score += I915_CLIENT_SCORE_HANG_FAST; 63 64 if (score) { 65 atomic_add(score, &file_priv->ban_score); 66 67 drm_dbg(&ctx->i915->drm, 68 "client %s: gained %u ban score, now %u\n", 69 ctx->name, score, 70 atomic_read(&file_priv->ban_score)); 71 } 72 } 73 74 static bool mark_guilty(struct i915_request *rq) 75 { 76 struct i915_gem_context *ctx; 77 unsigned long prev_hang; 78 bool banned; 79 int i; 80 81 if (intel_context_is_closed(rq->context)) 82 return true; 83 84 rcu_read_lock(); 85 ctx = rcu_dereference(rq->context->gem_context); 86 if (ctx && !kref_get_unless_zero(&ctx->ref)) 87 ctx = NULL; 88 rcu_read_unlock(); 89 if (!ctx) 90 return intel_context_is_banned(rq->context); 91 92 atomic_inc(&ctx->guilty_count); 93 94 /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 95 if (!i915_gem_context_is_bannable(ctx)) { 96 banned = false; 97 goto out; 98 } 99 100 drm_notice(&ctx->i915->drm, 101 "%s context reset due to GPU hang\n", 102 ctx->name); 103 104 /* Record the timestamp for the last N hangs */ 105 prev_hang = ctx->hang_timestamp[0]; 106 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 107 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 108 ctx->hang_timestamp[i] = jiffies; 109 110 /* If we have hung N+1 times in rapid succession, we ban the context! */ 111 banned = !i915_gem_context_is_recoverable(ctx); 112 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 113 banned = true; 114 if (banned) 115 drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", 116 ctx->name, atomic_read(&ctx->guilty_count)); 117 118 client_mark_guilty(ctx, banned); 119 120 out: 121 i915_gem_context_put(ctx); 122 return banned; 123 } 124 125 static void mark_innocent(struct i915_request *rq) 126 { 127 struct i915_gem_context *ctx; 128 129 rcu_read_lock(); 130 ctx = rcu_dereference(rq->context->gem_context); 131 if (ctx) 132 atomic_inc(&ctx->active_count); 133 rcu_read_unlock(); 134 } 135 136 void __i915_request_reset(struct i915_request *rq, bool guilty) 137 { 138 bool banned = false; 139 140 RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); 141 GEM_BUG_ON(__i915_request_is_complete(rq)); 142 143 rcu_read_lock(); /* protect the GEM context */ 144 if (guilty) { 145 i915_request_set_error_once(rq, -EIO); 146 __i915_request_skip(rq); 147 banned = mark_guilty(rq); 148 } else { 149 i915_request_set_error_once(rq, -EAGAIN); 150 mark_innocent(rq); 151 } 152 rcu_read_unlock(); 153 154 if (banned) 155 intel_context_ban(rq->context, rq); 156 } 157 158 static bool i915_in_reset(struct pci_dev *pdev) 159 { 160 u8 gdrst; 161 162 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 163 return gdrst & GRDOM_RESET_STATUS; 164 } 165 166 static int i915_do_reset(struct intel_gt *gt, 167 intel_engine_mask_t engine_mask, 168 unsigned int retry) 169 { 170 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 171 int err; 172 173 /* Assert reset for at least 20 usec, and wait for acknowledgement. */ 174 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 175 udelay(50); 176 err = wait_for_atomic(i915_in_reset(pdev), 50); 177 178 /* Clear the reset request. */ 179 pci_write_config_byte(pdev, I915_GDRST, 0); 180 udelay(50); 181 if (!err) 182 err = wait_for_atomic(!i915_in_reset(pdev), 50); 183 184 return err; 185 } 186 187 static bool g4x_reset_complete(struct pci_dev *pdev) 188 { 189 u8 gdrst; 190 191 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 192 return (gdrst & GRDOM_RESET_ENABLE) == 0; 193 } 194 195 static int g33_do_reset(struct intel_gt *gt, 196 intel_engine_mask_t engine_mask, 197 unsigned int retry) 198 { 199 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 200 201 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 202 return wait_for_atomic(g4x_reset_complete(pdev), 50); 203 } 204 205 static int g4x_do_reset(struct intel_gt *gt, 206 intel_engine_mask_t engine_mask, 207 unsigned int retry) 208 { 209 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 210 struct intel_uncore *uncore = gt->uncore; 211 int ret; 212 213 /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 214 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 215 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 216 217 pci_write_config_byte(pdev, I915_GDRST, 218 GRDOM_MEDIA | GRDOM_RESET_ENABLE); 219 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 220 if (ret) { 221 GT_TRACE(gt, "Wait for media reset failed\n"); 222 goto out; 223 } 224 225 pci_write_config_byte(pdev, I915_GDRST, 226 GRDOM_RENDER | GRDOM_RESET_ENABLE); 227 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 228 if (ret) { 229 GT_TRACE(gt, "Wait for render reset failed\n"); 230 goto out; 231 } 232 233 out: 234 pci_write_config_byte(pdev, I915_GDRST, 0); 235 236 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 237 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 238 239 return ret; 240 } 241 242 static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 243 unsigned int retry) 244 { 245 struct intel_uncore *uncore = gt->uncore; 246 int ret; 247 248 intel_uncore_write_fw(uncore, ILK_GDSR, 249 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 250 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 251 ILK_GRDOM_RESET_ENABLE, 0, 252 5000, 0, 253 NULL); 254 if (ret) { 255 GT_TRACE(gt, "Wait for render reset failed\n"); 256 goto out; 257 } 258 259 intel_uncore_write_fw(uncore, ILK_GDSR, 260 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 261 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 262 ILK_GRDOM_RESET_ENABLE, 0, 263 5000, 0, 264 NULL); 265 if (ret) { 266 GT_TRACE(gt, "Wait for media reset failed\n"); 267 goto out; 268 } 269 270 out: 271 intel_uncore_write_fw(uncore, ILK_GDSR, 0); 272 intel_uncore_posting_read_fw(uncore, ILK_GDSR); 273 return ret; 274 } 275 276 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 277 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) 278 { 279 struct intel_uncore *uncore = gt->uncore; 280 int err; 281 282 /* 283 * GEN6_GDRST is not in the gt power well, no need to check 284 * for fifo space for the write or forcewake the chip for 285 * the read 286 */ 287 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 288 289 /* Wait for the device to ack the reset requests */ 290 err = __intel_wait_for_register_fw(uncore, 291 GEN6_GDRST, hw_domain_mask, 0, 292 500, 0, 293 NULL); 294 if (err) 295 GT_TRACE(gt, 296 "Wait for 0x%08x engines reset failed\n", 297 hw_domain_mask); 298 299 return err; 300 } 301 302 static int gen6_reset_engines(struct intel_gt *gt, 303 intel_engine_mask_t engine_mask, 304 unsigned int retry) 305 { 306 struct intel_engine_cs *engine; 307 u32 hw_mask; 308 309 if (engine_mask == ALL_ENGINES) { 310 hw_mask = GEN6_GRDOM_FULL; 311 } else { 312 intel_engine_mask_t tmp; 313 314 hw_mask = 0; 315 for_each_engine_masked(engine, gt, engine_mask, tmp) { 316 hw_mask |= engine->reset_domain; 317 } 318 } 319 320 return gen6_hw_domain_reset(gt, hw_mask); 321 } 322 323 static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine) 324 { 325 int vecs_id; 326 327 GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS); 328 329 vecs_id = _VECS((engine->instance) / 2); 330 331 return engine->gt->engine[vecs_id]; 332 } 333 334 struct sfc_lock_data { 335 i915_reg_t lock_reg; 336 i915_reg_t ack_reg; 337 i915_reg_t usage_reg; 338 u32 lock_bit; 339 u32 ack_bit; 340 u32 usage_bit; 341 u32 reset_bit; 342 }; 343 344 static void get_sfc_forced_lock_data(struct intel_engine_cs *engine, 345 struct sfc_lock_data *sfc_lock) 346 { 347 switch (engine->class) { 348 default: 349 MISSING_CASE(engine->class); 350 fallthrough; 351 case VIDEO_DECODE_CLASS: 352 sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base); 353 sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 354 355 sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); 356 sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 357 358 sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); 359 sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT; 360 sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 361 362 break; 363 case VIDEO_ENHANCEMENT_CLASS: 364 sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base); 365 sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 366 367 sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base); 368 sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 369 370 sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base); 371 sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT; 372 sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 373 374 break; 375 } 376 } 377 378 static int gen11_lock_sfc(struct intel_engine_cs *engine, 379 u32 *reset_mask, 380 u32 *unlock_mask) 381 { 382 struct intel_uncore *uncore = engine->uncore; 383 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 384 struct sfc_lock_data sfc_lock; 385 bool lock_obtained, lock_to_other = false; 386 int ret; 387 388 switch (engine->class) { 389 case VIDEO_DECODE_CLASS: 390 if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 391 return 0; 392 393 fallthrough; 394 case VIDEO_ENHANCEMENT_CLASS: 395 get_sfc_forced_lock_data(engine, &sfc_lock); 396 397 break; 398 default: 399 return 0; 400 } 401 402 if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) { 403 struct intel_engine_cs *paired_vecs; 404 405 if (engine->class != VIDEO_DECODE_CLASS || 406 GRAPHICS_VER(engine->i915) != 12) 407 return 0; 408 409 /* 410 * Wa_14010733141 411 * 412 * If the VCS-MFX isn't using the SFC, we also need to check 413 * whether VCS-HCP is using it. If so, we need to issue a *VE* 414 * forced lock on the VE engine that shares the same SFC. 415 */ 416 if (!(intel_uncore_read_fw(uncore, 417 GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) & 418 GEN12_HCP_SFC_USAGE_BIT)) 419 return 0; 420 421 paired_vecs = find_sfc_paired_vecs_engine(engine); 422 get_sfc_forced_lock_data(paired_vecs, &sfc_lock); 423 lock_to_other = true; 424 *unlock_mask |= paired_vecs->mask; 425 } else { 426 *unlock_mask |= engine->mask; 427 } 428 429 /* 430 * If the engine is using an SFC, tell the engine that a software reset 431 * is going to happen. The engine will then try to force lock the SFC. 432 * If SFC ends up being locked to the engine we want to reset, we have 433 * to reset it as well (we will unlock it once the reset sequence is 434 * completed). 435 */ 436 rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit); 437 438 ret = __intel_wait_for_register_fw(uncore, 439 sfc_lock.ack_reg, 440 sfc_lock.ack_bit, 441 sfc_lock.ack_bit, 442 1000, 0, NULL); 443 444 /* 445 * Was the SFC released while we were trying to lock it? 446 * 447 * We should reset both the engine and the SFC if: 448 * - We were locking the SFC to this engine and the lock succeeded 449 * OR 450 * - We were locking the SFC to a different engine (Wa_14010733141) 451 * but the SFC was released before the lock was obtained. 452 * 453 * Otherwise we need only reset the engine by itself and we can 454 * leave the SFC alone. 455 */ 456 lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & 457 sfc_lock.usage_bit) != 0; 458 if (lock_obtained == lock_to_other) 459 return 0; 460 461 if (ret) { 462 ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n"); 463 return ret; 464 } 465 466 *reset_mask |= sfc_lock.reset_bit; 467 return 0; 468 } 469 470 static void gen11_unlock_sfc(struct intel_engine_cs *engine) 471 { 472 struct intel_uncore *uncore = engine->uncore; 473 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 474 struct sfc_lock_data sfc_lock = {}; 475 476 if (engine->class != VIDEO_DECODE_CLASS && 477 engine->class != VIDEO_ENHANCEMENT_CLASS) 478 return; 479 480 if (engine->class == VIDEO_DECODE_CLASS && 481 (BIT(engine->instance) & vdbox_sfc_access) == 0) 482 return; 483 484 get_sfc_forced_lock_data(engine, &sfc_lock); 485 486 rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit); 487 } 488 489 static int gen11_reset_engines(struct intel_gt *gt, 490 intel_engine_mask_t engine_mask, 491 unsigned int retry) 492 { 493 struct intel_engine_cs *engine; 494 intel_engine_mask_t tmp; 495 u32 reset_mask, unlock_mask = 0; 496 int ret; 497 498 if (engine_mask == ALL_ENGINES) { 499 reset_mask = GEN11_GRDOM_FULL; 500 } else { 501 reset_mask = 0; 502 for_each_engine_masked(engine, gt, engine_mask, tmp) { 503 reset_mask |= engine->reset_domain; 504 ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask); 505 if (ret) 506 goto sfc_unlock; 507 } 508 } 509 510 ret = gen6_hw_domain_reset(gt, reset_mask); 511 512 sfc_unlock: 513 /* 514 * We unlock the SFC based on the lock status and not the result of 515 * gen11_lock_sfc to make sure that we clean properly if something 516 * wrong happened during the lock (e.g. lock acquired after timeout 517 * expiration). 518 * 519 * Due to Wa_14010733141, we may have locked an SFC to an engine that 520 * wasn't being reset. So instead of calling gen11_unlock_sfc() 521 * on engine_mask, we instead call it on the mask of engines that our 522 * gen11_lock_sfc() calls told us actually had locks attempted. 523 */ 524 for_each_engine_masked(engine, gt, unlock_mask, tmp) 525 gen11_unlock_sfc(engine); 526 527 return ret; 528 } 529 530 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 531 { 532 struct intel_uncore *uncore = engine->uncore; 533 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 534 u32 request, mask, ack; 535 int ret; 536 537 if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1))) 538 return -ETIMEDOUT; 539 540 ack = intel_uncore_read_fw(uncore, reg); 541 if (ack & RESET_CTL_CAT_ERROR) { 542 /* 543 * For catastrophic errors, ready-for-reset sequence 544 * needs to be bypassed: HAS#396813 545 */ 546 request = RESET_CTL_CAT_ERROR; 547 mask = RESET_CTL_CAT_ERROR; 548 549 /* Catastrophic errors need to be cleared by HW */ 550 ack = 0; 551 } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 552 request = RESET_CTL_REQUEST_RESET; 553 mask = RESET_CTL_READY_TO_RESET; 554 ack = RESET_CTL_READY_TO_RESET; 555 } else { 556 return 0; 557 } 558 559 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 560 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 561 700, 0, NULL); 562 if (ret) 563 drm_err(&engine->i915->drm, 564 "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 565 engine->name, request, 566 intel_uncore_read_fw(uncore, reg)); 567 568 return ret; 569 } 570 571 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 572 { 573 intel_uncore_write_fw(engine->uncore, 574 RING_RESET_CTL(engine->mmio_base), 575 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 576 } 577 578 static int gen8_reset_engines(struct intel_gt *gt, 579 intel_engine_mask_t engine_mask, 580 unsigned int retry) 581 { 582 struct intel_engine_cs *engine; 583 const bool reset_non_ready = retry >= 1; 584 intel_engine_mask_t tmp; 585 int ret; 586 587 for_each_engine_masked(engine, gt, engine_mask, tmp) { 588 ret = gen8_engine_reset_prepare(engine); 589 if (ret && !reset_non_ready) 590 goto skip_reset; 591 592 /* 593 * If this is not the first failed attempt to prepare, 594 * we decide to proceed anyway. 595 * 596 * By doing so we risk context corruption and with 597 * some gens (kbl), possible system hang if reset 598 * happens during active bb execution. 599 * 600 * We rather take context corruption instead of 601 * failed reset with a wedged driver/gpu. And 602 * active bb execution case should be covered by 603 * stop_engines() we have before the reset. 604 */ 605 } 606 607 /* 608 * Wa_22011100796:dg2, whenever Full soft reset is required, 609 * reset all individual engines firstly, and then do a full soft reset. 610 * 611 * This is best effort, so ignore any error from the initial reset. 612 */ 613 if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES) 614 gen11_reset_engines(gt, gt->info.engine_mask, 0); 615 616 if (GRAPHICS_VER(gt->i915) >= 11) 617 ret = gen11_reset_engines(gt, engine_mask, retry); 618 else 619 ret = gen6_reset_engines(gt, engine_mask, retry); 620 621 skip_reset: 622 for_each_engine_masked(engine, gt, engine_mask, tmp) 623 gen8_engine_reset_cancel(engine); 624 625 return ret; 626 } 627 628 static int mock_reset(struct intel_gt *gt, 629 intel_engine_mask_t mask, 630 unsigned int retry) 631 { 632 return 0; 633 } 634 635 typedef int (*reset_func)(struct intel_gt *, 636 intel_engine_mask_t engine_mask, 637 unsigned int retry); 638 639 static reset_func intel_get_gpu_reset(const struct intel_gt *gt) 640 { 641 struct drm_i915_private *i915 = gt->i915; 642 643 if (is_mock_gt(gt)) 644 return mock_reset; 645 else if (GRAPHICS_VER(i915) >= 8) 646 return gen8_reset_engines; 647 else if (GRAPHICS_VER(i915) >= 6) 648 return gen6_reset_engines; 649 else if (GRAPHICS_VER(i915) >= 5) 650 return ilk_do_reset; 651 else if (IS_G4X(i915)) 652 return g4x_do_reset; 653 else if (IS_G33(i915) || IS_PINEVIEW(i915)) 654 return g33_do_reset; 655 else if (GRAPHICS_VER(i915) >= 3) 656 return i915_do_reset; 657 else 658 return NULL; 659 } 660 661 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) 662 { 663 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 664 reset_func reset; 665 int ret = -ETIMEDOUT; 666 int retry; 667 668 reset = intel_get_gpu_reset(gt); 669 if (!reset) 670 return -ENODEV; 671 672 /* 673 * If the power well sleeps during the reset, the reset 674 * request may be dropped and never completes (causing -EIO). 675 */ 676 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 677 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 678 GT_TRACE(gt, "engine_mask=%x\n", engine_mask); 679 preempt_disable(); 680 ret = reset(gt, engine_mask, retry); 681 preempt_enable(); 682 } 683 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 684 685 return ret; 686 } 687 688 bool intel_has_gpu_reset(const struct intel_gt *gt) 689 { 690 if (!gt->i915->params.reset) 691 return NULL; 692 693 return intel_get_gpu_reset(gt); 694 } 695 696 bool intel_has_reset_engine(const struct intel_gt *gt) 697 { 698 if (gt->i915->params.reset < 2) 699 return false; 700 701 return INTEL_INFO(gt->i915)->has_reset_engine; 702 } 703 704 int intel_reset_guc(struct intel_gt *gt) 705 { 706 u32 guc_domain = 707 GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 708 int ret; 709 710 GEM_BUG_ON(!HAS_GT_UC(gt->i915)); 711 712 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 713 ret = gen6_hw_domain_reset(gt, guc_domain); 714 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 715 716 return ret; 717 } 718 719 /* 720 * Ensure irq handler finishes, and not run again. 721 * Also return the active request so that we only search for it once. 722 */ 723 static void reset_prepare_engine(struct intel_engine_cs *engine) 724 { 725 /* 726 * During the reset sequence, we must prevent the engine from 727 * entering RC6. As the context state is undefined until we restart 728 * the engine, if it does enter RC6 during the reset, the state 729 * written to the powercontext is undefined and so we may lose 730 * GPU state upon resume, i.e. fail to restart after a reset. 731 */ 732 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 733 if (engine->reset.prepare) 734 engine->reset.prepare(engine); 735 } 736 737 static void revoke_mmaps(struct intel_gt *gt) 738 { 739 int i; 740 741 for (i = 0; i < gt->ggtt->num_fences; i++) { 742 struct drm_vma_offset_node *node; 743 struct i915_vma *vma; 744 u64 vma_offset; 745 746 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma); 747 if (!vma) 748 continue; 749 750 if (!i915_vma_has_userfault(vma)) 751 continue; 752 753 GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]); 754 755 if (!vma->mmo) 756 continue; 757 758 node = &vma->mmo->vma_node; 759 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 760 761 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, 762 drm_vma_node_offset_addr(node) + vma_offset, 763 vma->size, 764 1); 765 } 766 } 767 768 static intel_engine_mask_t reset_prepare(struct intel_gt *gt) 769 { 770 struct intel_engine_cs *engine; 771 intel_engine_mask_t awake = 0; 772 enum intel_engine_id id; 773 774 for_each_engine(engine, gt, id) { 775 if (intel_engine_pm_get_if_awake(engine)) 776 awake |= engine->mask; 777 reset_prepare_engine(engine); 778 } 779 780 intel_uc_reset_prepare(>->uc); 781 782 return awake; 783 } 784 785 static void gt_revoke(struct intel_gt *gt) 786 { 787 revoke_mmaps(gt); 788 } 789 790 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 791 { 792 struct intel_engine_cs *engine; 793 enum intel_engine_id id; 794 int err; 795 796 /* 797 * Everything depends on having the GTT running, so we need to start 798 * there. 799 */ 800 err = i915_ggtt_enable_hw(gt->i915); 801 if (err) 802 return err; 803 804 local_bh_disable(); 805 for_each_engine(engine, gt, id) 806 __intel_engine_reset(engine, stalled_mask & engine->mask); 807 local_bh_enable(); 808 809 intel_uc_reset(>->uc, true); 810 811 intel_ggtt_restore_fences(gt->ggtt); 812 813 return err; 814 } 815 816 static void reset_finish_engine(struct intel_engine_cs *engine) 817 { 818 if (engine->reset.finish) 819 engine->reset.finish(engine); 820 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 821 822 intel_engine_signal_breadcrumbs(engine); 823 } 824 825 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) 826 { 827 struct intel_engine_cs *engine; 828 enum intel_engine_id id; 829 830 for_each_engine(engine, gt, id) { 831 reset_finish_engine(engine); 832 if (awake & engine->mask) 833 intel_engine_pm_put(engine); 834 } 835 836 intel_uc_reset_finish(>->uc); 837 } 838 839 static void nop_submit_request(struct i915_request *request) 840 { 841 RQ_TRACE(request, "-EIO\n"); 842 843 request = i915_request_mark_eio(request); 844 if (request) { 845 i915_request_submit(request); 846 intel_engine_signal_breadcrumbs(request->engine); 847 848 i915_request_put(request); 849 } 850 } 851 852 static void __intel_gt_set_wedged(struct intel_gt *gt) 853 { 854 struct intel_engine_cs *engine; 855 intel_engine_mask_t awake; 856 enum intel_engine_id id; 857 858 if (test_bit(I915_WEDGED, >->reset.flags)) 859 return; 860 861 GT_TRACE(gt, "start\n"); 862 863 /* 864 * First, stop submission to hw, but do not yet complete requests by 865 * rolling the global seqno forward (since this would complete requests 866 * for which we haven't set the fence error to EIO yet). 867 */ 868 awake = reset_prepare(gt); 869 870 /* Even if the GPU reset fails, it should still stop the engines */ 871 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 872 __intel_gt_reset(gt, ALL_ENGINES); 873 874 for_each_engine(engine, gt, id) 875 engine->submit_request = nop_submit_request; 876 877 /* 878 * Make sure no request can slip through without getting completed by 879 * either this call here to intel_engine_write_global_seqno, or the one 880 * in nop_submit_request. 881 */ 882 synchronize_rcu_expedited(); 883 set_bit(I915_WEDGED, >->reset.flags); 884 885 /* Mark all executing requests as skipped */ 886 local_bh_disable(); 887 for_each_engine(engine, gt, id) 888 if (engine->reset.cancel) 889 engine->reset.cancel(engine); 890 intel_uc_cancel_requests(>->uc); 891 local_bh_enable(); 892 893 reset_finish(gt, awake); 894 895 GT_TRACE(gt, "end\n"); 896 } 897 898 void intel_gt_set_wedged(struct intel_gt *gt) 899 { 900 intel_wakeref_t wakeref; 901 902 if (test_bit(I915_WEDGED, >->reset.flags)) 903 return; 904 905 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 906 mutex_lock(>->reset.mutex); 907 908 if (GEM_SHOW_DEBUG()) { 909 struct drm_printer p = drm_debug_printer(__func__); 910 struct intel_engine_cs *engine; 911 enum intel_engine_id id; 912 913 drm_printf(&p, "called from %pS\n", (void *)_RET_IP_); 914 for_each_engine(engine, gt, id) { 915 if (intel_engine_is_idle(engine)) 916 continue; 917 918 intel_engine_dump(engine, &p, "%s\n", engine->name); 919 } 920 } 921 922 __intel_gt_set_wedged(gt); 923 924 mutex_unlock(>->reset.mutex); 925 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 926 } 927 928 static bool __intel_gt_unset_wedged(struct intel_gt *gt) 929 { 930 struct intel_gt_timelines *timelines = >->timelines; 931 struct intel_timeline *tl; 932 bool ok; 933 934 if (!test_bit(I915_WEDGED, >->reset.flags)) 935 return true; 936 937 /* Never fully initialised, recovery impossible */ 938 if (intel_gt_has_unrecoverable_error(gt)) 939 return false; 940 941 GT_TRACE(gt, "start\n"); 942 943 /* 944 * Before unwedging, make sure that all pending operations 945 * are flushed and errored out - we may have requests waiting upon 946 * third party fences. We marked all inflight requests as EIO, and 947 * every execbuf since returned EIO, for consistency we want all 948 * the currently pending requests to also be marked as EIO, which 949 * is done inside our nop_submit_request - and so we must wait. 950 * 951 * No more can be submitted until we reset the wedged bit. 952 */ 953 spin_lock(&timelines->lock); 954 list_for_each_entry(tl, &timelines->active_list, link) { 955 struct dma_fence *fence; 956 957 fence = i915_active_fence_get(&tl->last_request); 958 if (!fence) 959 continue; 960 961 spin_unlock(&timelines->lock); 962 963 /* 964 * All internal dependencies (i915_requests) will have 965 * been flushed by the set-wedge, but we may be stuck waiting 966 * for external fences. These should all be capped to 10s 967 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 968 * in the worst case. 969 */ 970 dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT); 971 dma_fence_put(fence); 972 973 /* Restart iteration after droping lock */ 974 spin_lock(&timelines->lock); 975 tl = list_entry(&timelines->active_list, typeof(*tl), link); 976 } 977 spin_unlock(&timelines->lock); 978 979 /* We must reset pending GPU events before restoring our submission */ 980 ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ 981 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 982 ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; 983 if (!ok) { 984 /* 985 * Warn CI about the unrecoverable wedged condition. 986 * Time for a reboot. 987 */ 988 add_taint_for_CI(gt->i915, TAINT_WARN); 989 return false; 990 } 991 992 /* 993 * Undo nop_submit_request. We prevent all new i915 requests from 994 * being queued (by disallowing execbuf whilst wedged) so having 995 * waited for all active requests above, we know the system is idle 996 * and do not have to worry about a thread being inside 997 * engine->submit_request() as we swap over. So unlike installing 998 * the nop_submit_request on reset, we can do this from normal 999 * context and do not require stop_machine(). 1000 */ 1001 intel_engines_reset_default_submission(gt); 1002 1003 GT_TRACE(gt, "end\n"); 1004 1005 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 1006 clear_bit(I915_WEDGED, >->reset.flags); 1007 1008 return true; 1009 } 1010 1011 bool intel_gt_unset_wedged(struct intel_gt *gt) 1012 { 1013 bool result; 1014 1015 mutex_lock(>->reset.mutex); 1016 result = __intel_gt_unset_wedged(gt); 1017 mutex_unlock(>->reset.mutex); 1018 1019 return result; 1020 } 1021 1022 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 1023 { 1024 int err, i; 1025 1026 err = __intel_gt_reset(gt, ALL_ENGINES); 1027 for (i = 0; err && i < RESET_MAX_RETRIES; i++) { 1028 msleep(10 * (i + 1)); 1029 err = __intel_gt_reset(gt, ALL_ENGINES); 1030 } 1031 if (err) 1032 return err; 1033 1034 return gt_reset(gt, stalled_mask); 1035 } 1036 1037 static int resume(struct intel_gt *gt) 1038 { 1039 struct intel_engine_cs *engine; 1040 enum intel_engine_id id; 1041 int ret; 1042 1043 for_each_engine(engine, gt, id) { 1044 ret = intel_engine_resume(engine); 1045 if (ret) 1046 return ret; 1047 } 1048 1049 return 0; 1050 } 1051 1052 /** 1053 * intel_gt_reset - reset chip after a hang 1054 * @gt: #intel_gt to reset 1055 * @stalled_mask: mask of the stalled engines with the guilty requests 1056 * @reason: user error message for why we are resetting 1057 * 1058 * Reset the chip. Useful if a hang is detected. Marks the device as wedged 1059 * on failure. 1060 * 1061 * Procedure is fairly simple: 1062 * - reset the chip using the reset reg 1063 * - re-init context state 1064 * - re-init hardware status page 1065 * - re-init ring buffer 1066 * - re-init interrupt state 1067 * - re-init display 1068 */ 1069 void intel_gt_reset(struct intel_gt *gt, 1070 intel_engine_mask_t stalled_mask, 1071 const char *reason) 1072 { 1073 intel_engine_mask_t awake; 1074 int ret; 1075 1076 GT_TRACE(gt, "flags=%lx\n", gt->reset.flags); 1077 1078 might_sleep(); 1079 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1080 1081 /* 1082 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence 1083 * critical section like gpu reset. 1084 */ 1085 gt_revoke(gt); 1086 1087 mutex_lock(>->reset.mutex); 1088 1089 /* Clear any previous failed attempts at recovery. Time to try again. */ 1090 if (!__intel_gt_unset_wedged(gt)) 1091 goto unlock; 1092 1093 if (reason) 1094 drm_notice(>->i915->drm, 1095 "Resetting chip for %s\n", reason); 1096 atomic_inc(>->i915->gpu_error.reset_count); 1097 1098 awake = reset_prepare(gt); 1099 1100 if (!intel_has_gpu_reset(gt)) { 1101 if (gt->i915->params.reset) 1102 drm_err(>->i915->drm, "GPU reset not supported\n"); 1103 else 1104 drm_dbg(>->i915->drm, "GPU reset disabled\n"); 1105 goto error; 1106 } 1107 1108 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1109 intel_runtime_pm_disable_interrupts(gt->i915); 1110 1111 if (do_reset(gt, stalled_mask)) { 1112 drm_err(>->i915->drm, "Failed to reset chip\n"); 1113 goto taint; 1114 } 1115 1116 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1117 intel_runtime_pm_enable_interrupts(gt->i915); 1118 1119 intel_overlay_reset(gt->i915); 1120 1121 /* 1122 * Next we need to restore the context, but we don't use those 1123 * yet either... 1124 * 1125 * Ring buffer needs to be re-initialized in the KMS case, or if X 1126 * was running at the time of the reset (i.e. we weren't VT 1127 * switched away). 1128 */ 1129 ret = intel_gt_init_hw(gt); 1130 if (ret) { 1131 drm_err(>->i915->drm, 1132 "Failed to initialise HW following reset (%d)\n", 1133 ret); 1134 goto taint; 1135 } 1136 1137 ret = resume(gt); 1138 if (ret) 1139 goto taint; 1140 1141 finish: 1142 reset_finish(gt, awake); 1143 unlock: 1144 mutex_unlock(>->reset.mutex); 1145 return; 1146 1147 taint: 1148 /* 1149 * History tells us that if we cannot reset the GPU now, we 1150 * never will. This then impacts everything that is run 1151 * subsequently. On failing the reset, we mark the driver 1152 * as wedged, preventing further execution on the GPU. 1153 * We also want to go one step further and add a taint to the 1154 * kernel so that any subsequent faults can be traced back to 1155 * this failure. This is important for CI, where if the 1156 * GPU/driver fails we would like to reboot and restart testing 1157 * rather than continue on into oblivion. For everyone else, 1158 * the system should still plod along, but they have been warned! 1159 */ 1160 add_taint_for_CI(gt->i915, TAINT_WARN); 1161 error: 1162 __intel_gt_set_wedged(gt); 1163 goto finish; 1164 } 1165 1166 static int intel_gt_reset_engine(struct intel_engine_cs *engine) 1167 { 1168 return __intel_gt_reset(engine->gt, engine->mask); 1169 } 1170 1171 int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) 1172 { 1173 struct intel_gt *gt = engine->gt; 1174 int ret; 1175 1176 ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags); 1177 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)); 1178 1179 if (intel_engine_uses_guc(engine)) 1180 return -ENODEV; 1181 1182 if (!intel_engine_pm_get_if_awake(engine)) 1183 return 0; 1184 1185 reset_prepare_engine(engine); 1186 1187 if (msg) 1188 drm_notice(&engine->i915->drm, 1189 "Resetting %s for %s\n", engine->name, msg); 1190 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]); 1191 1192 ret = intel_gt_reset_engine(engine); 1193 if (ret) { 1194 /* If we fail here, we expect to fallback to a global reset */ 1195 ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret); 1196 goto out; 1197 } 1198 1199 /* 1200 * The request that caused the hang is stuck on elsp, we know the 1201 * active request and can drop it, adjust head to skip the offending 1202 * request to resume executing remaining requests in the queue. 1203 */ 1204 __intel_engine_reset(engine, true); 1205 1206 /* 1207 * The engine and its registers (and workarounds in case of render) 1208 * have been reset to their default values. Follow the init_ring 1209 * process to program RING_MODE, HWSP and re-enable submission. 1210 */ 1211 ret = intel_engine_resume(engine); 1212 1213 out: 1214 intel_engine_cancel_stop_cs(engine); 1215 reset_finish_engine(engine); 1216 intel_engine_pm_put_async(engine); 1217 return ret; 1218 } 1219 1220 /** 1221 * intel_engine_reset - reset GPU engine to recover from a hang 1222 * @engine: engine to reset 1223 * @msg: reason for GPU reset; or NULL for no drm_notice() 1224 * 1225 * Reset a specific GPU engine. Useful if a hang is detected. 1226 * Returns zero on successful reset or otherwise an error code. 1227 * 1228 * Procedure is: 1229 * - identifies the request that caused the hang and it is dropped 1230 * - reset engine (which will force the engine to idle) 1231 * - re-init/configure engine 1232 */ 1233 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) 1234 { 1235 int err; 1236 1237 local_bh_disable(); 1238 err = __intel_engine_reset_bh(engine, msg); 1239 local_bh_enable(); 1240 1241 return err; 1242 } 1243 1244 static void intel_gt_reset_global(struct intel_gt *gt, 1245 u32 engine_mask, 1246 const char *reason) 1247 { 1248 struct kobject *kobj = >->i915->drm.primary->kdev->kobj; 1249 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1250 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1251 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1252 struct intel_wedge_me w; 1253 1254 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1255 1256 GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask); 1257 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1258 1259 /* Use a watchdog to ensure that our reset completes */ 1260 intel_wedge_on_timeout(&w, gt, 5 * HZ) { 1261 intel_display_prepare_reset(gt->i915); 1262 1263 /* Flush everyone using a resource about to be clobbered */ 1264 synchronize_srcu_expedited(>->reset.backoff_srcu); 1265 1266 intel_gt_reset(gt, engine_mask, reason); 1267 1268 intel_display_finish_reset(gt->i915); 1269 } 1270 1271 if (!test_bit(I915_WEDGED, >->reset.flags)) 1272 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1273 } 1274 1275 /** 1276 * intel_gt_handle_error - handle a gpu error 1277 * @gt: the intel_gt 1278 * @engine_mask: mask representing engines that are hung 1279 * @flags: control flags 1280 * @fmt: Error message format string 1281 * 1282 * Do some basic checking of register state at error time and 1283 * dump it to the syslog. Also call i915_capture_error_state() to make 1284 * sure we get a record and make it available in debugfs. Fire a uevent 1285 * so userspace knows something bad happened (should trigger collection 1286 * of a ring dump etc.). 1287 */ 1288 void intel_gt_handle_error(struct intel_gt *gt, 1289 intel_engine_mask_t engine_mask, 1290 unsigned long flags, 1291 const char *fmt, ...) 1292 { 1293 struct intel_engine_cs *engine; 1294 intel_wakeref_t wakeref; 1295 intel_engine_mask_t tmp; 1296 char error_msg[80]; 1297 char *msg = NULL; 1298 1299 if (fmt) { 1300 va_list args; 1301 1302 va_start(args, fmt); 1303 vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1304 va_end(args); 1305 1306 msg = error_msg; 1307 } 1308 1309 /* 1310 * In most cases it's guaranteed that we get here with an RPM 1311 * reference held, for example because there is a pending GPU 1312 * request that won't finish until the reset is done. This 1313 * isn't the case at least when we get here by doing a 1314 * simulated reset via debugfs, so get an RPM reference. 1315 */ 1316 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1317 1318 engine_mask &= gt->info.engine_mask; 1319 1320 if (flags & I915_ERROR_CAPTURE) { 1321 i915_capture_error_state(gt, engine_mask); 1322 intel_gt_clear_error_registers(gt, engine_mask); 1323 } 1324 1325 /* 1326 * Try engine reset when available. We fall back to full reset if 1327 * single reset fails. 1328 */ 1329 if (!intel_uc_uses_guc_submission(>->uc) && 1330 intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { 1331 local_bh_disable(); 1332 for_each_engine_masked(engine, gt, engine_mask, tmp) { 1333 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); 1334 if (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1335 >->reset.flags)) 1336 continue; 1337 1338 if (__intel_engine_reset_bh(engine, msg) == 0) 1339 engine_mask &= ~engine->mask; 1340 1341 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, 1342 >->reset.flags); 1343 } 1344 local_bh_enable(); 1345 } 1346 1347 if (!engine_mask) 1348 goto out; 1349 1350 /* Full reset needs the mutex, stop any other user trying to do so. */ 1351 if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1352 wait_event(gt->reset.queue, 1353 !test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1354 goto out; /* piggy-back on the other reset */ 1355 } 1356 1357 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 1358 synchronize_rcu_expedited(); 1359 1360 /* 1361 * Prevent any other reset-engine attempt. We don't do this for GuC 1362 * submission the GuC owns the per-engine reset, not the i915. 1363 */ 1364 if (!intel_uc_uses_guc_submission(>->uc)) { 1365 for_each_engine(engine, gt, tmp) { 1366 while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1367 >->reset.flags)) 1368 wait_on_bit(>->reset.flags, 1369 I915_RESET_ENGINE + engine->id, 1370 TASK_UNINTERRUPTIBLE); 1371 } 1372 } 1373 1374 intel_gt_reset_global(gt, engine_mask, msg); 1375 1376 if (!intel_uc_uses_guc_submission(>->uc)) { 1377 for_each_engine(engine, gt, tmp) 1378 clear_bit_unlock(I915_RESET_ENGINE + engine->id, 1379 >->reset.flags); 1380 } 1381 clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); 1382 smp_mb__after_atomic(); 1383 wake_up_all(>->reset.queue); 1384 1385 out: 1386 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1387 } 1388 1389 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) 1390 { 1391 might_lock(>->reset.backoff_srcu); 1392 might_sleep(); 1393 1394 rcu_read_lock(); 1395 while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1396 rcu_read_unlock(); 1397 1398 if (wait_event_interruptible(gt->reset.queue, 1399 !test_bit(I915_RESET_BACKOFF, 1400 >->reset.flags))) 1401 return -EINTR; 1402 1403 rcu_read_lock(); 1404 } 1405 *srcu = srcu_read_lock(>->reset.backoff_srcu); 1406 rcu_read_unlock(); 1407 1408 return 0; 1409 } 1410 1411 void intel_gt_reset_unlock(struct intel_gt *gt, int tag) 1412 __releases(>->reset.backoff_srcu) 1413 { 1414 srcu_read_unlock(>->reset.backoff_srcu, tag); 1415 } 1416 1417 int intel_gt_terminally_wedged(struct intel_gt *gt) 1418 { 1419 might_sleep(); 1420 1421 if (!intel_gt_is_wedged(gt)) 1422 return 0; 1423 1424 if (intel_gt_has_unrecoverable_error(gt)) 1425 return -EIO; 1426 1427 /* Reset still in progress? Maybe we will recover? */ 1428 if (wait_event_interruptible(gt->reset.queue, 1429 !test_bit(I915_RESET_BACKOFF, 1430 >->reset.flags))) 1431 return -EINTR; 1432 1433 return intel_gt_is_wedged(gt) ? -EIO : 0; 1434 } 1435 1436 void intel_gt_set_wedged_on_init(struct intel_gt *gt) 1437 { 1438 BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > 1439 I915_WEDGED_ON_INIT); 1440 intel_gt_set_wedged(gt); 1441 i915_disable_error_state(gt->i915, -ENODEV); 1442 set_bit(I915_WEDGED_ON_INIT, >->reset.flags); 1443 1444 /* Wedged on init is non-recoverable */ 1445 add_taint_for_CI(gt->i915, TAINT_WARN); 1446 } 1447 1448 void intel_gt_set_wedged_on_fini(struct intel_gt *gt) 1449 { 1450 intel_gt_set_wedged(gt); 1451 i915_disable_error_state(gt->i915, -ENODEV); 1452 set_bit(I915_WEDGED_ON_FINI, >->reset.flags); 1453 intel_gt_retire_requests(gt); /* cleanup any wedged requests */ 1454 } 1455 1456 void intel_gt_init_reset(struct intel_gt *gt) 1457 { 1458 init_waitqueue_head(>->reset.queue); 1459 mutex_init(>->reset.mutex); 1460 init_srcu_struct(>->reset.backoff_srcu); 1461 1462 /* 1463 * While undesirable to wait inside the shrinker, complain anyway. 1464 * 1465 * If we have to wait during shrinking, we guarantee forward progress 1466 * by forcing the reset. Therefore during the reset we must not 1467 * re-enter the shrinker. By declaring that we take the reset mutex 1468 * within the shrinker, we forbid ourselves from performing any 1469 * fs-reclaim or taking related locks during reset. 1470 */ 1471 i915_gem_shrinker_taints_mutex(gt->i915, >->reset.mutex); 1472 1473 /* no GPU until we are ready! */ 1474 __set_bit(I915_WEDGED, >->reset.flags); 1475 } 1476 1477 void intel_gt_fini_reset(struct intel_gt *gt) 1478 { 1479 cleanup_srcu_struct(>->reset.backoff_srcu); 1480 } 1481 1482 static void intel_wedge_me(struct work_struct *work) 1483 { 1484 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); 1485 1486 drm_err(&w->gt->i915->drm, 1487 "%s timed out, cancelling all in-flight rendering.\n", 1488 w->name); 1489 intel_gt_set_wedged(w->gt); 1490 } 1491 1492 void __intel_init_wedge(struct intel_wedge_me *w, 1493 struct intel_gt *gt, 1494 long timeout, 1495 const char *name) 1496 { 1497 w->gt = gt; 1498 w->name = name; 1499 1500 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); 1501 schedule_delayed_work(&w->work, timeout); 1502 } 1503 1504 void __intel_fini_wedge(struct intel_wedge_me *w) 1505 { 1506 cancel_delayed_work_sync(&w->work); 1507 destroy_delayed_work_on_stack(&w->work); 1508 w->gt = NULL; 1509 } 1510 1511 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1512 #include "selftest_reset.c" 1513 #include "selftest_hangcheck.c" 1514 #endif 1515