1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2008-2018 Intel Corporation 4 */ 5 6 #include <linux/sched/mm.h> 7 #include <linux/stop_machine.h> 8 #include <linux/string_helpers.h> 9 10 #include "display/intel_display.h" 11 #include "display/intel_overlay.h" 12 13 #include "gem/i915_gem_context.h" 14 15 #include "gt/intel_gt_regs.h" 16 17 #include "gt/uc/intel_gsc_fw.h" 18 19 #include "i915_drv.h" 20 #include "i915_file_private.h" 21 #include "i915_gpu_error.h" 22 #include "i915_irq.h" 23 #include "intel_breadcrumbs.h" 24 #include "intel_engine_pm.h" 25 #include "intel_engine_regs.h" 26 #include "intel_gt.h" 27 #include "intel_gt_pm.h" 28 #include "intel_gt_requests.h" 29 #include "intel_mchbar_regs.h" 30 #include "intel_pci_config.h" 31 #include "intel_reset.h" 32 33 #include "uc/intel_guc.h" 34 35 #define RESET_MAX_RETRIES 3 36 37 /* XXX How to handle concurrent GGTT updates using tiling registers? */ 38 #define RESET_UNDER_STOP_MACHINE 0 39 40 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) 41 { 42 struct drm_i915_file_private *file_priv = ctx->file_priv; 43 unsigned long prev_hang; 44 unsigned int score; 45 46 if (IS_ERR_OR_NULL(file_priv)) 47 return; 48 49 score = 0; 50 if (banned) 51 score = I915_CLIENT_SCORE_CONTEXT_BAN; 52 53 prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 54 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 55 score += I915_CLIENT_SCORE_HANG_FAST; 56 57 if (score) { 58 atomic_add(score, &file_priv->ban_score); 59 60 drm_dbg(&ctx->i915->drm, 61 "client %s: gained %u ban score, now %u\n", 62 ctx->name, score, 63 atomic_read(&file_priv->ban_score)); 64 } 65 } 66 67 static bool mark_guilty(struct i915_request *rq) 68 { 69 struct i915_gem_context *ctx; 70 unsigned long prev_hang; 71 bool banned; 72 int i; 73 74 if (intel_context_is_closed(rq->context)) 75 return true; 76 77 rcu_read_lock(); 78 ctx = rcu_dereference(rq->context->gem_context); 79 if (ctx && !kref_get_unless_zero(&ctx->ref)) 80 ctx = NULL; 81 rcu_read_unlock(); 82 if (!ctx) 83 return intel_context_is_banned(rq->context); 84 85 atomic_inc(&ctx->guilty_count); 86 87 /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 88 if (!i915_gem_context_is_bannable(ctx)) { 89 banned = false; 90 goto out; 91 } 92 93 drm_notice(&ctx->i915->drm, 94 "%s context reset due to GPU hang\n", 95 ctx->name); 96 97 /* Record the timestamp for the last N hangs */ 98 prev_hang = ctx->hang_timestamp[0]; 99 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 100 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 101 ctx->hang_timestamp[i] = jiffies; 102 103 /* If we have hung N+1 times in rapid succession, we ban the context! */ 104 banned = !i915_gem_context_is_recoverable(ctx); 105 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 106 banned = true; 107 if (banned) 108 drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", 109 ctx->name, atomic_read(&ctx->guilty_count)); 110 111 client_mark_guilty(ctx, banned); 112 113 out: 114 i915_gem_context_put(ctx); 115 return banned; 116 } 117 118 static void mark_innocent(struct i915_request *rq) 119 { 120 struct i915_gem_context *ctx; 121 122 rcu_read_lock(); 123 ctx = rcu_dereference(rq->context->gem_context); 124 if (ctx) 125 atomic_inc(&ctx->active_count); 126 rcu_read_unlock(); 127 } 128 129 void __i915_request_reset(struct i915_request *rq, bool guilty) 130 { 131 bool banned = false; 132 133 RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty)); 134 GEM_BUG_ON(__i915_request_is_complete(rq)); 135 136 rcu_read_lock(); /* protect the GEM context */ 137 if (guilty) { 138 i915_request_set_error_once(rq, -EIO); 139 __i915_request_skip(rq); 140 banned = mark_guilty(rq); 141 } else { 142 i915_request_set_error_once(rq, -EAGAIN); 143 mark_innocent(rq); 144 } 145 rcu_read_unlock(); 146 147 if (banned) 148 intel_context_ban(rq->context, rq); 149 } 150 151 static bool i915_in_reset(struct pci_dev *pdev) 152 { 153 u8 gdrst; 154 155 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 156 return gdrst & GRDOM_RESET_STATUS; 157 } 158 159 static int i915_do_reset(struct intel_gt *gt, 160 intel_engine_mask_t engine_mask, 161 unsigned int retry) 162 { 163 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 164 int err; 165 166 /* Assert reset for at least 20 usec, and wait for acknowledgement. */ 167 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 168 udelay(50); 169 err = wait_for_atomic(i915_in_reset(pdev), 50); 170 171 /* Clear the reset request. */ 172 pci_write_config_byte(pdev, I915_GDRST, 0); 173 udelay(50); 174 if (!err) 175 err = wait_for_atomic(!i915_in_reset(pdev), 50); 176 177 return err; 178 } 179 180 static bool g4x_reset_complete(struct pci_dev *pdev) 181 { 182 u8 gdrst; 183 184 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 185 return (gdrst & GRDOM_RESET_ENABLE) == 0; 186 } 187 188 static int g33_do_reset(struct intel_gt *gt, 189 intel_engine_mask_t engine_mask, 190 unsigned int retry) 191 { 192 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 193 194 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 195 return wait_for_atomic(g4x_reset_complete(pdev), 50); 196 } 197 198 static int g4x_do_reset(struct intel_gt *gt, 199 intel_engine_mask_t engine_mask, 200 unsigned int retry) 201 { 202 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 203 struct intel_uncore *uncore = gt->uncore; 204 int ret; 205 206 /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 207 intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, 0, VCP_UNIT_CLOCK_GATE_DISABLE); 208 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 209 210 pci_write_config_byte(pdev, I915_GDRST, 211 GRDOM_MEDIA | GRDOM_RESET_ENABLE); 212 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 213 if (ret) { 214 GT_TRACE(gt, "Wait for media reset failed\n"); 215 goto out; 216 } 217 218 pci_write_config_byte(pdev, I915_GDRST, 219 GRDOM_RENDER | GRDOM_RESET_ENABLE); 220 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 221 if (ret) { 222 GT_TRACE(gt, "Wait for render reset failed\n"); 223 goto out; 224 } 225 226 out: 227 pci_write_config_byte(pdev, I915_GDRST, 0); 228 229 intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE, 0); 230 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 231 232 return ret; 233 } 234 235 static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 236 unsigned int retry) 237 { 238 struct intel_uncore *uncore = gt->uncore; 239 int ret; 240 241 intel_uncore_write_fw(uncore, ILK_GDSR, 242 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 243 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 244 ILK_GRDOM_RESET_ENABLE, 0, 245 5000, 0, 246 NULL); 247 if (ret) { 248 GT_TRACE(gt, "Wait for render reset failed\n"); 249 goto out; 250 } 251 252 intel_uncore_write_fw(uncore, ILK_GDSR, 253 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 254 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 255 ILK_GRDOM_RESET_ENABLE, 0, 256 5000, 0, 257 NULL); 258 if (ret) { 259 GT_TRACE(gt, "Wait for media reset failed\n"); 260 goto out; 261 } 262 263 out: 264 intel_uncore_write_fw(uncore, ILK_GDSR, 0); 265 intel_uncore_posting_read_fw(uncore, ILK_GDSR); 266 return ret; 267 } 268 269 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 270 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) 271 { 272 struct intel_uncore *uncore = gt->uncore; 273 int loops; 274 int err; 275 276 /* 277 * On some platforms, e.g. Jasperlake, we see that the engine register 278 * state is not cleared until shortly after GDRST reports completion, 279 * causing a failure as we try to immediately resume while the internal 280 * state is still in flux. If we immediately repeat the reset, the 281 * second reset appears to serialise with the first, and since it is a 282 * no-op, the registers should retain their reset value. However, there 283 * is still a concern that upon leaving the second reset, the internal 284 * engine state is still in flux and not ready for resuming. 285 * 286 * Starting on MTL, there are some prep steps that we need to do when 287 * resetting some engines that need to be applied every time we write to 288 * GEN6_GDRST. As those are time consuming (tens of ms), we don't want 289 * to perform that twice, so, since the Jasperlake issue hasn't been 290 * observed on MTL, we avoid repeating the reset on newer platforms. 291 */ 292 loops = GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70) ? 2 : 1; 293 294 /* 295 * GEN6_GDRST is not in the gt power well, no need to check 296 * for fifo space for the write or forcewake the chip for 297 * the read 298 */ 299 do { 300 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 301 302 /* Wait for the device to ack the reset requests. */ 303 err = __intel_wait_for_register_fw(uncore, GEN6_GDRST, 304 hw_domain_mask, 0, 305 2000, 0, 306 NULL); 307 } while (err == 0 && --loops); 308 if (err) 309 GT_TRACE(gt, 310 "Wait for 0x%08x engines reset failed\n", 311 hw_domain_mask); 312 313 /* 314 * As we have observed that the engine state is still volatile 315 * after GDRST is acked, impose a small delay to let everything settle. 316 */ 317 udelay(50); 318 319 return err; 320 } 321 322 static int __gen6_reset_engines(struct intel_gt *gt, 323 intel_engine_mask_t engine_mask, 324 unsigned int retry) 325 { 326 struct intel_engine_cs *engine; 327 u32 hw_mask; 328 329 if (engine_mask == ALL_ENGINES) { 330 hw_mask = GEN6_GRDOM_FULL; 331 } else { 332 intel_engine_mask_t tmp; 333 334 hw_mask = 0; 335 for_each_engine_masked(engine, gt, engine_mask, tmp) { 336 hw_mask |= engine->reset_domain; 337 } 338 } 339 340 return gen6_hw_domain_reset(gt, hw_mask); 341 } 342 343 static int gen6_reset_engines(struct intel_gt *gt, 344 intel_engine_mask_t engine_mask, 345 unsigned int retry) 346 { 347 unsigned long flags; 348 int ret; 349 350 spin_lock_irqsave(>->uncore->lock, flags); 351 ret = __gen6_reset_engines(gt, engine_mask, retry); 352 spin_unlock_irqrestore(>->uncore->lock, flags); 353 354 return ret; 355 } 356 357 static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine) 358 { 359 int vecs_id; 360 361 GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS); 362 363 vecs_id = _VECS((engine->instance) / 2); 364 365 return engine->gt->engine[vecs_id]; 366 } 367 368 struct sfc_lock_data { 369 i915_reg_t lock_reg; 370 i915_reg_t ack_reg; 371 i915_reg_t usage_reg; 372 u32 lock_bit; 373 u32 ack_bit; 374 u32 usage_bit; 375 u32 reset_bit; 376 }; 377 378 static void get_sfc_forced_lock_data(struct intel_engine_cs *engine, 379 struct sfc_lock_data *sfc_lock) 380 { 381 switch (engine->class) { 382 default: 383 MISSING_CASE(engine->class); 384 fallthrough; 385 case VIDEO_DECODE_CLASS: 386 sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base); 387 sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 388 389 sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); 390 sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 391 392 sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); 393 sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT; 394 sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 395 396 break; 397 case VIDEO_ENHANCEMENT_CLASS: 398 sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base); 399 sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 400 401 sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base); 402 sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 403 404 sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base); 405 sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT; 406 sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 407 408 break; 409 } 410 } 411 412 static int gen11_lock_sfc(struct intel_engine_cs *engine, 413 u32 *reset_mask, 414 u32 *unlock_mask) 415 { 416 struct intel_uncore *uncore = engine->uncore; 417 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 418 struct sfc_lock_data sfc_lock; 419 bool lock_obtained, lock_to_other = false; 420 int ret; 421 422 switch (engine->class) { 423 case VIDEO_DECODE_CLASS: 424 if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 425 return 0; 426 427 fallthrough; 428 case VIDEO_ENHANCEMENT_CLASS: 429 get_sfc_forced_lock_data(engine, &sfc_lock); 430 431 break; 432 default: 433 return 0; 434 } 435 436 if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) { 437 struct intel_engine_cs *paired_vecs; 438 439 if (engine->class != VIDEO_DECODE_CLASS || 440 GRAPHICS_VER(engine->i915) != 12) 441 return 0; 442 443 /* 444 * Wa_14010733141 445 * 446 * If the VCS-MFX isn't using the SFC, we also need to check 447 * whether VCS-HCP is using it. If so, we need to issue a *VE* 448 * forced lock on the VE engine that shares the same SFC. 449 */ 450 if (!(intel_uncore_read_fw(uncore, 451 GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) & 452 GEN12_HCP_SFC_USAGE_BIT)) 453 return 0; 454 455 paired_vecs = find_sfc_paired_vecs_engine(engine); 456 get_sfc_forced_lock_data(paired_vecs, &sfc_lock); 457 lock_to_other = true; 458 *unlock_mask |= paired_vecs->mask; 459 } else { 460 *unlock_mask |= engine->mask; 461 } 462 463 /* 464 * If the engine is using an SFC, tell the engine that a software reset 465 * is going to happen. The engine will then try to force lock the SFC. 466 * If SFC ends up being locked to the engine we want to reset, we have 467 * to reset it as well (we will unlock it once the reset sequence is 468 * completed). 469 */ 470 intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, 0, sfc_lock.lock_bit); 471 472 ret = __intel_wait_for_register_fw(uncore, 473 sfc_lock.ack_reg, 474 sfc_lock.ack_bit, 475 sfc_lock.ack_bit, 476 1000, 0, NULL); 477 478 /* 479 * Was the SFC released while we were trying to lock it? 480 * 481 * We should reset both the engine and the SFC if: 482 * - We were locking the SFC to this engine and the lock succeeded 483 * OR 484 * - We were locking the SFC to a different engine (Wa_14010733141) 485 * but the SFC was released before the lock was obtained. 486 * 487 * Otherwise we need only reset the engine by itself and we can 488 * leave the SFC alone. 489 */ 490 lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & 491 sfc_lock.usage_bit) != 0; 492 if (lock_obtained == lock_to_other) 493 return 0; 494 495 if (ret) { 496 ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n"); 497 return ret; 498 } 499 500 *reset_mask |= sfc_lock.reset_bit; 501 return 0; 502 } 503 504 static void gen11_unlock_sfc(struct intel_engine_cs *engine) 505 { 506 struct intel_uncore *uncore = engine->uncore; 507 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 508 struct sfc_lock_data sfc_lock = {}; 509 510 if (engine->class != VIDEO_DECODE_CLASS && 511 engine->class != VIDEO_ENHANCEMENT_CLASS) 512 return; 513 514 if (engine->class == VIDEO_DECODE_CLASS && 515 (BIT(engine->instance) & vdbox_sfc_access) == 0) 516 return; 517 518 get_sfc_forced_lock_data(engine, &sfc_lock); 519 520 intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit, 0); 521 } 522 523 static int __gen11_reset_engines(struct intel_gt *gt, 524 intel_engine_mask_t engine_mask, 525 unsigned int retry) 526 { 527 struct intel_engine_cs *engine; 528 intel_engine_mask_t tmp; 529 u32 reset_mask, unlock_mask = 0; 530 int ret; 531 532 if (engine_mask == ALL_ENGINES) { 533 reset_mask = GEN11_GRDOM_FULL; 534 } else { 535 reset_mask = 0; 536 for_each_engine_masked(engine, gt, engine_mask, tmp) { 537 reset_mask |= engine->reset_domain; 538 ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask); 539 if (ret) 540 goto sfc_unlock; 541 } 542 } 543 544 ret = gen6_hw_domain_reset(gt, reset_mask); 545 546 sfc_unlock: 547 /* 548 * We unlock the SFC based on the lock status and not the result of 549 * gen11_lock_sfc to make sure that we clean properly if something 550 * wrong happened during the lock (e.g. lock acquired after timeout 551 * expiration). 552 * 553 * Due to Wa_14010733141, we may have locked an SFC to an engine that 554 * wasn't being reset. So instead of calling gen11_unlock_sfc() 555 * on engine_mask, we instead call it on the mask of engines that our 556 * gen11_lock_sfc() calls told us actually had locks attempted. 557 */ 558 for_each_engine_masked(engine, gt, unlock_mask, tmp) 559 gen11_unlock_sfc(engine); 560 561 return ret; 562 } 563 564 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 565 { 566 struct intel_uncore *uncore = engine->uncore; 567 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 568 u32 request, mask, ack; 569 int ret; 570 571 if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1))) 572 return -ETIMEDOUT; 573 574 ack = intel_uncore_read_fw(uncore, reg); 575 if (ack & RESET_CTL_CAT_ERROR) { 576 /* 577 * For catastrophic errors, ready-for-reset sequence 578 * needs to be bypassed: HAS#396813 579 */ 580 request = RESET_CTL_CAT_ERROR; 581 mask = RESET_CTL_CAT_ERROR; 582 583 /* Catastrophic errors need to be cleared by HW */ 584 ack = 0; 585 } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 586 request = RESET_CTL_REQUEST_RESET; 587 mask = RESET_CTL_READY_TO_RESET; 588 ack = RESET_CTL_READY_TO_RESET; 589 } else { 590 return 0; 591 } 592 593 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 594 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 595 700, 0, NULL); 596 if (ret) 597 drm_err(&engine->i915->drm, 598 "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 599 engine->name, request, 600 intel_uncore_read_fw(uncore, reg)); 601 602 return ret; 603 } 604 605 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 606 { 607 intel_uncore_write_fw(engine->uncore, 608 RING_RESET_CTL(engine->mmio_base), 609 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 610 } 611 612 static int gen8_reset_engines(struct intel_gt *gt, 613 intel_engine_mask_t engine_mask, 614 unsigned int retry) 615 { 616 struct intel_engine_cs *engine; 617 const bool reset_non_ready = retry >= 1; 618 intel_engine_mask_t tmp; 619 unsigned long flags; 620 int ret; 621 622 spin_lock_irqsave(>->uncore->lock, flags); 623 624 for_each_engine_masked(engine, gt, engine_mask, tmp) { 625 ret = gen8_engine_reset_prepare(engine); 626 if (ret && !reset_non_ready) 627 goto skip_reset; 628 629 /* 630 * If this is not the first failed attempt to prepare, 631 * we decide to proceed anyway. 632 * 633 * By doing so we risk context corruption and with 634 * some gens (kbl), possible system hang if reset 635 * happens during active bb execution. 636 * 637 * We rather take context corruption instead of 638 * failed reset with a wedged driver/gpu. And 639 * active bb execution case should be covered by 640 * stop_engines() we have before the reset. 641 */ 642 } 643 644 /* 645 * Wa_22011100796:dg2, whenever Full soft reset is required, 646 * reset all individual engines firstly, and then do a full soft reset. 647 * 648 * This is best effort, so ignore any error from the initial reset. 649 */ 650 if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES) 651 __gen11_reset_engines(gt, gt->info.engine_mask, 0); 652 653 if (GRAPHICS_VER(gt->i915) >= 11) 654 ret = __gen11_reset_engines(gt, engine_mask, retry); 655 else 656 ret = __gen6_reset_engines(gt, engine_mask, retry); 657 658 skip_reset: 659 for_each_engine_masked(engine, gt, engine_mask, tmp) 660 gen8_engine_reset_cancel(engine); 661 662 spin_unlock_irqrestore(>->uncore->lock, flags); 663 664 return ret; 665 } 666 667 static int mock_reset(struct intel_gt *gt, 668 intel_engine_mask_t mask, 669 unsigned int retry) 670 { 671 return 0; 672 } 673 674 typedef int (*reset_func)(struct intel_gt *, 675 intel_engine_mask_t engine_mask, 676 unsigned int retry); 677 678 static reset_func intel_get_gpu_reset(const struct intel_gt *gt) 679 { 680 struct drm_i915_private *i915 = gt->i915; 681 682 if (is_mock_gt(gt)) 683 return mock_reset; 684 else if (GRAPHICS_VER(i915) >= 8) 685 return gen8_reset_engines; 686 else if (GRAPHICS_VER(i915) >= 6) 687 return gen6_reset_engines; 688 else if (GRAPHICS_VER(i915) >= 5) 689 return ilk_do_reset; 690 else if (IS_G4X(i915)) 691 return g4x_do_reset; 692 else if (IS_G33(i915) || IS_PINEVIEW(i915)) 693 return g33_do_reset; 694 else if (GRAPHICS_VER(i915) >= 3) 695 return i915_do_reset; 696 else 697 return NULL; 698 } 699 700 static int __reset_guc(struct intel_gt *gt) 701 { 702 u32 guc_domain = 703 GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 704 705 return gen6_hw_domain_reset(gt, guc_domain); 706 } 707 708 static bool needs_wa_14015076503(struct intel_gt *gt, intel_engine_mask_t engine_mask) 709 { 710 if (!IS_METEORLAKE(gt->i915) || !HAS_ENGINE(gt, GSC0)) 711 return false; 712 713 if (!__HAS_ENGINE(engine_mask, GSC0)) 714 return false; 715 716 return intel_gsc_uc_fw_init_done(>->uc.gsc); 717 } 718 719 static intel_engine_mask_t 720 wa_14015076503_start(struct intel_gt *gt, intel_engine_mask_t engine_mask, bool first) 721 { 722 if (!needs_wa_14015076503(gt, engine_mask)) 723 return engine_mask; 724 725 /* 726 * wa_14015076503: if the GSC FW is loaded, we need to alert it that 727 * we're going to do a GSC engine reset and then wait for 200ms for the 728 * FW to get ready for it. However, if this is the first ALL_ENGINES 729 * reset attempt and the GSC is not busy, we can try to instead reset 730 * the GuC and all the other engines individually to avoid the 200ms 731 * wait. 732 * Skipping the GSC engine is safe because, differently from other 733 * engines, the GSCCS only role is to forward the commands to the GSC 734 * FW, so it doesn't have any HW outside of the CS itself and therefore 735 * it has no state that we don't explicitly re-init on resume or on 736 * context switch LRC or power context). The HW for the GSC uC is 737 * managed by the GSC FW so we don't need to care about that. 738 */ 739 if (engine_mask == ALL_ENGINES && first && intel_engine_is_idle(gt->engine[GSC0])) { 740 __reset_guc(gt); 741 engine_mask = gt->info.engine_mask & ~BIT(GSC0); 742 } else { 743 intel_uncore_rmw(gt->uncore, 744 HECI_H_GS1(MTL_GSC_HECI2_BASE), 745 0, HECI_H_GS1_ER_PREP); 746 747 /* make sure the reset bit is clear when writing the CSR reg */ 748 intel_uncore_rmw(gt->uncore, 749 HECI_H_CSR(MTL_GSC_HECI2_BASE), 750 HECI_H_CSR_RST, HECI_H_CSR_IG); 751 msleep(200); 752 } 753 754 return engine_mask; 755 } 756 757 static void 758 wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask) 759 { 760 if (!needs_wa_14015076503(gt, engine_mask)) 761 return; 762 763 intel_uncore_rmw(gt->uncore, 764 HECI_H_GS1(MTL_GSC_HECI2_BASE), 765 HECI_H_GS1_ER_PREP, 0); 766 } 767 768 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) 769 { 770 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 771 reset_func reset; 772 int ret = -ETIMEDOUT; 773 int retry; 774 775 reset = intel_get_gpu_reset(gt); 776 if (!reset) 777 return -ENODEV; 778 779 /* 780 * If the power well sleeps during the reset, the reset 781 * request may be dropped and never completes (causing -EIO). 782 */ 783 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 784 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 785 intel_engine_mask_t reset_mask; 786 787 reset_mask = wa_14015076503_start(gt, engine_mask, !retry); 788 789 GT_TRACE(gt, "engine_mask=%x\n", reset_mask); 790 preempt_disable(); 791 ret = reset(gt, reset_mask, retry); 792 preempt_enable(); 793 794 wa_14015076503_end(gt, reset_mask); 795 } 796 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 797 798 return ret; 799 } 800 801 bool intel_has_gpu_reset(const struct intel_gt *gt) 802 { 803 if (!gt->i915->params.reset) 804 return NULL; 805 806 return intel_get_gpu_reset(gt); 807 } 808 809 bool intel_has_reset_engine(const struct intel_gt *gt) 810 { 811 if (gt->i915->params.reset < 2) 812 return false; 813 814 return INTEL_INFO(gt->i915)->has_reset_engine; 815 } 816 817 int intel_reset_guc(struct intel_gt *gt) 818 { 819 int ret; 820 821 GEM_BUG_ON(!HAS_GT_UC(gt->i915)); 822 823 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 824 ret = __reset_guc(gt); 825 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 826 827 return ret; 828 } 829 830 /* 831 * Ensure irq handler finishes, and not run again. 832 * Also return the active request so that we only search for it once. 833 */ 834 static void reset_prepare_engine(struct intel_engine_cs *engine) 835 { 836 /* 837 * During the reset sequence, we must prevent the engine from 838 * entering RC6. As the context state is undefined until we restart 839 * the engine, if it does enter RC6 during the reset, the state 840 * written to the powercontext is undefined and so we may lose 841 * GPU state upon resume, i.e. fail to restart after a reset. 842 */ 843 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 844 if (engine->reset.prepare) 845 engine->reset.prepare(engine); 846 } 847 848 static void revoke_mmaps(struct intel_gt *gt) 849 { 850 int i; 851 852 for (i = 0; i < gt->ggtt->num_fences; i++) { 853 struct drm_vma_offset_node *node; 854 struct i915_vma *vma; 855 u64 vma_offset; 856 857 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma); 858 if (!vma) 859 continue; 860 861 if (!i915_vma_has_userfault(vma)) 862 continue; 863 864 GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]); 865 866 if (!vma->mmo) 867 continue; 868 869 node = &vma->mmo->vma_node; 870 vma_offset = vma->gtt_view.partial.offset << PAGE_SHIFT; 871 872 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, 873 drm_vma_node_offset_addr(node) + vma_offset, 874 vma->size, 875 1); 876 } 877 } 878 879 static intel_engine_mask_t reset_prepare(struct intel_gt *gt) 880 { 881 struct intel_engine_cs *engine; 882 intel_engine_mask_t awake = 0; 883 enum intel_engine_id id; 884 885 /* For GuC mode, ensure submission is disabled before stopping ring */ 886 intel_uc_reset_prepare(>->uc); 887 888 for_each_engine(engine, gt, id) { 889 if (intel_engine_pm_get_if_awake(engine)) 890 awake |= engine->mask; 891 reset_prepare_engine(engine); 892 } 893 894 return awake; 895 } 896 897 static void gt_revoke(struct intel_gt *gt) 898 { 899 revoke_mmaps(gt); 900 } 901 902 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 903 { 904 struct intel_engine_cs *engine; 905 enum intel_engine_id id; 906 int err; 907 908 /* 909 * Everything depends on having the GTT running, so we need to start 910 * there. 911 */ 912 err = i915_ggtt_enable_hw(gt->i915); 913 if (err) 914 return err; 915 916 local_bh_disable(); 917 for_each_engine(engine, gt, id) 918 __intel_engine_reset(engine, stalled_mask & engine->mask); 919 local_bh_enable(); 920 921 intel_uc_reset(>->uc, ALL_ENGINES); 922 923 intel_ggtt_restore_fences(gt->ggtt); 924 925 return err; 926 } 927 928 static void reset_finish_engine(struct intel_engine_cs *engine) 929 { 930 if (engine->reset.finish) 931 engine->reset.finish(engine); 932 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 933 934 intel_engine_signal_breadcrumbs(engine); 935 } 936 937 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) 938 { 939 struct intel_engine_cs *engine; 940 enum intel_engine_id id; 941 942 for_each_engine(engine, gt, id) { 943 reset_finish_engine(engine); 944 if (awake & engine->mask) 945 intel_engine_pm_put(engine); 946 } 947 948 intel_uc_reset_finish(>->uc); 949 } 950 951 static void nop_submit_request(struct i915_request *request) 952 { 953 RQ_TRACE(request, "-EIO\n"); 954 955 request = i915_request_mark_eio(request); 956 if (request) { 957 i915_request_submit(request); 958 intel_engine_signal_breadcrumbs(request->engine); 959 960 i915_request_put(request); 961 } 962 } 963 964 static void __intel_gt_set_wedged(struct intel_gt *gt) 965 { 966 struct intel_engine_cs *engine; 967 intel_engine_mask_t awake; 968 enum intel_engine_id id; 969 970 if (test_bit(I915_WEDGED, >->reset.flags)) 971 return; 972 973 GT_TRACE(gt, "start\n"); 974 975 /* 976 * First, stop submission to hw, but do not yet complete requests by 977 * rolling the global seqno forward (since this would complete requests 978 * for which we haven't set the fence error to EIO yet). 979 */ 980 awake = reset_prepare(gt); 981 982 /* Even if the GPU reset fails, it should still stop the engines */ 983 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 984 __intel_gt_reset(gt, ALL_ENGINES); 985 986 for_each_engine(engine, gt, id) 987 engine->submit_request = nop_submit_request; 988 989 /* 990 * Make sure no request can slip through without getting completed by 991 * either this call here to intel_engine_write_global_seqno, or the one 992 * in nop_submit_request. 993 */ 994 synchronize_rcu_expedited(); 995 set_bit(I915_WEDGED, >->reset.flags); 996 997 /* Mark all executing requests as skipped */ 998 local_bh_disable(); 999 for_each_engine(engine, gt, id) 1000 if (engine->reset.cancel) 1001 engine->reset.cancel(engine); 1002 intel_uc_cancel_requests(>->uc); 1003 local_bh_enable(); 1004 1005 reset_finish(gt, awake); 1006 1007 GT_TRACE(gt, "end\n"); 1008 } 1009 1010 void intel_gt_set_wedged(struct intel_gt *gt) 1011 { 1012 intel_wakeref_t wakeref; 1013 1014 if (test_bit(I915_WEDGED, >->reset.flags)) 1015 return; 1016 1017 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1018 mutex_lock(>->reset.mutex); 1019 1020 if (GEM_SHOW_DEBUG()) { 1021 struct drm_printer p = drm_debug_printer(__func__); 1022 struct intel_engine_cs *engine; 1023 enum intel_engine_id id; 1024 1025 drm_printf(&p, "called from %pS\n", (void *)_RET_IP_); 1026 for_each_engine(engine, gt, id) { 1027 if (intel_engine_is_idle(engine)) 1028 continue; 1029 1030 intel_engine_dump(engine, &p, "%s\n", engine->name); 1031 } 1032 } 1033 1034 __intel_gt_set_wedged(gt); 1035 1036 mutex_unlock(>->reset.mutex); 1037 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1038 } 1039 1040 static bool __intel_gt_unset_wedged(struct intel_gt *gt) 1041 { 1042 struct intel_gt_timelines *timelines = >->timelines; 1043 struct intel_timeline *tl; 1044 bool ok; 1045 1046 if (!test_bit(I915_WEDGED, >->reset.flags)) 1047 return true; 1048 1049 /* Never fully initialised, recovery impossible */ 1050 if (intel_gt_has_unrecoverable_error(gt)) 1051 return false; 1052 1053 GT_TRACE(gt, "start\n"); 1054 1055 /* 1056 * Before unwedging, make sure that all pending operations 1057 * are flushed and errored out - we may have requests waiting upon 1058 * third party fences. We marked all inflight requests as EIO, and 1059 * every execbuf since returned EIO, for consistency we want all 1060 * the currently pending requests to also be marked as EIO, which 1061 * is done inside our nop_submit_request - and so we must wait. 1062 * 1063 * No more can be submitted until we reset the wedged bit. 1064 */ 1065 spin_lock(&timelines->lock); 1066 list_for_each_entry(tl, &timelines->active_list, link) { 1067 struct dma_fence *fence; 1068 1069 fence = i915_active_fence_get(&tl->last_request); 1070 if (!fence) 1071 continue; 1072 1073 spin_unlock(&timelines->lock); 1074 1075 /* 1076 * All internal dependencies (i915_requests) will have 1077 * been flushed by the set-wedge, but we may be stuck waiting 1078 * for external fences. These should all be capped to 10s 1079 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 1080 * in the worst case. 1081 */ 1082 dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT); 1083 dma_fence_put(fence); 1084 1085 /* Restart iteration after droping lock */ 1086 spin_lock(&timelines->lock); 1087 tl = list_entry(&timelines->active_list, typeof(*tl), link); 1088 } 1089 spin_unlock(&timelines->lock); 1090 1091 /* We must reset pending GPU events before restoring our submission */ 1092 ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ 1093 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1094 ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; 1095 if (!ok) { 1096 /* 1097 * Warn CI about the unrecoverable wedged condition. 1098 * Time for a reboot. 1099 */ 1100 add_taint_for_CI(gt->i915, TAINT_WARN); 1101 return false; 1102 } 1103 1104 /* 1105 * Undo nop_submit_request. We prevent all new i915 requests from 1106 * being queued (by disallowing execbuf whilst wedged) so having 1107 * waited for all active requests above, we know the system is idle 1108 * and do not have to worry about a thread being inside 1109 * engine->submit_request() as we swap over. So unlike installing 1110 * the nop_submit_request on reset, we can do this from normal 1111 * context and do not require stop_machine(). 1112 */ 1113 intel_engines_reset_default_submission(gt); 1114 1115 GT_TRACE(gt, "end\n"); 1116 1117 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 1118 clear_bit(I915_WEDGED, >->reset.flags); 1119 1120 return true; 1121 } 1122 1123 bool intel_gt_unset_wedged(struct intel_gt *gt) 1124 { 1125 bool result; 1126 1127 mutex_lock(>->reset.mutex); 1128 result = __intel_gt_unset_wedged(gt); 1129 mutex_unlock(>->reset.mutex); 1130 1131 return result; 1132 } 1133 1134 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 1135 { 1136 int err, i; 1137 1138 err = __intel_gt_reset(gt, ALL_ENGINES); 1139 for (i = 0; err && i < RESET_MAX_RETRIES; i++) { 1140 msleep(10 * (i + 1)); 1141 err = __intel_gt_reset(gt, ALL_ENGINES); 1142 } 1143 if (err) 1144 return err; 1145 1146 return gt_reset(gt, stalled_mask); 1147 } 1148 1149 static int resume(struct intel_gt *gt) 1150 { 1151 struct intel_engine_cs *engine; 1152 enum intel_engine_id id; 1153 int ret; 1154 1155 for_each_engine(engine, gt, id) { 1156 ret = intel_engine_resume(engine); 1157 if (ret) 1158 return ret; 1159 } 1160 1161 return 0; 1162 } 1163 1164 /** 1165 * intel_gt_reset - reset chip after a hang 1166 * @gt: #intel_gt to reset 1167 * @stalled_mask: mask of the stalled engines with the guilty requests 1168 * @reason: user error message for why we are resetting 1169 * 1170 * Reset the chip. Useful if a hang is detected. Marks the device as wedged 1171 * on failure. 1172 * 1173 * Procedure is fairly simple: 1174 * - reset the chip using the reset reg 1175 * - re-init context state 1176 * - re-init hardware status page 1177 * - re-init ring buffer 1178 * - re-init interrupt state 1179 * - re-init display 1180 */ 1181 void intel_gt_reset(struct intel_gt *gt, 1182 intel_engine_mask_t stalled_mask, 1183 const char *reason) 1184 { 1185 intel_engine_mask_t awake; 1186 int ret; 1187 1188 GT_TRACE(gt, "flags=%lx\n", gt->reset.flags); 1189 1190 might_sleep(); 1191 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1192 1193 /* 1194 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence 1195 * critical section like gpu reset. 1196 */ 1197 gt_revoke(gt); 1198 1199 mutex_lock(>->reset.mutex); 1200 1201 /* Clear any previous failed attempts at recovery. Time to try again. */ 1202 if (!__intel_gt_unset_wedged(gt)) 1203 goto unlock; 1204 1205 if (reason) 1206 drm_notice(>->i915->drm, 1207 "Resetting chip for %s\n", reason); 1208 atomic_inc(>->i915->gpu_error.reset_count); 1209 1210 awake = reset_prepare(gt); 1211 1212 if (!intel_has_gpu_reset(gt)) { 1213 if (gt->i915->params.reset) 1214 drm_err(>->i915->drm, "GPU reset not supported\n"); 1215 else 1216 drm_dbg(>->i915->drm, "GPU reset disabled\n"); 1217 goto error; 1218 } 1219 1220 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1221 intel_runtime_pm_disable_interrupts(gt->i915); 1222 1223 if (do_reset(gt, stalled_mask)) { 1224 drm_err(>->i915->drm, "Failed to reset chip\n"); 1225 goto taint; 1226 } 1227 1228 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1229 intel_runtime_pm_enable_interrupts(gt->i915); 1230 1231 intel_overlay_reset(gt->i915); 1232 1233 /* 1234 * Next we need to restore the context, but we don't use those 1235 * yet either... 1236 * 1237 * Ring buffer needs to be re-initialized in the KMS case, or if X 1238 * was running at the time of the reset (i.e. we weren't VT 1239 * switched away). 1240 */ 1241 ret = intel_gt_init_hw(gt); 1242 if (ret) { 1243 drm_err(>->i915->drm, 1244 "Failed to initialise HW following reset (%d)\n", 1245 ret); 1246 goto taint; 1247 } 1248 1249 ret = resume(gt); 1250 if (ret) 1251 goto taint; 1252 1253 finish: 1254 reset_finish(gt, awake); 1255 unlock: 1256 mutex_unlock(>->reset.mutex); 1257 return; 1258 1259 taint: 1260 /* 1261 * History tells us that if we cannot reset the GPU now, we 1262 * never will. This then impacts everything that is run 1263 * subsequently. On failing the reset, we mark the driver 1264 * as wedged, preventing further execution on the GPU. 1265 * We also want to go one step further and add a taint to the 1266 * kernel so that any subsequent faults can be traced back to 1267 * this failure. This is important for CI, where if the 1268 * GPU/driver fails we would like to reboot and restart testing 1269 * rather than continue on into oblivion. For everyone else, 1270 * the system should still plod along, but they have been warned! 1271 */ 1272 add_taint_for_CI(gt->i915, TAINT_WARN); 1273 error: 1274 __intel_gt_set_wedged(gt); 1275 goto finish; 1276 } 1277 1278 static int intel_gt_reset_engine(struct intel_engine_cs *engine) 1279 { 1280 return __intel_gt_reset(engine->gt, engine->mask); 1281 } 1282 1283 int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) 1284 { 1285 struct intel_gt *gt = engine->gt; 1286 int ret; 1287 1288 ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags); 1289 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)); 1290 1291 if (intel_engine_uses_guc(engine)) 1292 return -ENODEV; 1293 1294 if (!intel_engine_pm_get_if_awake(engine)) 1295 return 0; 1296 1297 reset_prepare_engine(engine); 1298 1299 if (msg) 1300 drm_notice(&engine->i915->drm, 1301 "Resetting %s for %s\n", engine->name, msg); 1302 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]); 1303 1304 ret = intel_gt_reset_engine(engine); 1305 if (ret) { 1306 /* If we fail here, we expect to fallback to a global reset */ 1307 ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret); 1308 goto out; 1309 } 1310 1311 /* 1312 * The request that caused the hang is stuck on elsp, we know the 1313 * active request and can drop it, adjust head to skip the offending 1314 * request to resume executing remaining requests in the queue. 1315 */ 1316 __intel_engine_reset(engine, true); 1317 1318 /* 1319 * The engine and its registers (and workarounds in case of render) 1320 * have been reset to their default values. Follow the init_ring 1321 * process to program RING_MODE, HWSP and re-enable submission. 1322 */ 1323 ret = intel_engine_resume(engine); 1324 1325 out: 1326 intel_engine_cancel_stop_cs(engine); 1327 reset_finish_engine(engine); 1328 intel_engine_pm_put_async(engine); 1329 return ret; 1330 } 1331 1332 /** 1333 * intel_engine_reset - reset GPU engine to recover from a hang 1334 * @engine: engine to reset 1335 * @msg: reason for GPU reset; or NULL for no drm_notice() 1336 * 1337 * Reset a specific GPU engine. Useful if a hang is detected. 1338 * Returns zero on successful reset or otherwise an error code. 1339 * 1340 * Procedure is: 1341 * - identifies the request that caused the hang and it is dropped 1342 * - reset engine (which will force the engine to idle) 1343 * - re-init/configure engine 1344 */ 1345 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) 1346 { 1347 int err; 1348 1349 local_bh_disable(); 1350 err = __intel_engine_reset_bh(engine, msg); 1351 local_bh_enable(); 1352 1353 return err; 1354 } 1355 1356 static void intel_gt_reset_global(struct intel_gt *gt, 1357 u32 engine_mask, 1358 const char *reason) 1359 { 1360 struct kobject *kobj = >->i915->drm.primary->kdev->kobj; 1361 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1362 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1363 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1364 struct intel_wedge_me w; 1365 1366 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1367 1368 GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask); 1369 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1370 1371 /* Use a watchdog to ensure that our reset completes */ 1372 intel_wedge_on_timeout(&w, gt, 60 * HZ) { 1373 intel_display_prepare_reset(gt->i915); 1374 1375 intel_gt_reset(gt, engine_mask, reason); 1376 1377 intel_display_finish_reset(gt->i915); 1378 } 1379 1380 if (!test_bit(I915_WEDGED, >->reset.flags)) 1381 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1382 } 1383 1384 /** 1385 * intel_gt_handle_error - handle a gpu error 1386 * @gt: the intel_gt 1387 * @engine_mask: mask representing engines that are hung 1388 * @flags: control flags 1389 * @fmt: Error message format string 1390 * 1391 * Do some basic checking of register state at error time and 1392 * dump it to the syslog. Also call i915_capture_error_state() to make 1393 * sure we get a record and make it available in debugfs. Fire a uevent 1394 * so userspace knows something bad happened (should trigger collection 1395 * of a ring dump etc.). 1396 */ 1397 void intel_gt_handle_error(struct intel_gt *gt, 1398 intel_engine_mask_t engine_mask, 1399 unsigned long flags, 1400 const char *fmt, ...) 1401 { 1402 struct intel_engine_cs *engine; 1403 intel_wakeref_t wakeref; 1404 intel_engine_mask_t tmp; 1405 char error_msg[80]; 1406 char *msg = NULL; 1407 1408 if (fmt) { 1409 va_list args; 1410 1411 va_start(args, fmt); 1412 vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1413 va_end(args); 1414 1415 msg = error_msg; 1416 } 1417 1418 /* 1419 * In most cases it's guaranteed that we get here with an RPM 1420 * reference held, for example because there is a pending GPU 1421 * request that won't finish until the reset is done. This 1422 * isn't the case at least when we get here by doing a 1423 * simulated reset via debugfs, so get an RPM reference. 1424 */ 1425 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1426 1427 engine_mask &= gt->info.engine_mask; 1428 1429 if (flags & I915_ERROR_CAPTURE) { 1430 i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE); 1431 intel_gt_clear_error_registers(gt, engine_mask); 1432 } 1433 1434 /* 1435 * Try engine reset when available. We fall back to full reset if 1436 * single reset fails. 1437 */ 1438 if (!intel_uc_uses_guc_submission(>->uc) && 1439 intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { 1440 local_bh_disable(); 1441 for_each_engine_masked(engine, gt, engine_mask, tmp) { 1442 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); 1443 if (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1444 >->reset.flags)) 1445 continue; 1446 1447 if (__intel_engine_reset_bh(engine, msg) == 0) 1448 engine_mask &= ~engine->mask; 1449 1450 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, 1451 >->reset.flags); 1452 } 1453 local_bh_enable(); 1454 } 1455 1456 if (!engine_mask) 1457 goto out; 1458 1459 /* Full reset needs the mutex, stop any other user trying to do so. */ 1460 if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1461 wait_event(gt->reset.queue, 1462 !test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1463 goto out; /* piggy-back on the other reset */ 1464 } 1465 1466 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 1467 synchronize_rcu_expedited(); 1468 1469 /* 1470 * Prevent any other reset-engine attempt. We don't do this for GuC 1471 * submission the GuC owns the per-engine reset, not the i915. 1472 */ 1473 if (!intel_uc_uses_guc_submission(>->uc)) { 1474 for_each_engine(engine, gt, tmp) { 1475 while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1476 >->reset.flags)) 1477 wait_on_bit(>->reset.flags, 1478 I915_RESET_ENGINE + engine->id, 1479 TASK_UNINTERRUPTIBLE); 1480 } 1481 } 1482 1483 /* Flush everyone using a resource about to be clobbered */ 1484 synchronize_srcu_expedited(>->reset.backoff_srcu); 1485 1486 intel_gt_reset_global(gt, engine_mask, msg); 1487 1488 if (!intel_uc_uses_guc_submission(>->uc)) { 1489 for_each_engine(engine, gt, tmp) 1490 clear_bit_unlock(I915_RESET_ENGINE + engine->id, 1491 >->reset.flags); 1492 } 1493 clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); 1494 smp_mb__after_atomic(); 1495 wake_up_all(>->reset.queue); 1496 1497 out: 1498 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1499 } 1500 1501 static int _intel_gt_reset_lock(struct intel_gt *gt, int *srcu, bool retry) 1502 { 1503 might_lock(>->reset.backoff_srcu); 1504 if (retry) 1505 might_sleep(); 1506 1507 rcu_read_lock(); 1508 while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1509 rcu_read_unlock(); 1510 1511 if (!retry) 1512 return -EBUSY; 1513 1514 if (wait_event_interruptible(gt->reset.queue, 1515 !test_bit(I915_RESET_BACKOFF, 1516 >->reset.flags))) 1517 return -EINTR; 1518 1519 rcu_read_lock(); 1520 } 1521 *srcu = srcu_read_lock(>->reset.backoff_srcu); 1522 rcu_read_unlock(); 1523 1524 return 0; 1525 } 1526 1527 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) 1528 { 1529 return _intel_gt_reset_lock(gt, srcu, false); 1530 } 1531 1532 int intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu) 1533 { 1534 return _intel_gt_reset_lock(gt, srcu, true); 1535 } 1536 1537 void intel_gt_reset_unlock(struct intel_gt *gt, int tag) 1538 __releases(>->reset.backoff_srcu) 1539 { 1540 srcu_read_unlock(>->reset.backoff_srcu, tag); 1541 } 1542 1543 int intel_gt_terminally_wedged(struct intel_gt *gt) 1544 { 1545 might_sleep(); 1546 1547 if (!intel_gt_is_wedged(gt)) 1548 return 0; 1549 1550 if (intel_gt_has_unrecoverable_error(gt)) 1551 return -EIO; 1552 1553 /* Reset still in progress? Maybe we will recover? */ 1554 if (wait_event_interruptible(gt->reset.queue, 1555 !test_bit(I915_RESET_BACKOFF, 1556 >->reset.flags))) 1557 return -EINTR; 1558 1559 return intel_gt_is_wedged(gt) ? -EIO : 0; 1560 } 1561 1562 void intel_gt_set_wedged_on_init(struct intel_gt *gt) 1563 { 1564 BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > 1565 I915_WEDGED_ON_INIT); 1566 intel_gt_set_wedged(gt); 1567 i915_disable_error_state(gt->i915, -ENODEV); 1568 set_bit(I915_WEDGED_ON_INIT, >->reset.flags); 1569 1570 /* Wedged on init is non-recoverable */ 1571 add_taint_for_CI(gt->i915, TAINT_WARN); 1572 } 1573 1574 void intel_gt_set_wedged_on_fini(struct intel_gt *gt) 1575 { 1576 intel_gt_set_wedged(gt); 1577 i915_disable_error_state(gt->i915, -ENODEV); 1578 set_bit(I915_WEDGED_ON_FINI, >->reset.flags); 1579 intel_gt_retire_requests(gt); /* cleanup any wedged requests */ 1580 } 1581 1582 void intel_gt_init_reset(struct intel_gt *gt) 1583 { 1584 init_waitqueue_head(>->reset.queue); 1585 mutex_init(>->reset.mutex); 1586 init_srcu_struct(>->reset.backoff_srcu); 1587 1588 /* 1589 * While undesirable to wait inside the shrinker, complain anyway. 1590 * 1591 * If we have to wait during shrinking, we guarantee forward progress 1592 * by forcing the reset. Therefore during the reset we must not 1593 * re-enter the shrinker. By declaring that we take the reset mutex 1594 * within the shrinker, we forbid ourselves from performing any 1595 * fs-reclaim or taking related locks during reset. 1596 */ 1597 i915_gem_shrinker_taints_mutex(gt->i915, >->reset.mutex); 1598 1599 /* no GPU until we are ready! */ 1600 __set_bit(I915_WEDGED, >->reset.flags); 1601 } 1602 1603 void intel_gt_fini_reset(struct intel_gt *gt) 1604 { 1605 cleanup_srcu_struct(>->reset.backoff_srcu); 1606 } 1607 1608 static void intel_wedge_me(struct work_struct *work) 1609 { 1610 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); 1611 1612 drm_err(&w->gt->i915->drm, 1613 "%s timed out, cancelling all in-flight rendering.\n", 1614 w->name); 1615 intel_gt_set_wedged(w->gt); 1616 } 1617 1618 void __intel_init_wedge(struct intel_wedge_me *w, 1619 struct intel_gt *gt, 1620 long timeout, 1621 const char *name) 1622 { 1623 w->gt = gt; 1624 w->name = name; 1625 1626 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); 1627 schedule_delayed_work(&w->work, timeout); 1628 } 1629 1630 void __intel_fini_wedge(struct intel_wedge_me *w) 1631 { 1632 cancel_delayed_work_sync(&w->work); 1633 destroy_delayed_work_on_stack(&w->work); 1634 w->gt = NULL; 1635 } 1636 1637 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1638 #include "selftest_reset.c" 1639 #include "selftest_hangcheck.c" 1640 #endif 1641