1112ed2d3SChris Wilson /* 2112ed2d3SChris Wilson * SPDX-License-Identifier: MIT 3112ed2d3SChris Wilson * 4112ed2d3SChris Wilson * Copyright © 2008-2018 Intel Corporation 5112ed2d3SChris Wilson */ 6112ed2d3SChris Wilson 7112ed2d3SChris Wilson #include <linux/sched/mm.h> 8112ed2d3SChris Wilson #include <linux/stop_machine.h> 9112ed2d3SChris Wilson 10112ed2d3SChris Wilson #include "i915_drv.h" 11112ed2d3SChris Wilson #include "i915_gpu_error.h" 12440e2b3dSJani Nikula #include "i915_irq.h" 1379ffac85SChris Wilson #include "intel_engine_pm.h" 1479ffac85SChris Wilson #include "intel_gt_pm.h" 15112ed2d3SChris Wilson #include "intel_reset.h" 16112ed2d3SChris Wilson 17112ed2d3SChris Wilson #include "intel_guc.h" 1805ca9306SJani Nikula #include "intel_overlay.h" 19112ed2d3SChris Wilson 20112ed2d3SChris Wilson #define RESET_MAX_RETRIES 3 21112ed2d3SChris Wilson 22112ed2d3SChris Wilson /* XXX How to handle concurrent GGTT updates using tiling registers? */ 23112ed2d3SChris Wilson #define RESET_UNDER_STOP_MACHINE 0 24112ed2d3SChris Wilson 25112ed2d3SChris Wilson static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 26112ed2d3SChris Wilson { 27112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, 0, set); 28112ed2d3SChris Wilson } 29112ed2d3SChris Wilson 30112ed2d3SChris Wilson static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 31112ed2d3SChris Wilson { 32112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, clr, 0); 33112ed2d3SChris Wilson } 34112ed2d3SChris Wilson 35112ed2d3SChris Wilson static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 36112ed2d3SChris Wilson { 37112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, 0, set); 38112ed2d3SChris Wilson } 39112ed2d3SChris Wilson 40112ed2d3SChris Wilson static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 41112ed2d3SChris Wilson { 42112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, clr, 0); 43112ed2d3SChris Wilson } 44112ed2d3SChris 
Wilson 45112ed2d3SChris Wilson static void engine_skip_context(struct i915_request *rq) 46112ed2d3SChris Wilson { 47112ed2d3SChris Wilson struct intel_engine_cs *engine = rq->engine; 48112ed2d3SChris Wilson struct i915_gem_context *hung_ctx = rq->gem_context; 49112ed2d3SChris Wilson 50112ed2d3SChris Wilson lockdep_assert_held(&engine->timeline.lock); 51112ed2d3SChris Wilson 52112ed2d3SChris Wilson if (!i915_request_is_active(rq)) 53112ed2d3SChris Wilson return; 54112ed2d3SChris Wilson 55112ed2d3SChris Wilson list_for_each_entry_continue(rq, &engine->timeline.requests, link) 56112ed2d3SChris Wilson if (rq->gem_context == hung_ctx) 57112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 58112ed2d3SChris Wilson } 59112ed2d3SChris Wilson 60112ed2d3SChris Wilson static void client_mark_guilty(struct drm_i915_file_private *file_priv, 61112ed2d3SChris Wilson const struct i915_gem_context *ctx) 62112ed2d3SChris Wilson { 63112ed2d3SChris Wilson unsigned int score; 64112ed2d3SChris Wilson unsigned long prev_hang; 65112ed2d3SChris Wilson 66112ed2d3SChris Wilson if (i915_gem_context_is_banned(ctx)) 67112ed2d3SChris Wilson score = I915_CLIENT_SCORE_CONTEXT_BAN; 68112ed2d3SChris Wilson else 69112ed2d3SChris Wilson score = 0; 70112ed2d3SChris Wilson 71112ed2d3SChris Wilson prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 72112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 73112ed2d3SChris Wilson score += I915_CLIENT_SCORE_HANG_FAST; 74112ed2d3SChris Wilson 75112ed2d3SChris Wilson if (score) { 76112ed2d3SChris Wilson atomic_add(score, &file_priv->ban_score); 77112ed2d3SChris Wilson 78112ed2d3SChris Wilson DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n", 79112ed2d3SChris Wilson ctx->name, score, 80112ed2d3SChris Wilson atomic_read(&file_priv->ban_score)); 81112ed2d3SChris Wilson } 82112ed2d3SChris Wilson } 83112ed2d3SChris Wilson 84112ed2d3SChris Wilson static bool context_mark_guilty(struct i915_gem_context *ctx) 
85112ed2d3SChris Wilson { 86112ed2d3SChris Wilson unsigned long prev_hang; 87112ed2d3SChris Wilson bool banned; 88112ed2d3SChris Wilson int i; 89112ed2d3SChris Wilson 90112ed2d3SChris Wilson atomic_inc(&ctx->guilty_count); 91112ed2d3SChris Wilson 92112ed2d3SChris Wilson /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 93112ed2d3SChris Wilson if (!i915_gem_context_is_bannable(ctx)) 94112ed2d3SChris Wilson return false; 95112ed2d3SChris Wilson 96112ed2d3SChris Wilson /* Record the timestamp for the last N hangs */ 97112ed2d3SChris Wilson prev_hang = ctx->hang_timestamp[0]; 98112ed2d3SChris Wilson for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 99112ed2d3SChris Wilson ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 100112ed2d3SChris Wilson ctx->hang_timestamp[i] = jiffies; 101112ed2d3SChris Wilson 102112ed2d3SChris Wilson /* If we have hung N+1 times in rapid succession, we ban the context! */ 103112ed2d3SChris Wilson banned = !i915_gem_context_is_recoverable(ctx); 104112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 105112ed2d3SChris Wilson banned = true; 106112ed2d3SChris Wilson if (banned) { 107112ed2d3SChris Wilson DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", 108112ed2d3SChris Wilson ctx->name, atomic_read(&ctx->guilty_count)); 109112ed2d3SChris Wilson i915_gem_context_set_banned(ctx); 110112ed2d3SChris Wilson } 111112ed2d3SChris Wilson 112112ed2d3SChris Wilson if (!IS_ERR_OR_NULL(ctx->file_priv)) 113112ed2d3SChris Wilson client_mark_guilty(ctx->file_priv, ctx); 114112ed2d3SChris Wilson 115112ed2d3SChris Wilson return banned; 116112ed2d3SChris Wilson } 117112ed2d3SChris Wilson 118112ed2d3SChris Wilson static void context_mark_innocent(struct i915_gem_context *ctx) 119112ed2d3SChris Wilson { 120112ed2d3SChris Wilson atomic_inc(&ctx->active_count); 121112ed2d3SChris Wilson } 122112ed2d3SChris Wilson 123112ed2d3SChris Wilson void i915_reset_request(struct i915_request *rq, bool guilty) 
124112ed2d3SChris Wilson { 125112ed2d3SChris Wilson GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n", 126112ed2d3SChris Wilson rq->engine->name, 127112ed2d3SChris Wilson rq->fence.context, 128112ed2d3SChris Wilson rq->fence.seqno, 129112ed2d3SChris Wilson yesno(guilty)); 130112ed2d3SChris Wilson 131112ed2d3SChris Wilson lockdep_assert_held(&rq->engine->timeline.lock); 132112ed2d3SChris Wilson GEM_BUG_ON(i915_request_completed(rq)); 133112ed2d3SChris Wilson 134112ed2d3SChris Wilson if (guilty) { 135112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 136112ed2d3SChris Wilson if (context_mark_guilty(rq->gem_context)) 137112ed2d3SChris Wilson engine_skip_context(rq); 138112ed2d3SChris Wilson } else { 139112ed2d3SChris Wilson dma_fence_set_error(&rq->fence, -EAGAIN); 140112ed2d3SChris Wilson context_mark_innocent(rq->gem_context); 141112ed2d3SChris Wilson } 142112ed2d3SChris Wilson } 143112ed2d3SChris Wilson 144112ed2d3SChris Wilson static void gen3_stop_engine(struct intel_engine_cs *engine) 145112ed2d3SChris Wilson { 146112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 147112ed2d3SChris Wilson const u32 base = engine->mmio_base; 148112ed2d3SChris Wilson 149112ed2d3SChris Wilson GEM_TRACE("%s\n", engine->name); 150112ed2d3SChris Wilson 151112ed2d3SChris Wilson if (intel_engine_stop_cs(engine)) 152112ed2d3SChris Wilson GEM_TRACE("%s: timed out on STOP_RING\n", engine->name); 153112ed2d3SChris Wilson 154112ed2d3SChris Wilson intel_uncore_write_fw(uncore, 155112ed2d3SChris Wilson RING_HEAD(base), 156112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_TAIL(base))); 157112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */ 158112ed2d3SChris Wilson 159112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_HEAD(base), 0); 160112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_TAIL(base), 0); 161112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_TAIL(base)); 162112ed2d3SChris Wilson 163112ed2d3SChris Wilson 
/* The ring must be empty before it is disabled */ 164112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_CTL(base), 0); 165112ed2d3SChris Wilson 166112ed2d3SChris Wilson /* Check acts as a post */ 167112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, RING_HEAD(base))) 168112ed2d3SChris Wilson GEM_TRACE("%s: ring head [%x] not parked\n", 169112ed2d3SChris Wilson engine->name, 170112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_HEAD(base))); 171112ed2d3SChris Wilson } 172112ed2d3SChris Wilson 173112ed2d3SChris Wilson static void i915_stop_engines(struct drm_i915_private *i915, 174112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 175112ed2d3SChris Wilson { 176112ed2d3SChris Wilson struct intel_engine_cs *engine; 177112ed2d3SChris Wilson intel_engine_mask_t tmp; 178112ed2d3SChris Wilson 179112ed2d3SChris Wilson if (INTEL_GEN(i915) < 3) 180112ed2d3SChris Wilson return; 181112ed2d3SChris Wilson 182112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 183112ed2d3SChris Wilson gen3_stop_engine(engine); 184112ed2d3SChris Wilson } 185112ed2d3SChris Wilson 186112ed2d3SChris Wilson static bool i915_in_reset(struct pci_dev *pdev) 187112ed2d3SChris Wilson { 188112ed2d3SChris Wilson u8 gdrst; 189112ed2d3SChris Wilson 190112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 191112ed2d3SChris Wilson return gdrst & GRDOM_RESET_STATUS; 192112ed2d3SChris Wilson } 193112ed2d3SChris Wilson 194112ed2d3SChris Wilson static int i915_do_reset(struct drm_i915_private *i915, 195112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 196112ed2d3SChris Wilson unsigned int retry) 197112ed2d3SChris Wilson { 198112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 199112ed2d3SChris Wilson int err; 200112ed2d3SChris Wilson 201112ed2d3SChris Wilson /* Assert reset for at least 20 usec, and wait for acknowledgement. 
*/ 202112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 203112ed2d3SChris Wilson udelay(50); 204112ed2d3SChris Wilson err = wait_for_atomic(i915_in_reset(pdev), 50); 205112ed2d3SChris Wilson 206112ed2d3SChris Wilson /* Clear the reset request. */ 207112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 208112ed2d3SChris Wilson udelay(50); 209112ed2d3SChris Wilson if (!err) 210112ed2d3SChris Wilson err = wait_for_atomic(!i915_in_reset(pdev), 50); 211112ed2d3SChris Wilson 212112ed2d3SChris Wilson return err; 213112ed2d3SChris Wilson } 214112ed2d3SChris Wilson 215112ed2d3SChris Wilson static bool g4x_reset_complete(struct pci_dev *pdev) 216112ed2d3SChris Wilson { 217112ed2d3SChris Wilson u8 gdrst; 218112ed2d3SChris Wilson 219112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 220112ed2d3SChris Wilson return (gdrst & GRDOM_RESET_ENABLE) == 0; 221112ed2d3SChris Wilson } 222112ed2d3SChris Wilson 223112ed2d3SChris Wilson static int g33_do_reset(struct drm_i915_private *i915, 224112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 225112ed2d3SChris Wilson unsigned int retry) 226112ed2d3SChris Wilson { 227112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 228112ed2d3SChris Wilson 229112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 230112ed2d3SChris Wilson return wait_for_atomic(g4x_reset_complete(pdev), 50); 231112ed2d3SChris Wilson } 232112ed2d3SChris Wilson 233112ed2d3SChris Wilson static int g4x_do_reset(struct drm_i915_private *i915, 234112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 235112ed2d3SChris Wilson unsigned int retry) 236112ed2d3SChris Wilson { 237112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 238112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 239112ed2d3SChris Wilson int ret; 240112ed2d3SChris Wilson 241112ed2d3SChris Wilson /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 242112ed2d3SChris Wilson 
rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 243112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 244112ed2d3SChris Wilson 245112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 246112ed2d3SChris Wilson GRDOM_MEDIA | GRDOM_RESET_ENABLE); 247112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 248112ed2d3SChris Wilson if (ret) { 249112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 250112ed2d3SChris Wilson goto out; 251112ed2d3SChris Wilson } 252112ed2d3SChris Wilson 253112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 254112ed2d3SChris Wilson GRDOM_RENDER | GRDOM_RESET_ENABLE); 255112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 256112ed2d3SChris Wilson if (ret) { 257112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 258112ed2d3SChris Wilson goto out; 259112ed2d3SChris Wilson } 260112ed2d3SChris Wilson 261112ed2d3SChris Wilson out: 262112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 263112ed2d3SChris Wilson 264112ed2d3SChris Wilson rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 265112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 266112ed2d3SChris Wilson 267112ed2d3SChris Wilson return ret; 268112ed2d3SChris Wilson } 269112ed2d3SChris Wilson 270112ed2d3SChris Wilson static int ironlake_do_reset(struct drm_i915_private *i915, 271112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 272112ed2d3SChris Wilson unsigned int retry) 273112ed2d3SChris Wilson { 274112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 275112ed2d3SChris Wilson int ret; 276112ed2d3SChris Wilson 277112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 278112ed2d3SChris Wilson ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 279112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 280112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 
281112ed2d3SChris Wilson 5000, 0, 282112ed2d3SChris Wilson NULL); 283112ed2d3SChris Wilson if (ret) { 284112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 285112ed2d3SChris Wilson goto out; 286112ed2d3SChris Wilson } 287112ed2d3SChris Wilson 288112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 289112ed2d3SChris Wilson ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 290112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 291112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 292112ed2d3SChris Wilson 5000, 0, 293112ed2d3SChris Wilson NULL); 294112ed2d3SChris Wilson if (ret) { 295112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 296112ed2d3SChris Wilson goto out; 297112ed2d3SChris Wilson } 298112ed2d3SChris Wilson 299112ed2d3SChris Wilson out: 300112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 0); 301112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, ILK_GDSR); 302112ed2d3SChris Wilson return ret; 303112ed2d3SChris Wilson } 304112ed2d3SChris Wilson 305112ed2d3SChris Wilson /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 306112ed2d3SChris Wilson static int gen6_hw_domain_reset(struct drm_i915_private *i915, 307112ed2d3SChris Wilson u32 hw_domain_mask) 308112ed2d3SChris Wilson { 309112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 310112ed2d3SChris Wilson int err; 311112ed2d3SChris Wilson 312112ed2d3SChris Wilson /* 313112ed2d3SChris Wilson * GEN6_GDRST is not in the gt power well, no need to check 314112ed2d3SChris Wilson * for fifo space for the write or forcewake the chip for 315112ed2d3SChris Wilson * the read 316112ed2d3SChris Wilson */ 317112ed2d3SChris Wilson intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 318112ed2d3SChris Wilson 319112ed2d3SChris Wilson /* Wait for the device to ack the reset requests */ 320112ed2d3SChris Wilson err = __intel_wait_for_register_fw(uncore, 321112ed2d3SChris Wilson GEN6_GDRST, hw_domain_mask, 
0, 322112ed2d3SChris Wilson 500, 0, 323112ed2d3SChris Wilson NULL); 324112ed2d3SChris Wilson if (err) 325112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n", 326112ed2d3SChris Wilson hw_domain_mask); 327112ed2d3SChris Wilson 328112ed2d3SChris Wilson return err; 329112ed2d3SChris Wilson } 330112ed2d3SChris Wilson 331112ed2d3SChris Wilson static int gen6_reset_engines(struct drm_i915_private *i915, 332112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 333112ed2d3SChris Wilson unsigned int retry) 334112ed2d3SChris Wilson { 335112ed2d3SChris Wilson struct intel_engine_cs *engine; 336112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 337112ed2d3SChris Wilson [RCS0] = GEN6_GRDOM_RENDER, 338112ed2d3SChris Wilson [BCS0] = GEN6_GRDOM_BLT, 339112ed2d3SChris Wilson [VCS0] = GEN6_GRDOM_MEDIA, 340112ed2d3SChris Wilson [VCS1] = GEN8_GRDOM_MEDIA2, 341112ed2d3SChris Wilson [VECS0] = GEN6_GRDOM_VECS, 342112ed2d3SChris Wilson }; 343112ed2d3SChris Wilson u32 hw_mask; 344112ed2d3SChris Wilson 345112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 346112ed2d3SChris Wilson hw_mask = GEN6_GRDOM_FULL; 347112ed2d3SChris Wilson } else { 348112ed2d3SChris Wilson intel_engine_mask_t tmp; 349112ed2d3SChris Wilson 350112ed2d3SChris Wilson hw_mask = 0; 351112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 352112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 353112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 354112ed2d3SChris Wilson } 355112ed2d3SChris Wilson } 356112ed2d3SChris Wilson 357112ed2d3SChris Wilson return gen6_hw_domain_reset(i915, hw_mask); 358112ed2d3SChris Wilson } 359112ed2d3SChris Wilson 360112ed2d3SChris Wilson static u32 gen11_lock_sfc(struct intel_engine_cs *engine) 361112ed2d3SChris Wilson { 362112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 363112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 364112ed2d3SChris 
Wilson i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; 365112ed2d3SChris Wilson u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; 366112ed2d3SChris Wilson i915_reg_t sfc_usage; 367112ed2d3SChris Wilson u32 sfc_usage_bit; 368112ed2d3SChris Wilson u32 sfc_reset_bit; 369112ed2d3SChris Wilson 370112ed2d3SChris Wilson switch (engine->class) { 371112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 372112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 373112ed2d3SChris Wilson return 0; 374112ed2d3SChris Wilson 375112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 376112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 377112ed2d3SChris Wilson 378112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); 379112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 380112ed2d3SChris Wilson 381112ed2d3SChris Wilson sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); 382112ed2d3SChris Wilson sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; 383112ed2d3SChris Wilson sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 384112ed2d3SChris Wilson break; 385112ed2d3SChris Wilson 386112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 387112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 388112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 389112ed2d3SChris Wilson 390112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); 391112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 392112ed2d3SChris Wilson 393112ed2d3SChris Wilson sfc_usage = GEN11_VECS_SFC_USAGE(engine); 394112ed2d3SChris Wilson sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; 395112ed2d3SChris Wilson sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 396112ed2d3SChris Wilson break; 397112ed2d3SChris Wilson 398112ed2d3SChris Wilson default: 399112ed2d3SChris Wilson return 0; 400112ed2d3SChris Wilson } 401112ed2d3SChris Wilson 
402112ed2d3SChris Wilson /* 403112ed2d3SChris Wilson * Tell the engine that a software reset is going to happen. The engine 404112ed2d3SChris Wilson * will then try to force lock the SFC (if currently locked, it will 405112ed2d3SChris Wilson * remain so until we tell the engine it is safe to unlock; if currently 406112ed2d3SChris Wilson * unlocked, it will ignore this and all new lock requests). If SFC 407112ed2d3SChris Wilson * ends up being locked to the engine we want to reset, we have to reset 408112ed2d3SChris Wilson * it as well (we will unlock it once the reset sequence is completed). 409112ed2d3SChris Wilson */ 410112ed2d3SChris Wilson rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 411112ed2d3SChris Wilson 412112ed2d3SChris Wilson if (__intel_wait_for_register_fw(uncore, 413112ed2d3SChris Wilson sfc_forced_lock_ack, 414112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 415112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 416112ed2d3SChris Wilson 1000, 0, NULL)) { 417112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n"); 418112ed2d3SChris Wilson return 0; 419112ed2d3SChris Wilson } 420112ed2d3SChris Wilson 421112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit) 422112ed2d3SChris Wilson return sfc_reset_bit; 423112ed2d3SChris Wilson 424112ed2d3SChris Wilson return 0; 425112ed2d3SChris Wilson } 426112ed2d3SChris Wilson 427112ed2d3SChris Wilson static void gen11_unlock_sfc(struct intel_engine_cs *engine) 428112ed2d3SChris Wilson { 429112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 430112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 431112ed2d3SChris Wilson i915_reg_t sfc_forced_lock; 432112ed2d3SChris Wilson u32 sfc_forced_lock_bit; 433112ed2d3SChris Wilson 434112ed2d3SChris Wilson switch (engine->class) { 435112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 436112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 
437112ed2d3SChris Wilson return; 438112ed2d3SChris Wilson 439112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 440112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 441112ed2d3SChris Wilson break; 442112ed2d3SChris Wilson 443112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 444112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 445112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 446112ed2d3SChris Wilson break; 447112ed2d3SChris Wilson 448112ed2d3SChris Wilson default: 449112ed2d3SChris Wilson return; 450112ed2d3SChris Wilson } 451112ed2d3SChris Wilson 452112ed2d3SChris Wilson rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 453112ed2d3SChris Wilson } 454112ed2d3SChris Wilson 455112ed2d3SChris Wilson static int gen11_reset_engines(struct drm_i915_private *i915, 456112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 457112ed2d3SChris Wilson unsigned int retry) 458112ed2d3SChris Wilson { 459112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 460112ed2d3SChris Wilson [RCS0] = GEN11_GRDOM_RENDER, 461112ed2d3SChris Wilson [BCS0] = GEN11_GRDOM_BLT, 462112ed2d3SChris Wilson [VCS0] = GEN11_GRDOM_MEDIA, 463112ed2d3SChris Wilson [VCS1] = GEN11_GRDOM_MEDIA2, 464112ed2d3SChris Wilson [VCS2] = GEN11_GRDOM_MEDIA3, 465112ed2d3SChris Wilson [VCS3] = GEN11_GRDOM_MEDIA4, 466112ed2d3SChris Wilson [VECS0] = GEN11_GRDOM_VECS, 467112ed2d3SChris Wilson [VECS1] = GEN11_GRDOM_VECS2, 468112ed2d3SChris Wilson }; 469112ed2d3SChris Wilson struct intel_engine_cs *engine; 470112ed2d3SChris Wilson intel_engine_mask_t tmp; 471112ed2d3SChris Wilson u32 hw_mask; 472112ed2d3SChris Wilson int ret; 473112ed2d3SChris Wilson 474112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 475112ed2d3SChris Wilson hw_mask = GEN11_GRDOM_FULL; 476112ed2d3SChris Wilson } else { 477112ed2d3SChris Wilson hw_mask = 0; 478112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 
479112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 480112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 481112ed2d3SChris Wilson hw_mask |= gen11_lock_sfc(engine); 482112ed2d3SChris Wilson } 483112ed2d3SChris Wilson } 484112ed2d3SChris Wilson 485112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, hw_mask); 486112ed2d3SChris Wilson 487112ed2d3SChris Wilson if (engine_mask != ALL_ENGINES) 488112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 489112ed2d3SChris Wilson gen11_unlock_sfc(engine); 490112ed2d3SChris Wilson 491112ed2d3SChris Wilson return ret; 492112ed2d3SChris Wilson } 493112ed2d3SChris Wilson 494112ed2d3SChris Wilson static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 495112ed2d3SChris Wilson { 496112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 497112ed2d3SChris Wilson const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 498112ed2d3SChris Wilson u32 request, mask, ack; 499112ed2d3SChris Wilson int ret; 500112ed2d3SChris Wilson 501112ed2d3SChris Wilson ack = intel_uncore_read_fw(uncore, reg); 502112ed2d3SChris Wilson if (ack & RESET_CTL_CAT_ERROR) { 503112ed2d3SChris Wilson /* 504112ed2d3SChris Wilson * For catastrophic errors, ready-for-reset sequence 505112ed2d3SChris Wilson * needs to be bypassed: HAS#396813 506112ed2d3SChris Wilson */ 507112ed2d3SChris Wilson request = RESET_CTL_CAT_ERROR; 508112ed2d3SChris Wilson mask = RESET_CTL_CAT_ERROR; 509112ed2d3SChris Wilson 510112ed2d3SChris Wilson /* Catastrophic errors need to be cleared by HW */ 511112ed2d3SChris Wilson ack = 0; 512112ed2d3SChris Wilson } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 513112ed2d3SChris Wilson request = RESET_CTL_REQUEST_RESET; 514112ed2d3SChris Wilson mask = RESET_CTL_READY_TO_RESET; 515112ed2d3SChris Wilson ack = RESET_CTL_READY_TO_RESET; 516112ed2d3SChris Wilson } else { 517112ed2d3SChris Wilson return 0; 518112ed2d3SChris Wilson } 519112ed2d3SChris Wilson 
520112ed2d3SChris Wilson intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 521112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 522112ed2d3SChris Wilson 700, 0, NULL); 523112ed2d3SChris Wilson if (ret) 524112ed2d3SChris Wilson DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 525112ed2d3SChris Wilson engine->name, request, 526112ed2d3SChris Wilson intel_uncore_read_fw(uncore, reg)); 527112ed2d3SChris Wilson 528112ed2d3SChris Wilson return ret; 529112ed2d3SChris Wilson } 530112ed2d3SChris Wilson 531112ed2d3SChris Wilson static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 532112ed2d3SChris Wilson { 533112ed2d3SChris Wilson intel_uncore_write_fw(engine->uncore, 534112ed2d3SChris Wilson RING_RESET_CTL(engine->mmio_base), 535112ed2d3SChris Wilson _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 536112ed2d3SChris Wilson } 537112ed2d3SChris Wilson 538112ed2d3SChris Wilson static int gen8_reset_engines(struct drm_i915_private *i915, 539112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 540112ed2d3SChris Wilson unsigned int retry) 541112ed2d3SChris Wilson { 542112ed2d3SChris Wilson struct intel_engine_cs *engine; 543112ed2d3SChris Wilson const bool reset_non_ready = retry >= 1; 544112ed2d3SChris Wilson intel_engine_mask_t tmp; 545112ed2d3SChris Wilson int ret; 546112ed2d3SChris Wilson 547112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 548112ed2d3SChris Wilson ret = gen8_engine_reset_prepare(engine); 549112ed2d3SChris Wilson if (ret && !reset_non_ready) 550112ed2d3SChris Wilson goto skip_reset; 551112ed2d3SChris Wilson 552112ed2d3SChris Wilson /* 553112ed2d3SChris Wilson * If this is not the first failed attempt to prepare, 554112ed2d3SChris Wilson * we decide to proceed anyway. 
555112ed2d3SChris Wilson * 556112ed2d3SChris Wilson * By doing so we risk context corruption and with 557112ed2d3SChris Wilson * some gens (kbl), possible system hang if reset 558112ed2d3SChris Wilson * happens during active bb execution. 559112ed2d3SChris Wilson * 560112ed2d3SChris Wilson * We rather take context corruption instead of 561112ed2d3SChris Wilson * failed reset with a wedged driver/gpu. And 562112ed2d3SChris Wilson * active bb execution case should be covered by 563112ed2d3SChris Wilson * i915_stop_engines we have before the reset. 564112ed2d3SChris Wilson */ 565112ed2d3SChris Wilson } 566112ed2d3SChris Wilson 567112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 11) 568112ed2d3SChris Wilson ret = gen11_reset_engines(i915, engine_mask, retry); 569112ed2d3SChris Wilson else 570112ed2d3SChris Wilson ret = gen6_reset_engines(i915, engine_mask, retry); 571112ed2d3SChris Wilson 572112ed2d3SChris Wilson skip_reset: 573112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 574112ed2d3SChris Wilson gen8_engine_reset_cancel(engine); 575112ed2d3SChris Wilson 576112ed2d3SChris Wilson return ret; 577112ed2d3SChris Wilson } 578112ed2d3SChris Wilson 579112ed2d3SChris Wilson typedef int (*reset_func)(struct drm_i915_private *, 580112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 581112ed2d3SChris Wilson unsigned int retry); 582112ed2d3SChris Wilson 583112ed2d3SChris Wilson static reset_func intel_get_gpu_reset(struct drm_i915_private *i915) 584112ed2d3SChris Wilson { 585112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 8) 586112ed2d3SChris Wilson return gen8_reset_engines; 587112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 6) 588112ed2d3SChris Wilson return gen6_reset_engines; 589112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 5) 590112ed2d3SChris Wilson return ironlake_do_reset; 591112ed2d3SChris Wilson else if (IS_G4X(i915)) 592112ed2d3SChris Wilson return g4x_do_reset; 593112ed2d3SChris Wilson else if (IS_G33(i915) || IS_PINEVIEW(i915)) 
594112ed2d3SChris Wilson return g33_do_reset; 595112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 3) 596112ed2d3SChris Wilson return i915_do_reset; 597112ed2d3SChris Wilson else 598112ed2d3SChris Wilson return NULL; 599112ed2d3SChris Wilson } 600112ed2d3SChris Wilson 601112ed2d3SChris Wilson int intel_gpu_reset(struct drm_i915_private *i915, 602112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 603112ed2d3SChris Wilson { 604112ed2d3SChris Wilson const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 605112ed2d3SChris Wilson reset_func reset; 606112ed2d3SChris Wilson int ret = -ETIMEDOUT; 607112ed2d3SChris Wilson int retry; 608112ed2d3SChris Wilson 609112ed2d3SChris Wilson reset = intel_get_gpu_reset(i915); 610112ed2d3SChris Wilson if (!reset) 611112ed2d3SChris Wilson return -ENODEV; 612112ed2d3SChris Wilson 613112ed2d3SChris Wilson /* 614112ed2d3SChris Wilson * If the power well sleeps during the reset, the reset 615112ed2d3SChris Wilson * request may be dropped and never completes (causing -EIO). 616112ed2d3SChris Wilson */ 617112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 618112ed2d3SChris Wilson for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 619112ed2d3SChris Wilson /* 620112ed2d3SChris Wilson * We stop engines, otherwise we might get failed reset and a 621112ed2d3SChris Wilson * dead gpu (on elk). Also as modern gpu as kbl can suffer 622112ed2d3SChris Wilson * from system hang if batchbuffer is progressing when 623112ed2d3SChris Wilson * the reset is issued, regardless of READY_TO_RESET ack. 624112ed2d3SChris Wilson * Thus assume it is best to stop engines on all gens 625112ed2d3SChris Wilson * where we have a gpu reset. 
626112ed2d3SChris Wilson * 627112ed2d3SChris Wilson * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 628112ed2d3SChris Wilson * 629112ed2d3SChris Wilson * WaMediaResetMainRingCleanup:ctg,elk (presumably) 630112ed2d3SChris Wilson * 631112ed2d3SChris Wilson * FIXME: Wa for more modern gens needs to be validated 632112ed2d3SChris Wilson */ 633112ed2d3SChris Wilson if (retry) 634112ed2d3SChris Wilson i915_stop_engines(i915, engine_mask); 635112ed2d3SChris Wilson 636112ed2d3SChris Wilson GEM_TRACE("engine_mask=%x\n", engine_mask); 637112ed2d3SChris Wilson preempt_disable(); 638112ed2d3SChris Wilson ret = reset(i915, engine_mask, retry); 639112ed2d3SChris Wilson preempt_enable(); 640112ed2d3SChris Wilson } 641112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 642112ed2d3SChris Wilson 643112ed2d3SChris Wilson return ret; 644112ed2d3SChris Wilson } 645112ed2d3SChris Wilson 646112ed2d3SChris Wilson bool intel_has_gpu_reset(struct drm_i915_private *i915) 647112ed2d3SChris Wilson { 648112ed2d3SChris Wilson if (!i915_modparams.reset) 649112ed2d3SChris Wilson return NULL; 650112ed2d3SChris Wilson 651112ed2d3SChris Wilson return intel_get_gpu_reset(i915); 652112ed2d3SChris Wilson } 653112ed2d3SChris Wilson 654112ed2d3SChris Wilson bool intel_has_reset_engine(struct drm_i915_private *i915) 655112ed2d3SChris Wilson { 656112ed2d3SChris Wilson return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2; 657112ed2d3SChris Wilson } 658112ed2d3SChris Wilson 659112ed2d3SChris Wilson int intel_reset_guc(struct drm_i915_private *i915) 660112ed2d3SChris Wilson { 661112ed2d3SChris Wilson u32 guc_domain = 662112ed2d3SChris Wilson INTEL_GEN(i915) >= 11 ? 
GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 663112ed2d3SChris Wilson int ret; 664112ed2d3SChris Wilson 665112ed2d3SChris Wilson GEM_BUG_ON(!HAS_GUC(i915)); 666112ed2d3SChris Wilson 667112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 668112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, guc_domain); 669112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 670112ed2d3SChris Wilson 671112ed2d3SChris Wilson return ret; 672112ed2d3SChris Wilson } 673112ed2d3SChris Wilson 674112ed2d3SChris Wilson /* 675112ed2d3SChris Wilson * Ensure irq handler finishes, and not run again. 676112ed2d3SChris Wilson * Also return the active request so that we only search for it once. 677112ed2d3SChris Wilson */ 678112ed2d3SChris Wilson static void reset_prepare_engine(struct intel_engine_cs *engine) 679112ed2d3SChris Wilson { 680112ed2d3SChris Wilson /* 681112ed2d3SChris Wilson * During the reset sequence, we must prevent the engine from 682112ed2d3SChris Wilson * entering RC6. As the context state is undefined until we restart 683112ed2d3SChris Wilson * the engine, if it does enter RC6 during the reset, the state 684112ed2d3SChris Wilson * written to the powercontext is undefined and so we may lose 685112ed2d3SChris Wilson * GPU state upon resume, i.e. fail to restart after a reset. 
686112ed2d3SChris Wilson */ 68779ffac85SChris Wilson intel_engine_pm_get(engine); 688112ed2d3SChris Wilson intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 689112ed2d3SChris Wilson engine->reset.prepare(engine); 690112ed2d3SChris Wilson } 691112ed2d3SChris Wilson 692112ed2d3SChris Wilson static void revoke_mmaps(struct drm_i915_private *i915) 693112ed2d3SChris Wilson { 694112ed2d3SChris Wilson int i; 695112ed2d3SChris Wilson 696112ed2d3SChris Wilson for (i = 0; i < i915->num_fence_regs; i++) { 697112ed2d3SChris Wilson struct drm_vma_offset_node *node; 698112ed2d3SChris Wilson struct i915_vma *vma; 699112ed2d3SChris Wilson u64 vma_offset; 700112ed2d3SChris Wilson 701112ed2d3SChris Wilson vma = READ_ONCE(i915->fence_regs[i].vma); 702112ed2d3SChris Wilson if (!vma) 703112ed2d3SChris Wilson continue; 704112ed2d3SChris Wilson 705112ed2d3SChris Wilson if (!i915_vma_has_userfault(vma)) 706112ed2d3SChris Wilson continue; 707112ed2d3SChris Wilson 708112ed2d3SChris Wilson GEM_BUG_ON(vma->fence != &i915->fence_regs[i]); 709112ed2d3SChris Wilson node = &vma->obj->base.vma_node; 710112ed2d3SChris Wilson vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 711112ed2d3SChris Wilson unmap_mapping_range(i915->drm.anon_inode->i_mapping, 712112ed2d3SChris Wilson drm_vma_node_offset_addr(node) + vma_offset, 713112ed2d3SChris Wilson vma->size, 714112ed2d3SChris Wilson 1); 715112ed2d3SChris Wilson } 716112ed2d3SChris Wilson } 717112ed2d3SChris Wilson 718112ed2d3SChris Wilson static void reset_prepare(struct drm_i915_private *i915) 719112ed2d3SChris Wilson { 720112ed2d3SChris Wilson struct intel_engine_cs *engine; 721112ed2d3SChris Wilson enum intel_engine_id id; 722112ed2d3SChris Wilson 72379ffac85SChris Wilson intel_gt_pm_get(i915); 724112ed2d3SChris Wilson for_each_engine(engine, i915, id) 725112ed2d3SChris Wilson reset_prepare_engine(engine); 726112ed2d3SChris Wilson 727112ed2d3SChris Wilson intel_uc_reset_prepare(i915); 728112ed2d3SChris Wilson } 729112ed2d3SChris 
Wilson 730112ed2d3SChris Wilson static void gt_revoke(struct drm_i915_private *i915) 731112ed2d3SChris Wilson { 732112ed2d3SChris Wilson revoke_mmaps(i915); 733112ed2d3SChris Wilson } 734112ed2d3SChris Wilson 735112ed2d3SChris Wilson static int gt_reset(struct drm_i915_private *i915, 736112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 737112ed2d3SChris Wilson { 738112ed2d3SChris Wilson struct intel_engine_cs *engine; 739112ed2d3SChris Wilson enum intel_engine_id id; 740112ed2d3SChris Wilson int err; 741112ed2d3SChris Wilson 742112ed2d3SChris Wilson /* 743112ed2d3SChris Wilson * Everything depends on having the GTT running, so we need to start 744112ed2d3SChris Wilson * there. 745112ed2d3SChris Wilson */ 746112ed2d3SChris Wilson err = i915_ggtt_enable_hw(i915); 747112ed2d3SChris Wilson if (err) 748112ed2d3SChris Wilson return err; 749112ed2d3SChris Wilson 750112ed2d3SChris Wilson for_each_engine(engine, i915, id) 751112ed2d3SChris Wilson intel_engine_reset(engine, stalled_mask & engine->mask); 752112ed2d3SChris Wilson 753112ed2d3SChris Wilson i915_gem_restore_fences(i915); 754112ed2d3SChris Wilson 755112ed2d3SChris Wilson return err; 756112ed2d3SChris Wilson } 757112ed2d3SChris Wilson 758112ed2d3SChris Wilson static void reset_finish_engine(struct intel_engine_cs *engine) 759112ed2d3SChris Wilson { 760112ed2d3SChris Wilson engine->reset.finish(engine); 76179ffac85SChris Wilson intel_engine_pm_put(engine); 762112ed2d3SChris Wilson intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 763112ed2d3SChris Wilson } 764112ed2d3SChris Wilson 765112ed2d3SChris Wilson static void reset_finish(struct drm_i915_private *i915) 766112ed2d3SChris Wilson { 767112ed2d3SChris Wilson struct intel_engine_cs *engine; 768112ed2d3SChris Wilson enum intel_engine_id id; 769112ed2d3SChris Wilson 770112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 771112ed2d3SChris Wilson reset_finish_engine(engine); 772112ed2d3SChris Wilson intel_engine_signal_breadcrumbs(engine); 
773112ed2d3SChris Wilson } 77479ffac85SChris Wilson intel_gt_pm_put(i915); 775112ed2d3SChris Wilson } 776112ed2d3SChris Wilson 777112ed2d3SChris Wilson static void nop_submit_request(struct i915_request *request) 778112ed2d3SChris Wilson { 779112ed2d3SChris Wilson struct intel_engine_cs *engine = request->engine; 780112ed2d3SChris Wilson unsigned long flags; 781112ed2d3SChris Wilson 782112ed2d3SChris Wilson GEM_TRACE("%s fence %llx:%lld -> -EIO\n", 783112ed2d3SChris Wilson engine->name, request->fence.context, request->fence.seqno); 784112ed2d3SChris Wilson dma_fence_set_error(&request->fence, -EIO); 785112ed2d3SChris Wilson 786112ed2d3SChris Wilson spin_lock_irqsave(&engine->timeline.lock, flags); 787112ed2d3SChris Wilson __i915_request_submit(request); 788112ed2d3SChris Wilson i915_request_mark_complete(request); 789112ed2d3SChris Wilson spin_unlock_irqrestore(&engine->timeline.lock, flags); 790112ed2d3SChris Wilson 791112ed2d3SChris Wilson intel_engine_queue_breadcrumbs(engine); 792112ed2d3SChris Wilson } 793112ed2d3SChris Wilson 794112ed2d3SChris Wilson static void __i915_gem_set_wedged(struct drm_i915_private *i915) 795112ed2d3SChris Wilson { 796112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 797112ed2d3SChris Wilson struct intel_engine_cs *engine; 798112ed2d3SChris Wilson enum intel_engine_id id; 799112ed2d3SChris Wilson 800112ed2d3SChris Wilson if (test_bit(I915_WEDGED, &error->flags)) 801112ed2d3SChris Wilson return; 802112ed2d3SChris Wilson 803112ed2d3SChris Wilson if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) { 804112ed2d3SChris Wilson struct drm_printer p = drm_debug_printer(__func__); 805112ed2d3SChris Wilson 806112ed2d3SChris Wilson for_each_engine(engine, i915, id) 807112ed2d3SChris Wilson intel_engine_dump(engine, &p, "%s\n", engine->name); 808112ed2d3SChris Wilson } 809112ed2d3SChris Wilson 810112ed2d3SChris Wilson GEM_TRACE("start\n"); 811112ed2d3SChris Wilson 812112ed2d3SChris Wilson /* 813112ed2d3SChris Wilson * 
First, stop submission to hw, but do not yet complete requests by 814112ed2d3SChris Wilson * rolling the global seqno forward (since this would complete requests 815112ed2d3SChris Wilson * for which we haven't set the fence error to EIO yet). 816112ed2d3SChris Wilson */ 817112ed2d3SChris Wilson reset_prepare(i915); 818112ed2d3SChris Wilson 819112ed2d3SChris Wilson /* Even if the GPU reset fails, it should still stop the engines */ 820112ed2d3SChris Wilson if (!INTEL_INFO(i915)->gpu_reset_clobbers_display) 821112ed2d3SChris Wilson intel_gpu_reset(i915, ALL_ENGINES); 822112ed2d3SChris Wilson 823112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 824112ed2d3SChris Wilson engine->submit_request = nop_submit_request; 825112ed2d3SChris Wilson engine->schedule = NULL; 826112ed2d3SChris Wilson } 827112ed2d3SChris Wilson i915->caps.scheduler = 0; 828112ed2d3SChris Wilson 829112ed2d3SChris Wilson /* 830112ed2d3SChris Wilson * Make sure no request can slip through without getting completed by 831112ed2d3SChris Wilson * either this call here to intel_engine_write_global_seqno, or the one 832112ed2d3SChris Wilson * in nop_submit_request. 
833112ed2d3SChris Wilson */ 834112ed2d3SChris Wilson synchronize_rcu_expedited(); 83579ffac85SChris Wilson set_bit(I915_WEDGED, &error->flags); 836112ed2d3SChris Wilson 837112ed2d3SChris Wilson /* Mark all executing requests as skipped */ 838112ed2d3SChris Wilson for_each_engine(engine, i915, id) 839112ed2d3SChris Wilson engine->cancel_requests(engine); 840112ed2d3SChris Wilson 841112ed2d3SChris Wilson reset_finish(i915); 842112ed2d3SChris Wilson 843112ed2d3SChris Wilson GEM_TRACE("end\n"); 844112ed2d3SChris Wilson } 845112ed2d3SChris Wilson 846112ed2d3SChris Wilson void i915_gem_set_wedged(struct drm_i915_private *i915) 847112ed2d3SChris Wilson { 848112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 849112ed2d3SChris Wilson intel_wakeref_t wakeref; 850112ed2d3SChris Wilson 851112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 852112ed2d3SChris Wilson with_intel_runtime_pm(i915, wakeref) 853112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 854112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 855112ed2d3SChris Wilson } 856112ed2d3SChris Wilson 857112ed2d3SChris Wilson static bool __i915_gem_unset_wedged(struct drm_i915_private *i915) 858112ed2d3SChris Wilson { 859112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 860112ed2d3SChris Wilson struct i915_timeline *tl; 861112ed2d3SChris Wilson 862112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 863112ed2d3SChris Wilson return true; 864112ed2d3SChris Wilson 865112ed2d3SChris Wilson if (!i915->gt.scratch) /* Never full initialised, recovery impossible */ 866112ed2d3SChris Wilson return false; 867112ed2d3SChris Wilson 868112ed2d3SChris Wilson GEM_TRACE("start\n"); 869112ed2d3SChris Wilson 870112ed2d3SChris Wilson /* 871112ed2d3SChris Wilson * Before unwedging, make sure that all pending operations 872112ed2d3SChris Wilson * are flushed and errored out - we may have requests waiting upon 873112ed2d3SChris Wilson * third party fences. 
We marked all inflight requests as EIO, and 874112ed2d3SChris Wilson * every execbuf since returned EIO, for consistency we want all 875112ed2d3SChris Wilson * the currently pending requests to also be marked as EIO, which 876112ed2d3SChris Wilson * is done inside our nop_submit_request - and so we must wait. 877112ed2d3SChris Wilson * 878112ed2d3SChris Wilson * No more can be submitted until we reset the wedged bit. 879112ed2d3SChris Wilson */ 880112ed2d3SChris Wilson mutex_lock(&i915->gt.timelines.mutex); 881112ed2d3SChris Wilson list_for_each_entry(tl, &i915->gt.timelines.active_list, link) { 882112ed2d3SChris Wilson struct i915_request *rq; 883112ed2d3SChris Wilson 884112ed2d3SChris Wilson rq = i915_active_request_get_unlocked(&tl->last_request); 885112ed2d3SChris Wilson if (!rq) 886112ed2d3SChris Wilson continue; 887112ed2d3SChris Wilson 888112ed2d3SChris Wilson /* 889112ed2d3SChris Wilson * All internal dependencies (i915_requests) will have 890112ed2d3SChris Wilson * been flushed by the set-wedge, but we may be stuck waiting 891112ed2d3SChris Wilson * for external fences. These should all be capped to 10s 892112ed2d3SChris Wilson * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 893112ed2d3SChris Wilson * in the worst case. 894112ed2d3SChris Wilson */ 895112ed2d3SChris Wilson dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT); 896112ed2d3SChris Wilson i915_request_put(rq); 897112ed2d3SChris Wilson } 898112ed2d3SChris Wilson mutex_unlock(&i915->gt.timelines.mutex); 899112ed2d3SChris Wilson 90079ffac85SChris Wilson intel_gt_sanitize(i915, false); 901112ed2d3SChris Wilson 902112ed2d3SChris Wilson /* 903112ed2d3SChris Wilson * Undo nop_submit_request. 
We prevent all new i915 requests from 904112ed2d3SChris Wilson * being queued (by disallowing execbuf whilst wedged) so having 905112ed2d3SChris Wilson * waited for all active requests above, we know the system is idle 906112ed2d3SChris Wilson * and do not have to worry about a thread being inside 907112ed2d3SChris Wilson * engine->submit_request() as we swap over. So unlike installing 908112ed2d3SChris Wilson * the nop_submit_request on reset, we can do this from normal 909112ed2d3SChris Wilson * context and do not require stop_machine(). 910112ed2d3SChris Wilson */ 911112ed2d3SChris Wilson intel_engines_reset_default_submission(i915); 912112ed2d3SChris Wilson 913112ed2d3SChris Wilson GEM_TRACE("end\n"); 914112ed2d3SChris Wilson 915112ed2d3SChris Wilson smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 916112ed2d3SChris Wilson clear_bit(I915_WEDGED, &i915->gpu_error.flags); 917112ed2d3SChris Wilson 918112ed2d3SChris Wilson return true; 919112ed2d3SChris Wilson } 920112ed2d3SChris Wilson 921112ed2d3SChris Wilson bool i915_gem_unset_wedged(struct drm_i915_private *i915) 922112ed2d3SChris Wilson { 923112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 924112ed2d3SChris Wilson bool result; 925112ed2d3SChris Wilson 926112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 927112ed2d3SChris Wilson result = __i915_gem_unset_wedged(i915); 928112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 929112ed2d3SChris Wilson 930112ed2d3SChris Wilson return result; 931112ed2d3SChris Wilson } 932112ed2d3SChris Wilson 933112ed2d3SChris Wilson static int do_reset(struct drm_i915_private *i915, 934112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 935112ed2d3SChris Wilson { 936112ed2d3SChris Wilson int err, i; 937112ed2d3SChris Wilson 938112ed2d3SChris Wilson gt_revoke(i915); 939112ed2d3SChris Wilson 940112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 941112ed2d3SChris Wilson for (i = 0; err && i < RESET_MAX_RETRIES; 
i++) { 942112ed2d3SChris Wilson msleep(10 * (i + 1)); 943112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 944112ed2d3SChris Wilson } 945112ed2d3SChris Wilson if (err) 946112ed2d3SChris Wilson return err; 947112ed2d3SChris Wilson 948112ed2d3SChris Wilson return gt_reset(i915, stalled_mask); 949112ed2d3SChris Wilson } 950112ed2d3SChris Wilson 951112ed2d3SChris Wilson /** 952112ed2d3SChris Wilson * i915_reset - reset chip after a hang 953112ed2d3SChris Wilson * @i915: #drm_i915_private to reset 954112ed2d3SChris Wilson * @stalled_mask: mask of the stalled engines with the guilty requests 955112ed2d3SChris Wilson * @reason: user error message for why we are resetting 956112ed2d3SChris Wilson * 957112ed2d3SChris Wilson * Reset the chip. Useful if a hang is detected. Marks the device as wedged 958112ed2d3SChris Wilson * on failure. 959112ed2d3SChris Wilson * 960112ed2d3SChris Wilson * Procedure is fairly simple: 961112ed2d3SChris Wilson * - reset the chip using the reset reg 962112ed2d3SChris Wilson * - re-init context state 963112ed2d3SChris Wilson * - re-init hardware status page 964112ed2d3SChris Wilson * - re-init ring buffer 965112ed2d3SChris Wilson * - re-init interrupt state 966112ed2d3SChris Wilson * - re-init display 967112ed2d3SChris Wilson */ 968112ed2d3SChris Wilson void i915_reset(struct drm_i915_private *i915, 969112ed2d3SChris Wilson intel_engine_mask_t stalled_mask, 970112ed2d3SChris Wilson const char *reason) 971112ed2d3SChris Wilson { 972112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 973112ed2d3SChris Wilson int ret; 974112ed2d3SChris Wilson 975112ed2d3SChris Wilson GEM_TRACE("flags=%lx\n", error->flags); 976112ed2d3SChris Wilson 977112ed2d3SChris Wilson might_sleep(); 978112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); 979112ed2d3SChris Wilson 980112ed2d3SChris Wilson /* Clear any previous failed attempts at recovery. Time to try again. 
*/ 981112ed2d3SChris Wilson if (!__i915_gem_unset_wedged(i915)) 982112ed2d3SChris Wilson return; 983112ed2d3SChris Wilson 984112ed2d3SChris Wilson if (reason) 985112ed2d3SChris Wilson dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); 986112ed2d3SChris Wilson error->reset_count++; 987112ed2d3SChris Wilson 988112ed2d3SChris Wilson reset_prepare(i915); 989112ed2d3SChris Wilson 990112ed2d3SChris Wilson if (!intel_has_gpu_reset(i915)) { 991112ed2d3SChris Wilson if (i915_modparams.reset) 992112ed2d3SChris Wilson dev_err(i915->drm.dev, "GPU reset not supported\n"); 993112ed2d3SChris Wilson else 994112ed2d3SChris Wilson DRM_DEBUG_DRIVER("GPU reset disabled\n"); 995112ed2d3SChris Wilson goto error; 996112ed2d3SChris Wilson } 997112ed2d3SChris Wilson 998112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 999112ed2d3SChris Wilson intel_runtime_pm_disable_interrupts(i915); 1000112ed2d3SChris Wilson 1001112ed2d3SChris Wilson if (do_reset(i915, stalled_mask)) { 1002112ed2d3SChris Wilson dev_err(i915->drm.dev, "Failed to reset chip\n"); 1003112ed2d3SChris Wilson goto taint; 1004112ed2d3SChris Wilson } 1005112ed2d3SChris Wilson 1006112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 1007112ed2d3SChris Wilson intel_runtime_pm_enable_interrupts(i915); 1008112ed2d3SChris Wilson 1009112ed2d3SChris Wilson intel_overlay_reset(i915); 1010112ed2d3SChris Wilson 1011112ed2d3SChris Wilson /* 1012112ed2d3SChris Wilson * Next we need to restore the context, but we don't use those 1013112ed2d3SChris Wilson * yet either... 1014112ed2d3SChris Wilson * 1015112ed2d3SChris Wilson * Ring buffer needs to be re-initialized in the KMS case, or if X 1016112ed2d3SChris Wilson * was running at the time of the reset (i.e. we weren't VT 1017112ed2d3SChris Wilson * switched away). 
1018112ed2d3SChris Wilson */ 1019112ed2d3SChris Wilson ret = i915_gem_init_hw(i915); 1020112ed2d3SChris Wilson if (ret) { 1021112ed2d3SChris Wilson DRM_ERROR("Failed to initialise HW following reset (%d)\n", 1022112ed2d3SChris Wilson ret); 1023112ed2d3SChris Wilson goto error; 1024112ed2d3SChris Wilson } 1025112ed2d3SChris Wilson 1026112ed2d3SChris Wilson i915_queue_hangcheck(i915); 1027112ed2d3SChris Wilson 1028112ed2d3SChris Wilson finish: 1029112ed2d3SChris Wilson reset_finish(i915); 1030112ed2d3SChris Wilson return; 1031112ed2d3SChris Wilson 1032112ed2d3SChris Wilson taint: 1033112ed2d3SChris Wilson /* 1034112ed2d3SChris Wilson * History tells us that if we cannot reset the GPU now, we 1035112ed2d3SChris Wilson * never will. This then impacts everything that is run 1036112ed2d3SChris Wilson * subsequently. On failing the reset, we mark the driver 1037112ed2d3SChris Wilson * as wedged, preventing further execution on the GPU. 1038112ed2d3SChris Wilson * We also want to go one step further and add a taint to the 1039112ed2d3SChris Wilson * kernel so that any subsequent faults can be traced back to 1040112ed2d3SChris Wilson * this failure. This is important for CI, where if the 1041112ed2d3SChris Wilson * GPU/driver fails we would like to reboot and restart testing 1042112ed2d3SChris Wilson * rather than continue on into oblivion. For everyone else, 1043112ed2d3SChris Wilson * the system should still plod along, but they have been warned! 
1044112ed2d3SChris Wilson */ 104518ecc6c5SChris Wilson add_taint_for_CI(TAINT_WARN); 1046112ed2d3SChris Wilson error: 1047112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 1048112ed2d3SChris Wilson goto finish; 1049112ed2d3SChris Wilson } 1050112ed2d3SChris Wilson 1051112ed2d3SChris Wilson static inline int intel_gt_reset_engine(struct drm_i915_private *i915, 1052112ed2d3SChris Wilson struct intel_engine_cs *engine) 1053112ed2d3SChris Wilson { 1054112ed2d3SChris Wilson return intel_gpu_reset(i915, engine->mask); 1055112ed2d3SChris Wilson } 1056112ed2d3SChris Wilson 1057112ed2d3SChris Wilson /** 1058112ed2d3SChris Wilson * i915_reset_engine - reset GPU engine to recover from a hang 1059112ed2d3SChris Wilson * @engine: engine to reset 1060112ed2d3SChris Wilson * @msg: reason for GPU reset; or NULL for no dev_notice() 1061112ed2d3SChris Wilson * 1062112ed2d3SChris Wilson * Reset a specific GPU engine. Useful if a hang is detected. 1063112ed2d3SChris Wilson * Returns zero on successful reset or otherwise an error code. 
1064112ed2d3SChris Wilson * 1065112ed2d3SChris Wilson * Procedure is: 1066112ed2d3SChris Wilson * - identifies the request that caused the hang and it is dropped 1067112ed2d3SChris Wilson * - reset engine (which will force the engine to idle) 1068112ed2d3SChris Wilson * - re-init/configure engine 1069112ed2d3SChris Wilson */ 1070112ed2d3SChris Wilson int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) 1071112ed2d3SChris Wilson { 1072112ed2d3SChris Wilson struct i915_gpu_error *error = &engine->i915->gpu_error; 1073112ed2d3SChris Wilson int ret; 1074112ed2d3SChris Wilson 1075112ed2d3SChris Wilson GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); 1076112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); 1077112ed2d3SChris Wilson 107879ffac85SChris Wilson if (!intel_wakeref_active(&engine->wakeref)) 107979ffac85SChris Wilson return 0; 108079ffac85SChris Wilson 1081112ed2d3SChris Wilson reset_prepare_engine(engine); 1082112ed2d3SChris Wilson 1083112ed2d3SChris Wilson if (msg) 1084112ed2d3SChris Wilson dev_notice(engine->i915->drm.dev, 1085112ed2d3SChris Wilson "Resetting %s for %s\n", engine->name, msg); 1086112ed2d3SChris Wilson error->reset_engine_count[engine->id]++; 1087112ed2d3SChris Wilson 1088112ed2d3SChris Wilson if (!engine->i915->guc.execbuf_client) 1089112ed2d3SChris Wilson ret = intel_gt_reset_engine(engine->i915, engine); 1090112ed2d3SChris Wilson else 1091112ed2d3SChris Wilson ret = intel_guc_reset_engine(&engine->i915->guc, engine); 1092112ed2d3SChris Wilson if (ret) { 1093112ed2d3SChris Wilson /* If we fail here, we expect to fallback to a global reset */ 1094112ed2d3SChris Wilson DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n", 1095112ed2d3SChris Wilson engine->i915->guc.execbuf_client ? 
"GuC " : "", 1096112ed2d3SChris Wilson engine->name, ret); 1097112ed2d3SChris Wilson goto out; 1098112ed2d3SChris Wilson } 1099112ed2d3SChris Wilson 1100112ed2d3SChris Wilson /* 1101112ed2d3SChris Wilson * The request that caused the hang is stuck on elsp, we know the 1102112ed2d3SChris Wilson * active request and can drop it, adjust head to skip the offending 1103112ed2d3SChris Wilson * request to resume executing remaining requests in the queue. 1104112ed2d3SChris Wilson */ 1105112ed2d3SChris Wilson intel_engine_reset(engine, true); 1106112ed2d3SChris Wilson 1107112ed2d3SChris Wilson /* 1108112ed2d3SChris Wilson * The engine and its registers (and workarounds in case of render) 1109112ed2d3SChris Wilson * have been reset to their default values. Follow the init_ring 1110112ed2d3SChris Wilson * process to program RING_MODE, HWSP and re-enable submission. 1111112ed2d3SChris Wilson */ 111279ffac85SChris Wilson ret = engine->resume(engine); 1113112ed2d3SChris Wilson if (ret) 1114112ed2d3SChris Wilson goto out; 1115112ed2d3SChris Wilson 1116112ed2d3SChris Wilson out: 1117112ed2d3SChris Wilson intel_engine_cancel_stop_cs(engine); 1118112ed2d3SChris Wilson reset_finish_engine(engine); 1119112ed2d3SChris Wilson return ret; 1120112ed2d3SChris Wilson } 1121112ed2d3SChris Wilson 1122112ed2d3SChris Wilson static void i915_reset_device(struct drm_i915_private *i915, 1123112ed2d3SChris Wilson u32 engine_mask, 1124112ed2d3SChris Wilson const char *reason) 1125112ed2d3SChris Wilson { 1126112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1127112ed2d3SChris Wilson struct kobject *kobj = &i915->drm.primary->kdev->kobj; 1128112ed2d3SChris Wilson char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1129112ed2d3SChris Wilson char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1130112ed2d3SChris Wilson char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1131112ed2d3SChris Wilson struct i915_wedge_me w; 1132112ed2d3SChris Wilson 1133112ed2d3SChris 
Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1134112ed2d3SChris Wilson 1135112ed2d3SChris Wilson DRM_DEBUG_DRIVER("resetting chip\n"); 1136112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1137112ed2d3SChris Wilson 1138112ed2d3SChris Wilson /* Use a watchdog to ensure that our reset completes */ 1139112ed2d3SChris Wilson i915_wedge_on_timeout(&w, i915, 5 * HZ) { 1140112ed2d3SChris Wilson intel_prepare_reset(i915); 1141112ed2d3SChris Wilson 1142112ed2d3SChris Wilson /* Flush everyone using a resource about to be clobbered */ 1143112ed2d3SChris Wilson synchronize_srcu_expedited(&error->reset_backoff_srcu); 1144112ed2d3SChris Wilson 1145112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 1146112ed2d3SChris Wilson i915_reset(i915, engine_mask, reason); 1147112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 1148112ed2d3SChris Wilson 1149112ed2d3SChris Wilson intel_finish_reset(i915); 1150112ed2d3SChris Wilson } 1151112ed2d3SChris Wilson 1152112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 1153112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1154112ed2d3SChris Wilson } 1155112ed2d3SChris Wilson 1156112ed2d3SChris Wilson static void clear_register(struct intel_uncore *uncore, i915_reg_t reg) 1157112ed2d3SChris Wilson { 1158112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, 0, 0); 1159112ed2d3SChris Wilson } 1160112ed2d3SChris Wilson 1161112ed2d3SChris Wilson void i915_clear_error_registers(struct drm_i915_private *i915) 1162112ed2d3SChris Wilson { 1163112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 1164112ed2d3SChris Wilson u32 eir; 1165112ed2d3SChris Wilson 1166112ed2d3SChris Wilson if (!IS_GEN(i915, 2)) 1167112ed2d3SChris Wilson clear_register(uncore, PGTBL_ER); 1168112ed2d3SChris Wilson 1169112ed2d3SChris Wilson if (INTEL_GEN(i915) < 4) 1170112ed2d3SChris Wilson clear_register(uncore, IPEIR(RENDER_RING_BASE)); 1171112ed2d3SChris Wilson else 
1172112ed2d3SChris Wilson clear_register(uncore, IPEIR_I965); 1173112ed2d3SChris Wilson 1174112ed2d3SChris Wilson clear_register(uncore, EIR); 1175112ed2d3SChris Wilson eir = intel_uncore_read(uncore, EIR); 1176112ed2d3SChris Wilson if (eir) { 1177112ed2d3SChris Wilson /* 1178112ed2d3SChris Wilson * some errors might have become stuck, 1179112ed2d3SChris Wilson * mask them. 1180112ed2d3SChris Wilson */ 1181112ed2d3SChris Wilson DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir); 1182112ed2d3SChris Wilson rmw_set(uncore, EMR, eir); 1183112ed2d3SChris Wilson intel_uncore_write(uncore, GEN2_IIR, 1184112ed2d3SChris Wilson I915_MASTER_ERROR_INTERRUPT); 1185112ed2d3SChris Wilson } 1186112ed2d3SChris Wilson 1187112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 8) { 1188112ed2d3SChris Wilson rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID); 1189112ed2d3SChris Wilson intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG); 1190112ed2d3SChris Wilson } else if (INTEL_GEN(i915) >= 6) { 1191112ed2d3SChris Wilson struct intel_engine_cs *engine; 1192112ed2d3SChris Wilson enum intel_engine_id id; 1193112ed2d3SChris Wilson 1194112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 1195112ed2d3SChris Wilson rmw_clear(uncore, 1196112ed2d3SChris Wilson RING_FAULT_REG(engine), RING_FAULT_VALID); 1197112ed2d3SChris Wilson intel_uncore_posting_read(uncore, 1198112ed2d3SChris Wilson RING_FAULT_REG(engine)); 1199112ed2d3SChris Wilson } 1200112ed2d3SChris Wilson } 1201112ed2d3SChris Wilson } 1202112ed2d3SChris Wilson 1203112ed2d3SChris Wilson /** 1204112ed2d3SChris Wilson * i915_handle_error - handle a gpu error 1205112ed2d3SChris Wilson * @i915: i915 device private 1206112ed2d3SChris Wilson * @engine_mask: mask representing engines that are hung 1207112ed2d3SChris Wilson * @flags: control flags 1208112ed2d3SChris Wilson * @fmt: Error message format string 1209112ed2d3SChris Wilson * 1210112ed2d3SChris Wilson * Do some basic checking of register state at error time and 
1211112ed2d3SChris Wilson * dump it to the syslog. Also call i915_capture_error_state() to make 1212112ed2d3SChris Wilson * sure we get a record and make it available in debugfs. Fire a uevent 1213112ed2d3SChris Wilson * so userspace knows something bad happened (should trigger collection 1214112ed2d3SChris Wilson * of a ring dump etc.). 1215112ed2d3SChris Wilson */ 1216112ed2d3SChris Wilson void i915_handle_error(struct drm_i915_private *i915, 1217112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 1218112ed2d3SChris Wilson unsigned long flags, 1219112ed2d3SChris Wilson const char *fmt, ...) 1220112ed2d3SChris Wilson { 1221112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1222112ed2d3SChris Wilson struct intel_engine_cs *engine; 1223112ed2d3SChris Wilson intel_wakeref_t wakeref; 1224112ed2d3SChris Wilson intel_engine_mask_t tmp; 1225112ed2d3SChris Wilson char error_msg[80]; 1226112ed2d3SChris Wilson char *msg = NULL; 1227112ed2d3SChris Wilson 1228112ed2d3SChris Wilson if (fmt) { 1229112ed2d3SChris Wilson va_list args; 1230112ed2d3SChris Wilson 1231112ed2d3SChris Wilson va_start(args, fmt); 1232112ed2d3SChris Wilson vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1233112ed2d3SChris Wilson va_end(args); 1234112ed2d3SChris Wilson 1235112ed2d3SChris Wilson msg = error_msg; 1236112ed2d3SChris Wilson } 1237112ed2d3SChris Wilson 1238112ed2d3SChris Wilson /* 1239112ed2d3SChris Wilson * In most cases it's guaranteed that we get here with an RPM 1240112ed2d3SChris Wilson * reference held, for example because there is a pending GPU 1241112ed2d3SChris Wilson * request that won't finish until the reset is done. This 1242112ed2d3SChris Wilson * isn't the case at least when we get here by doing a 1243112ed2d3SChris Wilson * simulated reset via debugfs, so get an RPM reference. 
	 */
	wakeref = intel_runtime_pm_get(i915);

	/* Ignore reset requests for engines this device does not have. */
	engine_mask &= INTEL_INFO(i915)->engine_mask;

	/* Snapshot the error state (and clear the sticky error registers)
	 * before we start touching the hardware, but only if the caller
	 * asked for a capture.
	 */
	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(i915, engine_mask, msg);
		i915_clear_error_registers(i915);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
		for_each_engine_masked(engine, i915, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			/* Skip engines where a per-engine reset is already
			 * in flight; the concurrent resetter owns this bit.
			 */
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &error->flags))
				continue;

			/* On success, drop the engine from the mask so it
			 * does not force a full-device reset below.
			 */
			if (i915_reset_engine(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_bit(I915_RESET_ENGINE + engine->id,
				  &error->flags);
			wake_up_bit(&error->flags,
				    I915_RESET_ENGINE + engine->id);
		}
	}

	/* All requested engines recovered individually; no full reset needed. */
	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
		wait_event(error->reset_queue,
			   !test_bit(I915_RESET_BACKOFF, &error->flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, i915, tmp) {
		/* Wait for (and then claim) each per-engine reset bit so no
		 * engine reset can run concurrently with the device reset.
		 */
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&error->flags))
			wait_on_bit(&error->flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	i915_reset_device(i915, engine_mask, msg);

	/* Release the per-engine claims taken above. */
	for_each_engine(engine, i915, tmp) {
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &error->flags);
	}

	/* Lift the backoff and wake everyone parked on reset_queue. */
	clear_bit(I915_RESET_BACKOFF, &error->flags);
	wake_up_all(&error->reset_queue);

out:
	intel_runtime_pm_put(i915, wakeref);
}

/**
 * i915_reset_trylock - enter the reset-backoff SRCU read-side section
 * @i915: the i915 device
 *
 * Waits (interruptibly) while a full device reset is pending
 * (I915_RESET_BACKOFF set), then takes the reset_backoff_srcu read lock.
 * The rcu_read_lock() around the flag test pairs with the
 * synchronize_rcu_expedited() in the reset path above, closing the race
 * between observing the flag clear and entering the SRCU section.
 *
 * Returns: a non-negative SRCU tag to pass to i915_reset_unlock(), or
 * -EINTR if interrupted while waiting for the pending reset.
 */
int i915_reset_trylock(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;
	int srcu;

	might_lock(&error->reset_backoff_srcu);
	might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
		rcu_read_unlock();

		if (wait_event_interruptible(error->reset_queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &error->flags)))
			return -EINTR;

		rcu_read_lock();
	}
	srcu = srcu_read_lock(&error->reset_backoff_srcu);
	rcu_read_unlock();

	return srcu;
}

/**
 * i915_reset_unlock - exit the reset-backoff SRCU read-side section
 * @i915: the i915 device
 * @tag: the SRCU tag returned by i915_reset_trylock()
 */
void i915_reset_unlock(struct drm_i915_private *i915, int tag)
__releases(&i915->gpu_error.reset_backoff_srcu)
{
	struct i915_gpu_error *error = &i915->gpu_error;

	srcu_read_unlock(&error->reset_backoff_srcu, tag);
}

/**
 * i915_terminally_wedged - report whether the device is beyond recovery
 * @i915: the i915 device
 *
 * Returns: 0 if the device is not wedged; -EAGAIN if a reset is still in
 * progress but we cannot wait for it here (struct_mutex is held, and the
 * reset completion itself needs struct_mutex — see the XXX below); -EINTR
 * if interrupted while waiting for the reset; -EIO if the device remains
 * wedged after the reset completed (terminal).
 */
int i915_terminally_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *error = &i915->gpu_error;

	might_sleep();

	if (!__i915_wedged(error))
		return 0;

	/* Reset still in progress? Maybe we will recover? */
	if (!test_bit(I915_RESET_BACKOFF, &error->flags))
		return -EIO;

	/* XXX intel_reset_finish() still takes struct_mutex!!! */
	if (mutex_is_locked(&i915->drm.struct_mutex))
		return -EAGAIN;

	if (wait_event_interruptible(error->reset_queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &error->flags)))
		return -EINTR;

	return __i915_wedged(error) ? -EIO : 0;
}

/*
 * Delayed-work callback armed by __i915_init_wedge(): if the guarded
 * operation does not finish before the timeout, declare the GPU wedged
 * and cancel all in-flight rendering.
 */
static void i915_wedge_me(struct work_struct *work)
{
	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);

	dev_err(w->i915->drm.dev,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	i915_gem_set_wedged(w->i915);
}

/**
 * __i915_init_wedge - arm an on-stack "wedge the GPU on timeout" watchdog
 * @w: on-stack watchdog state
 * @i915: the i915 device
 * @timeout: delay (in jiffies) before declaring the device wedged
 * @name: label reported in the timeout error message
 *
 * Must be paired with __i915_fini_wedge() before @w goes out of scope.
 */
void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name)
{
	w->i915 = i915;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

/**
 * __i915_fini_wedge - disarm the watchdog set up by __i915_init_wedge()
 * @w: on-stack watchdog state
 *
 * Synchronously cancels the delayed work (waiting for a concurrently
 * running i915_wedge_me() to finish) before the on-stack work item is
 * destroyed.
 */
void __i915_fini_wedge(struct i915_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

/* Compile the reset selftests into this translation unit so they can
 * exercise the static helpers above.
 */
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_reset.c"
#endif