1112ed2d3SChris Wilson /* 2112ed2d3SChris Wilson * SPDX-License-Identifier: MIT 3112ed2d3SChris Wilson * 4112ed2d3SChris Wilson * Copyright © 2008-2018 Intel Corporation 5112ed2d3SChris Wilson */ 6112ed2d3SChris Wilson 7112ed2d3SChris Wilson #include <linux/sched/mm.h> 8112ed2d3SChris Wilson #include <linux/stop_machine.h> 9112ed2d3SChris Wilson 1010be98a7SChris Wilson #include "gem/i915_gem_context.h" 1110be98a7SChris Wilson 12112ed2d3SChris Wilson #include "i915_drv.h" 13112ed2d3SChris Wilson #include "i915_gpu_error.h" 14440e2b3dSJani Nikula #include "i915_irq.h" 1579ffac85SChris Wilson #include "intel_engine_pm.h" 1679ffac85SChris Wilson #include "intel_gt_pm.h" 17112ed2d3SChris Wilson #include "intel_reset.h" 18112ed2d3SChris Wilson 19112ed2d3SChris Wilson #include "intel_guc.h" 2005ca9306SJani Nikula #include "intel_overlay.h" 21112ed2d3SChris Wilson 22112ed2d3SChris Wilson #define RESET_MAX_RETRIES 3 23112ed2d3SChris Wilson 24112ed2d3SChris Wilson /* XXX How to handle concurrent GGTT updates using tiling registers? 
*/ 25112ed2d3SChris Wilson #define RESET_UNDER_STOP_MACHINE 0 26112ed2d3SChris Wilson 27112ed2d3SChris Wilson static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 28112ed2d3SChris Wilson { 29112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, 0, set); 30112ed2d3SChris Wilson } 31112ed2d3SChris Wilson 32112ed2d3SChris Wilson static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 33112ed2d3SChris Wilson { 34112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, clr, 0); 35112ed2d3SChris Wilson } 36112ed2d3SChris Wilson 37112ed2d3SChris Wilson static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 38112ed2d3SChris Wilson { 39112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, 0, set); 40112ed2d3SChris Wilson } 41112ed2d3SChris Wilson 42112ed2d3SChris Wilson static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 43112ed2d3SChris Wilson { 44112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, clr, 0); 45112ed2d3SChris Wilson } 46112ed2d3SChris Wilson 47112ed2d3SChris Wilson static void engine_skip_context(struct i915_request *rq) 48112ed2d3SChris Wilson { 49112ed2d3SChris Wilson struct intel_engine_cs *engine = rq->engine; 50112ed2d3SChris Wilson struct i915_gem_context *hung_ctx = rq->gem_context; 51112ed2d3SChris Wilson 52112ed2d3SChris Wilson lockdep_assert_held(&engine->timeline.lock); 53112ed2d3SChris Wilson 54112ed2d3SChris Wilson if (!i915_request_is_active(rq)) 55112ed2d3SChris Wilson return; 56112ed2d3SChris Wilson 57112ed2d3SChris Wilson list_for_each_entry_continue(rq, &engine->timeline.requests, link) 58112ed2d3SChris Wilson if (rq->gem_context == hung_ctx) 59112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 60112ed2d3SChris Wilson } 61112ed2d3SChris Wilson 62112ed2d3SChris Wilson static void client_mark_guilty(struct drm_i915_file_private *file_priv, 63112ed2d3SChris Wilson const struct i915_gem_context *ctx) 64112ed2d3SChris Wilson { 65112ed2d3SChris Wilson unsigned 
int score; 66112ed2d3SChris Wilson unsigned long prev_hang; 67112ed2d3SChris Wilson 68112ed2d3SChris Wilson if (i915_gem_context_is_banned(ctx)) 69112ed2d3SChris Wilson score = I915_CLIENT_SCORE_CONTEXT_BAN; 70112ed2d3SChris Wilson else 71112ed2d3SChris Wilson score = 0; 72112ed2d3SChris Wilson 73112ed2d3SChris Wilson prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 74112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 75112ed2d3SChris Wilson score += I915_CLIENT_SCORE_HANG_FAST; 76112ed2d3SChris Wilson 77112ed2d3SChris Wilson if (score) { 78112ed2d3SChris Wilson atomic_add(score, &file_priv->ban_score); 79112ed2d3SChris Wilson 80112ed2d3SChris Wilson DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n", 81112ed2d3SChris Wilson ctx->name, score, 82112ed2d3SChris Wilson atomic_read(&file_priv->ban_score)); 83112ed2d3SChris Wilson } 84112ed2d3SChris Wilson } 85112ed2d3SChris Wilson 86112ed2d3SChris Wilson static bool context_mark_guilty(struct i915_gem_context *ctx) 87112ed2d3SChris Wilson { 88112ed2d3SChris Wilson unsigned long prev_hang; 89112ed2d3SChris Wilson bool banned; 90112ed2d3SChris Wilson int i; 91112ed2d3SChris Wilson 92112ed2d3SChris Wilson atomic_inc(&ctx->guilty_count); 93112ed2d3SChris Wilson 94112ed2d3SChris Wilson /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 95112ed2d3SChris Wilson if (!i915_gem_context_is_bannable(ctx)) 96112ed2d3SChris Wilson return false; 97112ed2d3SChris Wilson 98112ed2d3SChris Wilson /* Record the timestamp for the last N hangs */ 99112ed2d3SChris Wilson prev_hang = ctx->hang_timestamp[0]; 100112ed2d3SChris Wilson for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 101112ed2d3SChris Wilson ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 102112ed2d3SChris Wilson ctx->hang_timestamp[i] = jiffies; 103112ed2d3SChris Wilson 104112ed2d3SChris Wilson /* If we have hung N+1 times in rapid succession, we ban the context! 
*/ 105112ed2d3SChris Wilson banned = !i915_gem_context_is_recoverable(ctx); 106112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 107112ed2d3SChris Wilson banned = true; 108112ed2d3SChris Wilson if (banned) { 109112ed2d3SChris Wilson DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", 110112ed2d3SChris Wilson ctx->name, atomic_read(&ctx->guilty_count)); 111112ed2d3SChris Wilson i915_gem_context_set_banned(ctx); 112112ed2d3SChris Wilson } 113112ed2d3SChris Wilson 114112ed2d3SChris Wilson if (!IS_ERR_OR_NULL(ctx->file_priv)) 115112ed2d3SChris Wilson client_mark_guilty(ctx->file_priv, ctx); 116112ed2d3SChris Wilson 117112ed2d3SChris Wilson return banned; 118112ed2d3SChris Wilson } 119112ed2d3SChris Wilson 120112ed2d3SChris Wilson static void context_mark_innocent(struct i915_gem_context *ctx) 121112ed2d3SChris Wilson { 122112ed2d3SChris Wilson atomic_inc(&ctx->active_count); 123112ed2d3SChris Wilson } 124112ed2d3SChris Wilson 125112ed2d3SChris Wilson void i915_reset_request(struct i915_request *rq, bool guilty) 126112ed2d3SChris Wilson { 127112ed2d3SChris Wilson GEM_TRACE("%s rq=%llx:%lld, guilty? 
%s\n", 128112ed2d3SChris Wilson rq->engine->name, 129112ed2d3SChris Wilson rq->fence.context, 130112ed2d3SChris Wilson rq->fence.seqno, 131112ed2d3SChris Wilson yesno(guilty)); 132112ed2d3SChris Wilson 133112ed2d3SChris Wilson lockdep_assert_held(&rq->engine->timeline.lock); 134112ed2d3SChris Wilson GEM_BUG_ON(i915_request_completed(rq)); 135112ed2d3SChris Wilson 136112ed2d3SChris Wilson if (guilty) { 137112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 138112ed2d3SChris Wilson if (context_mark_guilty(rq->gem_context)) 139112ed2d3SChris Wilson engine_skip_context(rq); 140112ed2d3SChris Wilson } else { 141112ed2d3SChris Wilson dma_fence_set_error(&rq->fence, -EAGAIN); 142112ed2d3SChris Wilson context_mark_innocent(rq->gem_context); 143112ed2d3SChris Wilson } 144112ed2d3SChris Wilson } 145112ed2d3SChris Wilson 146112ed2d3SChris Wilson static void gen3_stop_engine(struct intel_engine_cs *engine) 147112ed2d3SChris Wilson { 148112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 149112ed2d3SChris Wilson const u32 base = engine->mmio_base; 150112ed2d3SChris Wilson 151112ed2d3SChris Wilson GEM_TRACE("%s\n", engine->name); 152112ed2d3SChris Wilson 153112ed2d3SChris Wilson if (intel_engine_stop_cs(engine)) 154112ed2d3SChris Wilson GEM_TRACE("%s: timed out on STOP_RING\n", engine->name); 155112ed2d3SChris Wilson 156112ed2d3SChris Wilson intel_uncore_write_fw(uncore, 157112ed2d3SChris Wilson RING_HEAD(base), 158112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_TAIL(base))); 159112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */ 160112ed2d3SChris Wilson 161112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_HEAD(base), 0); 162112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_TAIL(base), 0); 163112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_TAIL(base)); 164112ed2d3SChris Wilson 165112ed2d3SChris Wilson /* The ring must be empty before it is disabled */ 166112ed2d3SChris Wilson 
intel_uncore_write_fw(uncore, RING_CTL(base), 0); 167112ed2d3SChris Wilson 168112ed2d3SChris Wilson /* Check acts as a post */ 169112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, RING_HEAD(base))) 170112ed2d3SChris Wilson GEM_TRACE("%s: ring head [%x] not parked\n", 171112ed2d3SChris Wilson engine->name, 172112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_HEAD(base))); 173112ed2d3SChris Wilson } 174112ed2d3SChris Wilson 175112ed2d3SChris Wilson static void i915_stop_engines(struct drm_i915_private *i915, 176112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 177112ed2d3SChris Wilson { 178112ed2d3SChris Wilson struct intel_engine_cs *engine; 179112ed2d3SChris Wilson intel_engine_mask_t tmp; 180112ed2d3SChris Wilson 181112ed2d3SChris Wilson if (INTEL_GEN(i915) < 3) 182112ed2d3SChris Wilson return; 183112ed2d3SChris Wilson 184112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 185112ed2d3SChris Wilson gen3_stop_engine(engine); 186112ed2d3SChris Wilson } 187112ed2d3SChris Wilson 188112ed2d3SChris Wilson static bool i915_in_reset(struct pci_dev *pdev) 189112ed2d3SChris Wilson { 190112ed2d3SChris Wilson u8 gdrst; 191112ed2d3SChris Wilson 192112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 193112ed2d3SChris Wilson return gdrst & GRDOM_RESET_STATUS; 194112ed2d3SChris Wilson } 195112ed2d3SChris Wilson 196112ed2d3SChris Wilson static int i915_do_reset(struct drm_i915_private *i915, 197112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 198112ed2d3SChris Wilson unsigned int retry) 199112ed2d3SChris Wilson { 200112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 201112ed2d3SChris Wilson int err; 202112ed2d3SChris Wilson 203112ed2d3SChris Wilson /* Assert reset for at least 20 usec, and wait for acknowledgement. 
*/ 204112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 205112ed2d3SChris Wilson udelay(50); 206112ed2d3SChris Wilson err = wait_for_atomic(i915_in_reset(pdev), 50); 207112ed2d3SChris Wilson 208112ed2d3SChris Wilson /* Clear the reset request. */ 209112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 210112ed2d3SChris Wilson udelay(50); 211112ed2d3SChris Wilson if (!err) 212112ed2d3SChris Wilson err = wait_for_atomic(!i915_in_reset(pdev), 50); 213112ed2d3SChris Wilson 214112ed2d3SChris Wilson return err; 215112ed2d3SChris Wilson } 216112ed2d3SChris Wilson 217112ed2d3SChris Wilson static bool g4x_reset_complete(struct pci_dev *pdev) 218112ed2d3SChris Wilson { 219112ed2d3SChris Wilson u8 gdrst; 220112ed2d3SChris Wilson 221112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 222112ed2d3SChris Wilson return (gdrst & GRDOM_RESET_ENABLE) == 0; 223112ed2d3SChris Wilson } 224112ed2d3SChris Wilson 225112ed2d3SChris Wilson static int g33_do_reset(struct drm_i915_private *i915, 226112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 227112ed2d3SChris Wilson unsigned int retry) 228112ed2d3SChris Wilson { 229112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 230112ed2d3SChris Wilson 231112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 232112ed2d3SChris Wilson return wait_for_atomic(g4x_reset_complete(pdev), 50); 233112ed2d3SChris Wilson } 234112ed2d3SChris Wilson 235112ed2d3SChris Wilson static int g4x_do_reset(struct drm_i915_private *i915, 236112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 237112ed2d3SChris Wilson unsigned int retry) 238112ed2d3SChris Wilson { 239112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 240112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 241112ed2d3SChris Wilson int ret; 242112ed2d3SChris Wilson 243112ed2d3SChris Wilson /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 244112ed2d3SChris Wilson 
rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 245112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 246112ed2d3SChris Wilson 247112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 248112ed2d3SChris Wilson GRDOM_MEDIA | GRDOM_RESET_ENABLE); 249112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 250112ed2d3SChris Wilson if (ret) { 251112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 252112ed2d3SChris Wilson goto out; 253112ed2d3SChris Wilson } 254112ed2d3SChris Wilson 255112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 256112ed2d3SChris Wilson GRDOM_RENDER | GRDOM_RESET_ENABLE); 257112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 258112ed2d3SChris Wilson if (ret) { 259112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 260112ed2d3SChris Wilson goto out; 261112ed2d3SChris Wilson } 262112ed2d3SChris Wilson 263112ed2d3SChris Wilson out: 264112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 265112ed2d3SChris Wilson 266112ed2d3SChris Wilson rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 267112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 268112ed2d3SChris Wilson 269112ed2d3SChris Wilson return ret; 270112ed2d3SChris Wilson } 271112ed2d3SChris Wilson 272112ed2d3SChris Wilson static int ironlake_do_reset(struct drm_i915_private *i915, 273112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 274112ed2d3SChris Wilson unsigned int retry) 275112ed2d3SChris Wilson { 276112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 277112ed2d3SChris Wilson int ret; 278112ed2d3SChris Wilson 279112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 280112ed2d3SChris Wilson ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 281112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 282112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 
283112ed2d3SChris Wilson 5000, 0, 284112ed2d3SChris Wilson NULL); 285112ed2d3SChris Wilson if (ret) { 286112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 287112ed2d3SChris Wilson goto out; 288112ed2d3SChris Wilson } 289112ed2d3SChris Wilson 290112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 291112ed2d3SChris Wilson ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 292112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 293112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 294112ed2d3SChris Wilson 5000, 0, 295112ed2d3SChris Wilson NULL); 296112ed2d3SChris Wilson if (ret) { 297112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 298112ed2d3SChris Wilson goto out; 299112ed2d3SChris Wilson } 300112ed2d3SChris Wilson 301112ed2d3SChris Wilson out: 302112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 0); 303112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, ILK_GDSR); 304112ed2d3SChris Wilson return ret; 305112ed2d3SChris Wilson } 306112ed2d3SChris Wilson 307112ed2d3SChris Wilson /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 308112ed2d3SChris Wilson static int gen6_hw_domain_reset(struct drm_i915_private *i915, 309112ed2d3SChris Wilson u32 hw_domain_mask) 310112ed2d3SChris Wilson { 311112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 312112ed2d3SChris Wilson int err; 313112ed2d3SChris Wilson 314112ed2d3SChris Wilson /* 315112ed2d3SChris Wilson * GEN6_GDRST is not in the gt power well, no need to check 316112ed2d3SChris Wilson * for fifo space for the write or forcewake the chip for 317112ed2d3SChris Wilson * the read 318112ed2d3SChris Wilson */ 319112ed2d3SChris Wilson intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 320112ed2d3SChris Wilson 321112ed2d3SChris Wilson /* Wait for the device to ack the reset requests */ 322112ed2d3SChris Wilson err = __intel_wait_for_register_fw(uncore, 323112ed2d3SChris Wilson GEN6_GDRST, hw_domain_mask, 
0, 324112ed2d3SChris Wilson 500, 0, 325112ed2d3SChris Wilson NULL); 326112ed2d3SChris Wilson if (err) 327112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n", 328112ed2d3SChris Wilson hw_domain_mask); 329112ed2d3SChris Wilson 330112ed2d3SChris Wilson return err; 331112ed2d3SChris Wilson } 332112ed2d3SChris Wilson 333112ed2d3SChris Wilson static int gen6_reset_engines(struct drm_i915_private *i915, 334112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 335112ed2d3SChris Wilson unsigned int retry) 336112ed2d3SChris Wilson { 337112ed2d3SChris Wilson struct intel_engine_cs *engine; 338112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 339112ed2d3SChris Wilson [RCS0] = GEN6_GRDOM_RENDER, 340112ed2d3SChris Wilson [BCS0] = GEN6_GRDOM_BLT, 341112ed2d3SChris Wilson [VCS0] = GEN6_GRDOM_MEDIA, 342112ed2d3SChris Wilson [VCS1] = GEN8_GRDOM_MEDIA2, 343112ed2d3SChris Wilson [VECS0] = GEN6_GRDOM_VECS, 344112ed2d3SChris Wilson }; 345112ed2d3SChris Wilson u32 hw_mask; 346112ed2d3SChris Wilson 347112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 348112ed2d3SChris Wilson hw_mask = GEN6_GRDOM_FULL; 349112ed2d3SChris Wilson } else { 350112ed2d3SChris Wilson intel_engine_mask_t tmp; 351112ed2d3SChris Wilson 352112ed2d3SChris Wilson hw_mask = 0; 353112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 354112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 355112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 356112ed2d3SChris Wilson } 357112ed2d3SChris Wilson } 358112ed2d3SChris Wilson 359112ed2d3SChris Wilson return gen6_hw_domain_reset(i915, hw_mask); 360112ed2d3SChris Wilson } 361112ed2d3SChris Wilson 362112ed2d3SChris Wilson static u32 gen11_lock_sfc(struct intel_engine_cs *engine) 363112ed2d3SChris Wilson { 364112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 365112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 366112ed2d3SChris 
Wilson i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; 367112ed2d3SChris Wilson u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; 368112ed2d3SChris Wilson i915_reg_t sfc_usage; 369112ed2d3SChris Wilson u32 sfc_usage_bit; 370112ed2d3SChris Wilson u32 sfc_reset_bit; 371112ed2d3SChris Wilson 372112ed2d3SChris Wilson switch (engine->class) { 373112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 374112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 375112ed2d3SChris Wilson return 0; 376112ed2d3SChris Wilson 377112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 378112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 379112ed2d3SChris Wilson 380112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); 381112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 382112ed2d3SChris Wilson 383112ed2d3SChris Wilson sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); 384112ed2d3SChris Wilson sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; 385112ed2d3SChris Wilson sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 386112ed2d3SChris Wilson break; 387112ed2d3SChris Wilson 388112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 389112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 390112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 391112ed2d3SChris Wilson 392112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); 393112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 394112ed2d3SChris Wilson 395112ed2d3SChris Wilson sfc_usage = GEN11_VECS_SFC_USAGE(engine); 396112ed2d3SChris Wilson sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; 397112ed2d3SChris Wilson sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 398112ed2d3SChris Wilson break; 399112ed2d3SChris Wilson 400112ed2d3SChris Wilson default: 401112ed2d3SChris Wilson return 0; 402112ed2d3SChris Wilson } 403112ed2d3SChris Wilson 
404112ed2d3SChris Wilson /* 405112ed2d3SChris Wilson * Tell the engine that a software reset is going to happen. The engine 406112ed2d3SChris Wilson * will then try to force lock the SFC (if currently locked, it will 407112ed2d3SChris Wilson * remain so until we tell the engine it is safe to unlock; if currently 408112ed2d3SChris Wilson * unlocked, it will ignore this and all new lock requests). If SFC 409112ed2d3SChris Wilson * ends up being locked to the engine we want to reset, we have to reset 410112ed2d3SChris Wilson * it as well (we will unlock it once the reset sequence is completed). 411112ed2d3SChris Wilson */ 412112ed2d3SChris Wilson rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 413112ed2d3SChris Wilson 414112ed2d3SChris Wilson if (__intel_wait_for_register_fw(uncore, 415112ed2d3SChris Wilson sfc_forced_lock_ack, 416112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 417112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 418112ed2d3SChris Wilson 1000, 0, NULL)) { 419112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n"); 420112ed2d3SChris Wilson return 0; 421112ed2d3SChris Wilson } 422112ed2d3SChris Wilson 423112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit) 424112ed2d3SChris Wilson return sfc_reset_bit; 425112ed2d3SChris Wilson 426112ed2d3SChris Wilson return 0; 427112ed2d3SChris Wilson } 428112ed2d3SChris Wilson 429112ed2d3SChris Wilson static void gen11_unlock_sfc(struct intel_engine_cs *engine) 430112ed2d3SChris Wilson { 431112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 432112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 433112ed2d3SChris Wilson i915_reg_t sfc_forced_lock; 434112ed2d3SChris Wilson u32 sfc_forced_lock_bit; 435112ed2d3SChris Wilson 436112ed2d3SChris Wilson switch (engine->class) { 437112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 438112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 
439112ed2d3SChris Wilson return; 440112ed2d3SChris Wilson 441112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 442112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 443112ed2d3SChris Wilson break; 444112ed2d3SChris Wilson 445112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 446112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 447112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 448112ed2d3SChris Wilson break; 449112ed2d3SChris Wilson 450112ed2d3SChris Wilson default: 451112ed2d3SChris Wilson return; 452112ed2d3SChris Wilson } 453112ed2d3SChris Wilson 454112ed2d3SChris Wilson rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 455112ed2d3SChris Wilson } 456112ed2d3SChris Wilson 457112ed2d3SChris Wilson static int gen11_reset_engines(struct drm_i915_private *i915, 458112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 459112ed2d3SChris Wilson unsigned int retry) 460112ed2d3SChris Wilson { 461112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 462112ed2d3SChris Wilson [RCS0] = GEN11_GRDOM_RENDER, 463112ed2d3SChris Wilson [BCS0] = GEN11_GRDOM_BLT, 464112ed2d3SChris Wilson [VCS0] = GEN11_GRDOM_MEDIA, 465112ed2d3SChris Wilson [VCS1] = GEN11_GRDOM_MEDIA2, 466112ed2d3SChris Wilson [VCS2] = GEN11_GRDOM_MEDIA3, 467112ed2d3SChris Wilson [VCS3] = GEN11_GRDOM_MEDIA4, 468112ed2d3SChris Wilson [VECS0] = GEN11_GRDOM_VECS, 469112ed2d3SChris Wilson [VECS1] = GEN11_GRDOM_VECS2, 470112ed2d3SChris Wilson }; 471112ed2d3SChris Wilson struct intel_engine_cs *engine; 472112ed2d3SChris Wilson intel_engine_mask_t tmp; 473112ed2d3SChris Wilson u32 hw_mask; 474112ed2d3SChris Wilson int ret; 475112ed2d3SChris Wilson 476112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 477112ed2d3SChris Wilson hw_mask = GEN11_GRDOM_FULL; 478112ed2d3SChris Wilson } else { 479112ed2d3SChris Wilson hw_mask = 0; 480112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 
481112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 482112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 483112ed2d3SChris Wilson hw_mask |= gen11_lock_sfc(engine); 484112ed2d3SChris Wilson } 485112ed2d3SChris Wilson } 486112ed2d3SChris Wilson 487112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, hw_mask); 488112ed2d3SChris Wilson 489112ed2d3SChris Wilson if (engine_mask != ALL_ENGINES) 490112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 491112ed2d3SChris Wilson gen11_unlock_sfc(engine); 492112ed2d3SChris Wilson 493112ed2d3SChris Wilson return ret; 494112ed2d3SChris Wilson } 495112ed2d3SChris Wilson 496112ed2d3SChris Wilson static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 497112ed2d3SChris Wilson { 498112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 499112ed2d3SChris Wilson const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 500112ed2d3SChris Wilson u32 request, mask, ack; 501112ed2d3SChris Wilson int ret; 502112ed2d3SChris Wilson 503112ed2d3SChris Wilson ack = intel_uncore_read_fw(uncore, reg); 504112ed2d3SChris Wilson if (ack & RESET_CTL_CAT_ERROR) { 505112ed2d3SChris Wilson /* 506112ed2d3SChris Wilson * For catastrophic errors, ready-for-reset sequence 507112ed2d3SChris Wilson * needs to be bypassed: HAS#396813 508112ed2d3SChris Wilson */ 509112ed2d3SChris Wilson request = RESET_CTL_CAT_ERROR; 510112ed2d3SChris Wilson mask = RESET_CTL_CAT_ERROR; 511112ed2d3SChris Wilson 512112ed2d3SChris Wilson /* Catastrophic errors need to be cleared by HW */ 513112ed2d3SChris Wilson ack = 0; 514112ed2d3SChris Wilson } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 515112ed2d3SChris Wilson request = RESET_CTL_REQUEST_RESET; 516112ed2d3SChris Wilson mask = RESET_CTL_READY_TO_RESET; 517112ed2d3SChris Wilson ack = RESET_CTL_READY_TO_RESET; 518112ed2d3SChris Wilson } else { 519112ed2d3SChris Wilson return 0; 520112ed2d3SChris Wilson } 521112ed2d3SChris Wilson 
522112ed2d3SChris Wilson intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 523112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 524112ed2d3SChris Wilson 700, 0, NULL); 525112ed2d3SChris Wilson if (ret) 526112ed2d3SChris Wilson DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 527112ed2d3SChris Wilson engine->name, request, 528112ed2d3SChris Wilson intel_uncore_read_fw(uncore, reg)); 529112ed2d3SChris Wilson 530112ed2d3SChris Wilson return ret; 531112ed2d3SChris Wilson } 532112ed2d3SChris Wilson 533112ed2d3SChris Wilson static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 534112ed2d3SChris Wilson { 535112ed2d3SChris Wilson intel_uncore_write_fw(engine->uncore, 536112ed2d3SChris Wilson RING_RESET_CTL(engine->mmio_base), 537112ed2d3SChris Wilson _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 538112ed2d3SChris Wilson } 539112ed2d3SChris Wilson 540112ed2d3SChris Wilson static int gen8_reset_engines(struct drm_i915_private *i915, 541112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 542112ed2d3SChris Wilson unsigned int retry) 543112ed2d3SChris Wilson { 544112ed2d3SChris Wilson struct intel_engine_cs *engine; 545112ed2d3SChris Wilson const bool reset_non_ready = retry >= 1; 546112ed2d3SChris Wilson intel_engine_mask_t tmp; 547112ed2d3SChris Wilson int ret; 548112ed2d3SChris Wilson 549112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 550112ed2d3SChris Wilson ret = gen8_engine_reset_prepare(engine); 551112ed2d3SChris Wilson if (ret && !reset_non_ready) 552112ed2d3SChris Wilson goto skip_reset; 553112ed2d3SChris Wilson 554112ed2d3SChris Wilson /* 555112ed2d3SChris Wilson * If this is not the first failed attempt to prepare, 556112ed2d3SChris Wilson * we decide to proceed anyway. 
557112ed2d3SChris Wilson * 558112ed2d3SChris Wilson * By doing so we risk context corruption and with 559112ed2d3SChris Wilson * some gens (kbl), possible system hang if reset 560112ed2d3SChris Wilson * happens during active bb execution. 561112ed2d3SChris Wilson * 562112ed2d3SChris Wilson * We rather take context corruption instead of 563112ed2d3SChris Wilson * failed reset with a wedged driver/gpu. And 564112ed2d3SChris Wilson * active bb execution case should be covered by 565112ed2d3SChris Wilson * i915_stop_engines we have before the reset. 566112ed2d3SChris Wilson */ 567112ed2d3SChris Wilson } 568112ed2d3SChris Wilson 569112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 11) 570112ed2d3SChris Wilson ret = gen11_reset_engines(i915, engine_mask, retry); 571112ed2d3SChris Wilson else 572112ed2d3SChris Wilson ret = gen6_reset_engines(i915, engine_mask, retry); 573112ed2d3SChris Wilson 574112ed2d3SChris Wilson skip_reset: 575112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 576112ed2d3SChris Wilson gen8_engine_reset_cancel(engine); 577112ed2d3SChris Wilson 578112ed2d3SChris Wilson return ret; 579112ed2d3SChris Wilson } 580112ed2d3SChris Wilson 581112ed2d3SChris Wilson typedef int (*reset_func)(struct drm_i915_private *, 582112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 583112ed2d3SChris Wilson unsigned int retry); 584112ed2d3SChris Wilson 585112ed2d3SChris Wilson static reset_func intel_get_gpu_reset(struct drm_i915_private *i915) 586112ed2d3SChris Wilson { 587112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 8) 588112ed2d3SChris Wilson return gen8_reset_engines; 589112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 6) 590112ed2d3SChris Wilson return gen6_reset_engines; 591112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 5) 592112ed2d3SChris Wilson return ironlake_do_reset; 593112ed2d3SChris Wilson else if (IS_G4X(i915)) 594112ed2d3SChris Wilson return g4x_do_reset; 595112ed2d3SChris Wilson else if (IS_G33(i915) || IS_PINEVIEW(i915)) 
596112ed2d3SChris Wilson return g33_do_reset; 597112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 3) 598112ed2d3SChris Wilson return i915_do_reset; 599112ed2d3SChris Wilson else 600112ed2d3SChris Wilson return NULL; 601112ed2d3SChris Wilson } 602112ed2d3SChris Wilson 603112ed2d3SChris Wilson int intel_gpu_reset(struct drm_i915_private *i915, 604112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 605112ed2d3SChris Wilson { 606112ed2d3SChris Wilson const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 607112ed2d3SChris Wilson reset_func reset; 608112ed2d3SChris Wilson int ret = -ETIMEDOUT; 609112ed2d3SChris Wilson int retry; 610112ed2d3SChris Wilson 611112ed2d3SChris Wilson reset = intel_get_gpu_reset(i915); 612112ed2d3SChris Wilson if (!reset) 613112ed2d3SChris Wilson return -ENODEV; 614112ed2d3SChris Wilson 615112ed2d3SChris Wilson /* 616112ed2d3SChris Wilson * If the power well sleeps during the reset, the reset 617112ed2d3SChris Wilson * request may be dropped and never completes (causing -EIO). 618112ed2d3SChris Wilson */ 619112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 620112ed2d3SChris Wilson for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 621112ed2d3SChris Wilson /* 622112ed2d3SChris Wilson * We stop engines, otherwise we might get failed reset and a 623112ed2d3SChris Wilson * dead gpu (on elk). Also as modern gpu as kbl can suffer 624112ed2d3SChris Wilson * from system hang if batchbuffer is progressing when 625112ed2d3SChris Wilson * the reset is issued, regardless of READY_TO_RESET ack. 626112ed2d3SChris Wilson * Thus assume it is best to stop engines on all gens 627112ed2d3SChris Wilson * where we have a gpu reset. 
628112ed2d3SChris Wilson * 629112ed2d3SChris Wilson * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 630112ed2d3SChris Wilson * 631112ed2d3SChris Wilson * WaMediaResetMainRingCleanup:ctg,elk (presumably) 632112ed2d3SChris Wilson * 633112ed2d3SChris Wilson * FIXME: Wa for more modern gens needs to be validated 634112ed2d3SChris Wilson */ 635112ed2d3SChris Wilson if (retry) 636112ed2d3SChris Wilson i915_stop_engines(i915, engine_mask); 637112ed2d3SChris Wilson 638112ed2d3SChris Wilson GEM_TRACE("engine_mask=%x\n", engine_mask); 639112ed2d3SChris Wilson preempt_disable(); 640112ed2d3SChris Wilson ret = reset(i915, engine_mask, retry); 641112ed2d3SChris Wilson preempt_enable(); 642112ed2d3SChris Wilson } 643112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 644112ed2d3SChris Wilson 645112ed2d3SChris Wilson return ret; 646112ed2d3SChris Wilson } 647112ed2d3SChris Wilson 648112ed2d3SChris Wilson bool intel_has_gpu_reset(struct drm_i915_private *i915) 649112ed2d3SChris Wilson { 650112ed2d3SChris Wilson if (!i915_modparams.reset) 651112ed2d3SChris Wilson return NULL; 652112ed2d3SChris Wilson 653112ed2d3SChris Wilson return intel_get_gpu_reset(i915); 654112ed2d3SChris Wilson } 655112ed2d3SChris Wilson 656112ed2d3SChris Wilson bool intel_has_reset_engine(struct drm_i915_private *i915) 657112ed2d3SChris Wilson { 658112ed2d3SChris Wilson return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2; 659112ed2d3SChris Wilson } 660112ed2d3SChris Wilson 661112ed2d3SChris Wilson int intel_reset_guc(struct drm_i915_private *i915) 662112ed2d3SChris Wilson { 663112ed2d3SChris Wilson u32 guc_domain = 664112ed2d3SChris Wilson INTEL_GEN(i915) >= 11 ? 
GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 665112ed2d3SChris Wilson int ret; 666112ed2d3SChris Wilson 667112ed2d3SChris Wilson GEM_BUG_ON(!HAS_GUC(i915)); 668112ed2d3SChris Wilson 669112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 670112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, guc_domain); 671112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 672112ed2d3SChris Wilson 673112ed2d3SChris Wilson return ret; 674112ed2d3SChris Wilson } 675112ed2d3SChris Wilson 676112ed2d3SChris Wilson /* 677112ed2d3SChris Wilson * Ensure irq handler finishes, and not run again. 678112ed2d3SChris Wilson * Also return the active request so that we only search for it once. 679112ed2d3SChris Wilson */ 680112ed2d3SChris Wilson static void reset_prepare_engine(struct intel_engine_cs *engine) 681112ed2d3SChris Wilson { 682112ed2d3SChris Wilson /* 683112ed2d3SChris Wilson * During the reset sequence, we must prevent the engine from 684112ed2d3SChris Wilson * entering RC6. As the context state is undefined until we restart 685112ed2d3SChris Wilson * the engine, if it does enter RC6 during the reset, the state 686112ed2d3SChris Wilson * written to the powercontext is undefined and so we may lose 687112ed2d3SChris Wilson * GPU state upon resume, i.e. fail to restart after a reset. 
688112ed2d3SChris Wilson */ 68979ffac85SChris Wilson intel_engine_pm_get(engine); 690112ed2d3SChris Wilson intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 691112ed2d3SChris Wilson engine->reset.prepare(engine); 692112ed2d3SChris Wilson } 693112ed2d3SChris Wilson 694112ed2d3SChris Wilson static void revoke_mmaps(struct drm_i915_private *i915) 695112ed2d3SChris Wilson { 696112ed2d3SChris Wilson int i; 697112ed2d3SChris Wilson 698112ed2d3SChris Wilson for (i = 0; i < i915->num_fence_regs; i++) { 699112ed2d3SChris Wilson struct drm_vma_offset_node *node; 700112ed2d3SChris Wilson struct i915_vma *vma; 701112ed2d3SChris Wilson u64 vma_offset; 702112ed2d3SChris Wilson 703112ed2d3SChris Wilson vma = READ_ONCE(i915->fence_regs[i].vma); 704112ed2d3SChris Wilson if (!vma) 705112ed2d3SChris Wilson continue; 706112ed2d3SChris Wilson 707112ed2d3SChris Wilson if (!i915_vma_has_userfault(vma)) 708112ed2d3SChris Wilson continue; 709112ed2d3SChris Wilson 710112ed2d3SChris Wilson GEM_BUG_ON(vma->fence != &i915->fence_regs[i]); 711112ed2d3SChris Wilson node = &vma->obj->base.vma_node; 712112ed2d3SChris Wilson vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 713112ed2d3SChris Wilson unmap_mapping_range(i915->drm.anon_inode->i_mapping, 714112ed2d3SChris Wilson drm_vma_node_offset_addr(node) + vma_offset, 715112ed2d3SChris Wilson vma->size, 716112ed2d3SChris Wilson 1); 717112ed2d3SChris Wilson } 718112ed2d3SChris Wilson } 719112ed2d3SChris Wilson 720112ed2d3SChris Wilson static void reset_prepare(struct drm_i915_private *i915) 721112ed2d3SChris Wilson { 722112ed2d3SChris Wilson struct intel_engine_cs *engine; 723112ed2d3SChris Wilson enum intel_engine_id id; 724112ed2d3SChris Wilson 72579ffac85SChris Wilson intel_gt_pm_get(i915); 726112ed2d3SChris Wilson for_each_engine(engine, i915, id) 727112ed2d3SChris Wilson reset_prepare_engine(engine); 728112ed2d3SChris Wilson 729112ed2d3SChris Wilson intel_uc_reset_prepare(i915); 730112ed2d3SChris Wilson } 731112ed2d3SChris 
Wilson 732112ed2d3SChris Wilson static void gt_revoke(struct drm_i915_private *i915) 733112ed2d3SChris Wilson { 734112ed2d3SChris Wilson revoke_mmaps(i915); 735112ed2d3SChris Wilson } 736112ed2d3SChris Wilson 737112ed2d3SChris Wilson static int gt_reset(struct drm_i915_private *i915, 738112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 739112ed2d3SChris Wilson { 740112ed2d3SChris Wilson struct intel_engine_cs *engine; 741112ed2d3SChris Wilson enum intel_engine_id id; 742112ed2d3SChris Wilson int err; 743112ed2d3SChris Wilson 744112ed2d3SChris Wilson /* 745112ed2d3SChris Wilson * Everything depends on having the GTT running, so we need to start 746112ed2d3SChris Wilson * there. 747112ed2d3SChris Wilson */ 748112ed2d3SChris Wilson err = i915_ggtt_enable_hw(i915); 749112ed2d3SChris Wilson if (err) 750112ed2d3SChris Wilson return err; 751112ed2d3SChris Wilson 752112ed2d3SChris Wilson for_each_engine(engine, i915, id) 753112ed2d3SChris Wilson intel_engine_reset(engine, stalled_mask & engine->mask); 754112ed2d3SChris Wilson 755112ed2d3SChris Wilson i915_gem_restore_fences(i915); 756112ed2d3SChris Wilson 757112ed2d3SChris Wilson return err; 758112ed2d3SChris Wilson } 759112ed2d3SChris Wilson 760112ed2d3SChris Wilson static void reset_finish_engine(struct intel_engine_cs *engine) 761112ed2d3SChris Wilson { 762112ed2d3SChris Wilson engine->reset.finish(engine); 76379ffac85SChris Wilson intel_engine_pm_put(engine); 764112ed2d3SChris Wilson intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 765112ed2d3SChris Wilson } 766112ed2d3SChris Wilson 767112ed2d3SChris Wilson static void reset_finish(struct drm_i915_private *i915) 768112ed2d3SChris Wilson { 769112ed2d3SChris Wilson struct intel_engine_cs *engine; 770112ed2d3SChris Wilson enum intel_engine_id id; 771112ed2d3SChris Wilson 772112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 773112ed2d3SChris Wilson reset_finish_engine(engine); 774112ed2d3SChris Wilson intel_engine_signal_breadcrumbs(engine); 
775112ed2d3SChris Wilson } 77679ffac85SChris Wilson intel_gt_pm_put(i915); 777112ed2d3SChris Wilson } 778112ed2d3SChris Wilson 779112ed2d3SChris Wilson static void nop_submit_request(struct i915_request *request) 780112ed2d3SChris Wilson { 781112ed2d3SChris Wilson struct intel_engine_cs *engine = request->engine; 782112ed2d3SChris Wilson unsigned long flags; 783112ed2d3SChris Wilson 784112ed2d3SChris Wilson GEM_TRACE("%s fence %llx:%lld -> -EIO\n", 785112ed2d3SChris Wilson engine->name, request->fence.context, request->fence.seqno); 786112ed2d3SChris Wilson dma_fence_set_error(&request->fence, -EIO); 787112ed2d3SChris Wilson 788112ed2d3SChris Wilson spin_lock_irqsave(&engine->timeline.lock, flags); 789112ed2d3SChris Wilson __i915_request_submit(request); 790112ed2d3SChris Wilson i915_request_mark_complete(request); 791112ed2d3SChris Wilson spin_unlock_irqrestore(&engine->timeline.lock, flags); 792112ed2d3SChris Wilson 793112ed2d3SChris Wilson intel_engine_queue_breadcrumbs(engine); 794112ed2d3SChris Wilson } 795112ed2d3SChris Wilson 796112ed2d3SChris Wilson static void __i915_gem_set_wedged(struct drm_i915_private *i915) 797112ed2d3SChris Wilson { 798112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 799112ed2d3SChris Wilson struct intel_engine_cs *engine; 800112ed2d3SChris Wilson enum intel_engine_id id; 801112ed2d3SChris Wilson 802112ed2d3SChris Wilson if (test_bit(I915_WEDGED, &error->flags)) 803112ed2d3SChris Wilson return; 804112ed2d3SChris Wilson 805112ed2d3SChris Wilson if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) { 806112ed2d3SChris Wilson struct drm_printer p = drm_debug_printer(__func__); 807112ed2d3SChris Wilson 808112ed2d3SChris Wilson for_each_engine(engine, i915, id) 809112ed2d3SChris Wilson intel_engine_dump(engine, &p, "%s\n", engine->name); 810112ed2d3SChris Wilson } 811112ed2d3SChris Wilson 812112ed2d3SChris Wilson GEM_TRACE("start\n"); 813112ed2d3SChris Wilson 814112ed2d3SChris Wilson /* 815112ed2d3SChris Wilson * 
First, stop submission to hw, but do not yet complete requests by 816112ed2d3SChris Wilson * rolling the global seqno forward (since this would complete requests 817112ed2d3SChris Wilson * for which we haven't set the fence error to EIO yet). 818112ed2d3SChris Wilson */ 819112ed2d3SChris Wilson reset_prepare(i915); 820112ed2d3SChris Wilson 821112ed2d3SChris Wilson /* Even if the GPU reset fails, it should still stop the engines */ 822112ed2d3SChris Wilson if (!INTEL_INFO(i915)->gpu_reset_clobbers_display) 823112ed2d3SChris Wilson intel_gpu_reset(i915, ALL_ENGINES); 824112ed2d3SChris Wilson 825112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 826112ed2d3SChris Wilson engine->submit_request = nop_submit_request; 827112ed2d3SChris Wilson engine->schedule = NULL; 828112ed2d3SChris Wilson } 829112ed2d3SChris Wilson i915->caps.scheduler = 0; 830112ed2d3SChris Wilson 831112ed2d3SChris Wilson /* 832112ed2d3SChris Wilson * Make sure no request can slip through without getting completed by 833112ed2d3SChris Wilson * either this call here to intel_engine_write_global_seqno, or the one 834112ed2d3SChris Wilson * in nop_submit_request. 
835112ed2d3SChris Wilson */ 836112ed2d3SChris Wilson synchronize_rcu_expedited(); 83779ffac85SChris Wilson set_bit(I915_WEDGED, &error->flags); 838112ed2d3SChris Wilson 839112ed2d3SChris Wilson /* Mark all executing requests as skipped */ 840112ed2d3SChris Wilson for_each_engine(engine, i915, id) 841112ed2d3SChris Wilson engine->cancel_requests(engine); 842112ed2d3SChris Wilson 843112ed2d3SChris Wilson reset_finish(i915); 844112ed2d3SChris Wilson 845112ed2d3SChris Wilson GEM_TRACE("end\n"); 846112ed2d3SChris Wilson } 847112ed2d3SChris Wilson 848112ed2d3SChris Wilson void i915_gem_set_wedged(struct drm_i915_private *i915) 849112ed2d3SChris Wilson { 850112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 851112ed2d3SChris Wilson intel_wakeref_t wakeref; 852112ed2d3SChris Wilson 853112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 854112ed2d3SChris Wilson with_intel_runtime_pm(i915, wakeref) 855112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 856112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 857112ed2d3SChris Wilson } 858112ed2d3SChris Wilson 859112ed2d3SChris Wilson static bool __i915_gem_unset_wedged(struct drm_i915_private *i915) 860112ed2d3SChris Wilson { 861112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 862112ed2d3SChris Wilson struct i915_timeline *tl; 863112ed2d3SChris Wilson 864112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 865112ed2d3SChris Wilson return true; 866112ed2d3SChris Wilson 867112ed2d3SChris Wilson if (!i915->gt.scratch) /* Never full initialised, recovery impossible */ 868112ed2d3SChris Wilson return false; 869112ed2d3SChris Wilson 870112ed2d3SChris Wilson GEM_TRACE("start\n"); 871112ed2d3SChris Wilson 872112ed2d3SChris Wilson /* 873112ed2d3SChris Wilson * Before unwedging, make sure that all pending operations 874112ed2d3SChris Wilson * are flushed and errored out - we may have requests waiting upon 875112ed2d3SChris Wilson * third party fences. 
We marked all inflight requests as EIO, and 876112ed2d3SChris Wilson * every execbuf since returned EIO, for consistency we want all 877112ed2d3SChris Wilson * the currently pending requests to also be marked as EIO, which 878112ed2d3SChris Wilson * is done inside our nop_submit_request - and so we must wait. 879112ed2d3SChris Wilson * 880112ed2d3SChris Wilson * No more can be submitted until we reset the wedged bit. 881112ed2d3SChris Wilson */ 882112ed2d3SChris Wilson mutex_lock(&i915->gt.timelines.mutex); 883112ed2d3SChris Wilson list_for_each_entry(tl, &i915->gt.timelines.active_list, link) { 884112ed2d3SChris Wilson struct i915_request *rq; 885112ed2d3SChris Wilson 886112ed2d3SChris Wilson rq = i915_active_request_get_unlocked(&tl->last_request); 887112ed2d3SChris Wilson if (!rq) 888112ed2d3SChris Wilson continue; 889112ed2d3SChris Wilson 890112ed2d3SChris Wilson /* 891112ed2d3SChris Wilson * All internal dependencies (i915_requests) will have 892112ed2d3SChris Wilson * been flushed by the set-wedge, but we may be stuck waiting 893112ed2d3SChris Wilson * for external fences. These should all be capped to 10s 894112ed2d3SChris Wilson * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 895112ed2d3SChris Wilson * in the worst case. 896112ed2d3SChris Wilson */ 897112ed2d3SChris Wilson dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT); 898112ed2d3SChris Wilson i915_request_put(rq); 899112ed2d3SChris Wilson } 900112ed2d3SChris Wilson mutex_unlock(&i915->gt.timelines.mutex); 901112ed2d3SChris Wilson 90279ffac85SChris Wilson intel_gt_sanitize(i915, false); 903112ed2d3SChris Wilson 904112ed2d3SChris Wilson /* 905112ed2d3SChris Wilson * Undo nop_submit_request. 
We prevent all new i915 requests from 906112ed2d3SChris Wilson * being queued (by disallowing execbuf whilst wedged) so having 907112ed2d3SChris Wilson * waited for all active requests above, we know the system is idle 908112ed2d3SChris Wilson * and do not have to worry about a thread being inside 909112ed2d3SChris Wilson * engine->submit_request() as we swap over. So unlike installing 910112ed2d3SChris Wilson * the nop_submit_request on reset, we can do this from normal 911112ed2d3SChris Wilson * context and do not require stop_machine(). 912112ed2d3SChris Wilson */ 913112ed2d3SChris Wilson intel_engines_reset_default_submission(i915); 914112ed2d3SChris Wilson 915112ed2d3SChris Wilson GEM_TRACE("end\n"); 916112ed2d3SChris Wilson 917112ed2d3SChris Wilson smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 918112ed2d3SChris Wilson clear_bit(I915_WEDGED, &i915->gpu_error.flags); 919112ed2d3SChris Wilson 920112ed2d3SChris Wilson return true; 921112ed2d3SChris Wilson } 922112ed2d3SChris Wilson 923112ed2d3SChris Wilson bool i915_gem_unset_wedged(struct drm_i915_private *i915) 924112ed2d3SChris Wilson { 925112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 926112ed2d3SChris Wilson bool result; 927112ed2d3SChris Wilson 928112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 929112ed2d3SChris Wilson result = __i915_gem_unset_wedged(i915); 930112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 931112ed2d3SChris Wilson 932112ed2d3SChris Wilson return result; 933112ed2d3SChris Wilson } 934112ed2d3SChris Wilson 935112ed2d3SChris Wilson static int do_reset(struct drm_i915_private *i915, 936112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 937112ed2d3SChris Wilson { 938112ed2d3SChris Wilson int err, i; 939112ed2d3SChris Wilson 940112ed2d3SChris Wilson gt_revoke(i915); 941112ed2d3SChris Wilson 942112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 943112ed2d3SChris Wilson for (i = 0; err && i < RESET_MAX_RETRIES; 
i++) { 944112ed2d3SChris Wilson msleep(10 * (i + 1)); 945112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 946112ed2d3SChris Wilson } 947112ed2d3SChris Wilson if (err) 948112ed2d3SChris Wilson return err; 949112ed2d3SChris Wilson 950112ed2d3SChris Wilson return gt_reset(i915, stalled_mask); 951112ed2d3SChris Wilson } 952112ed2d3SChris Wilson 953112ed2d3SChris Wilson /** 954112ed2d3SChris Wilson * i915_reset - reset chip after a hang 955112ed2d3SChris Wilson * @i915: #drm_i915_private to reset 956112ed2d3SChris Wilson * @stalled_mask: mask of the stalled engines with the guilty requests 957112ed2d3SChris Wilson * @reason: user error message for why we are resetting 958112ed2d3SChris Wilson * 959112ed2d3SChris Wilson * Reset the chip. Useful if a hang is detected. Marks the device as wedged 960112ed2d3SChris Wilson * on failure. 961112ed2d3SChris Wilson * 962112ed2d3SChris Wilson * Procedure is fairly simple: 963112ed2d3SChris Wilson * - reset the chip using the reset reg 964112ed2d3SChris Wilson * - re-init context state 965112ed2d3SChris Wilson * - re-init hardware status page 966112ed2d3SChris Wilson * - re-init ring buffer 967112ed2d3SChris Wilson * - re-init interrupt state 968112ed2d3SChris Wilson * - re-init display 969112ed2d3SChris Wilson */ 970112ed2d3SChris Wilson void i915_reset(struct drm_i915_private *i915, 971112ed2d3SChris Wilson intel_engine_mask_t stalled_mask, 972112ed2d3SChris Wilson const char *reason) 973112ed2d3SChris Wilson { 974112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 975112ed2d3SChris Wilson int ret; 976112ed2d3SChris Wilson 977112ed2d3SChris Wilson GEM_TRACE("flags=%lx\n", error->flags); 978112ed2d3SChris Wilson 979112ed2d3SChris Wilson might_sleep(); 980112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); 981112ed2d3SChris Wilson 982112ed2d3SChris Wilson /* Clear any previous failed attempts at recovery. Time to try again. 
*/ 983112ed2d3SChris Wilson if (!__i915_gem_unset_wedged(i915)) 984112ed2d3SChris Wilson return; 985112ed2d3SChris Wilson 986112ed2d3SChris Wilson if (reason) 987112ed2d3SChris Wilson dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); 988112ed2d3SChris Wilson error->reset_count++; 989112ed2d3SChris Wilson 990112ed2d3SChris Wilson reset_prepare(i915); 991112ed2d3SChris Wilson 992112ed2d3SChris Wilson if (!intel_has_gpu_reset(i915)) { 993112ed2d3SChris Wilson if (i915_modparams.reset) 994112ed2d3SChris Wilson dev_err(i915->drm.dev, "GPU reset not supported\n"); 995112ed2d3SChris Wilson else 996112ed2d3SChris Wilson DRM_DEBUG_DRIVER("GPU reset disabled\n"); 997112ed2d3SChris Wilson goto error; 998112ed2d3SChris Wilson } 999112ed2d3SChris Wilson 1000112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 1001112ed2d3SChris Wilson intel_runtime_pm_disable_interrupts(i915); 1002112ed2d3SChris Wilson 1003112ed2d3SChris Wilson if (do_reset(i915, stalled_mask)) { 1004112ed2d3SChris Wilson dev_err(i915->drm.dev, "Failed to reset chip\n"); 1005112ed2d3SChris Wilson goto taint; 1006112ed2d3SChris Wilson } 1007112ed2d3SChris Wilson 1008112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 1009112ed2d3SChris Wilson intel_runtime_pm_enable_interrupts(i915); 1010112ed2d3SChris Wilson 1011112ed2d3SChris Wilson intel_overlay_reset(i915); 1012112ed2d3SChris Wilson 1013112ed2d3SChris Wilson /* 1014112ed2d3SChris Wilson * Next we need to restore the context, but we don't use those 1015112ed2d3SChris Wilson * yet either... 1016112ed2d3SChris Wilson * 1017112ed2d3SChris Wilson * Ring buffer needs to be re-initialized in the KMS case, or if X 1018112ed2d3SChris Wilson * was running at the time of the reset (i.e. we weren't VT 1019112ed2d3SChris Wilson * switched away). 
1020112ed2d3SChris Wilson */ 1021112ed2d3SChris Wilson ret = i915_gem_init_hw(i915); 1022112ed2d3SChris Wilson if (ret) { 1023112ed2d3SChris Wilson DRM_ERROR("Failed to initialise HW following reset (%d)\n", 1024112ed2d3SChris Wilson ret); 1025112ed2d3SChris Wilson goto error; 1026112ed2d3SChris Wilson } 1027112ed2d3SChris Wilson 1028112ed2d3SChris Wilson i915_queue_hangcheck(i915); 1029112ed2d3SChris Wilson 1030112ed2d3SChris Wilson finish: 1031112ed2d3SChris Wilson reset_finish(i915); 1032112ed2d3SChris Wilson return; 1033112ed2d3SChris Wilson 1034112ed2d3SChris Wilson taint: 1035112ed2d3SChris Wilson /* 1036112ed2d3SChris Wilson * History tells us that if we cannot reset the GPU now, we 1037112ed2d3SChris Wilson * never will. This then impacts everything that is run 1038112ed2d3SChris Wilson * subsequently. On failing the reset, we mark the driver 1039112ed2d3SChris Wilson * as wedged, preventing further execution on the GPU. 1040112ed2d3SChris Wilson * We also want to go one step further and add a taint to the 1041112ed2d3SChris Wilson * kernel so that any subsequent faults can be traced back to 1042112ed2d3SChris Wilson * this failure. This is important for CI, where if the 1043112ed2d3SChris Wilson * GPU/driver fails we would like to reboot and restart testing 1044112ed2d3SChris Wilson * rather than continue on into oblivion. For everyone else, 1045112ed2d3SChris Wilson * the system should still plod along, but they have been warned! 
1046112ed2d3SChris Wilson */ 104718ecc6c5SChris Wilson add_taint_for_CI(TAINT_WARN); 1048112ed2d3SChris Wilson error: 1049112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 1050112ed2d3SChris Wilson goto finish; 1051112ed2d3SChris Wilson } 1052112ed2d3SChris Wilson 1053112ed2d3SChris Wilson static inline int intel_gt_reset_engine(struct drm_i915_private *i915, 1054112ed2d3SChris Wilson struct intel_engine_cs *engine) 1055112ed2d3SChris Wilson { 1056112ed2d3SChris Wilson return intel_gpu_reset(i915, engine->mask); 1057112ed2d3SChris Wilson } 1058112ed2d3SChris Wilson 1059112ed2d3SChris Wilson /** 1060112ed2d3SChris Wilson * i915_reset_engine - reset GPU engine to recover from a hang 1061112ed2d3SChris Wilson * @engine: engine to reset 1062112ed2d3SChris Wilson * @msg: reason for GPU reset; or NULL for no dev_notice() 1063112ed2d3SChris Wilson * 1064112ed2d3SChris Wilson * Reset a specific GPU engine. Useful if a hang is detected. 1065112ed2d3SChris Wilson * Returns zero on successful reset or otherwise an error code. 
1066112ed2d3SChris Wilson * 1067112ed2d3SChris Wilson * Procedure is: 1068112ed2d3SChris Wilson * - identifies the request that caused the hang and it is dropped 1069112ed2d3SChris Wilson * - reset engine (which will force the engine to idle) 1070112ed2d3SChris Wilson * - re-init/configure engine 1071112ed2d3SChris Wilson */ 1072112ed2d3SChris Wilson int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) 1073112ed2d3SChris Wilson { 1074112ed2d3SChris Wilson struct i915_gpu_error *error = &engine->i915->gpu_error; 1075112ed2d3SChris Wilson int ret; 1076112ed2d3SChris Wilson 1077112ed2d3SChris Wilson GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); 1078112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); 1079112ed2d3SChris Wilson 108079ffac85SChris Wilson if (!intel_wakeref_active(&engine->wakeref)) 108179ffac85SChris Wilson return 0; 108279ffac85SChris Wilson 1083112ed2d3SChris Wilson reset_prepare_engine(engine); 1084112ed2d3SChris Wilson 1085112ed2d3SChris Wilson if (msg) 1086112ed2d3SChris Wilson dev_notice(engine->i915->drm.dev, 1087112ed2d3SChris Wilson "Resetting %s for %s\n", engine->name, msg); 1088112ed2d3SChris Wilson error->reset_engine_count[engine->id]++; 1089112ed2d3SChris Wilson 1090112ed2d3SChris Wilson if (!engine->i915->guc.execbuf_client) 1091112ed2d3SChris Wilson ret = intel_gt_reset_engine(engine->i915, engine); 1092112ed2d3SChris Wilson else 1093112ed2d3SChris Wilson ret = intel_guc_reset_engine(&engine->i915->guc, engine); 1094112ed2d3SChris Wilson if (ret) { 1095112ed2d3SChris Wilson /* If we fail here, we expect to fallback to a global reset */ 1096112ed2d3SChris Wilson DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n", 1097112ed2d3SChris Wilson engine->i915->guc.execbuf_client ? 
"GuC " : "", 1098112ed2d3SChris Wilson engine->name, ret); 1099112ed2d3SChris Wilson goto out; 1100112ed2d3SChris Wilson } 1101112ed2d3SChris Wilson 1102112ed2d3SChris Wilson /* 1103112ed2d3SChris Wilson * The request that caused the hang is stuck on elsp, we know the 1104112ed2d3SChris Wilson * active request and can drop it, adjust head to skip the offending 1105112ed2d3SChris Wilson * request to resume executing remaining requests in the queue. 1106112ed2d3SChris Wilson */ 1107112ed2d3SChris Wilson intel_engine_reset(engine, true); 1108112ed2d3SChris Wilson 1109112ed2d3SChris Wilson /* 1110112ed2d3SChris Wilson * The engine and its registers (and workarounds in case of render) 1111112ed2d3SChris Wilson * have been reset to their default values. Follow the init_ring 1112112ed2d3SChris Wilson * process to program RING_MODE, HWSP and re-enable submission. 1113112ed2d3SChris Wilson */ 111479ffac85SChris Wilson ret = engine->resume(engine); 1115112ed2d3SChris Wilson if (ret) 1116112ed2d3SChris Wilson goto out; 1117112ed2d3SChris Wilson 1118112ed2d3SChris Wilson out: 1119112ed2d3SChris Wilson intel_engine_cancel_stop_cs(engine); 1120112ed2d3SChris Wilson reset_finish_engine(engine); 1121112ed2d3SChris Wilson return ret; 1122112ed2d3SChris Wilson } 1123112ed2d3SChris Wilson 1124112ed2d3SChris Wilson static void i915_reset_device(struct drm_i915_private *i915, 1125112ed2d3SChris Wilson u32 engine_mask, 1126112ed2d3SChris Wilson const char *reason) 1127112ed2d3SChris Wilson { 1128112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1129112ed2d3SChris Wilson struct kobject *kobj = &i915->drm.primary->kdev->kobj; 1130112ed2d3SChris Wilson char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1131112ed2d3SChris Wilson char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1132112ed2d3SChris Wilson char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1133112ed2d3SChris Wilson struct i915_wedge_me w; 1134112ed2d3SChris Wilson 1135112ed2d3SChris 
Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1136112ed2d3SChris Wilson 1137112ed2d3SChris Wilson DRM_DEBUG_DRIVER("resetting chip\n"); 1138112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1139112ed2d3SChris Wilson 1140112ed2d3SChris Wilson /* Use a watchdog to ensure that our reset completes */ 1141112ed2d3SChris Wilson i915_wedge_on_timeout(&w, i915, 5 * HZ) { 1142112ed2d3SChris Wilson intel_prepare_reset(i915); 1143112ed2d3SChris Wilson 1144112ed2d3SChris Wilson /* Flush everyone using a resource about to be clobbered */ 1145112ed2d3SChris Wilson synchronize_srcu_expedited(&error->reset_backoff_srcu); 1146112ed2d3SChris Wilson 1147112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 1148112ed2d3SChris Wilson i915_reset(i915, engine_mask, reason); 1149112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 1150112ed2d3SChris Wilson 1151112ed2d3SChris Wilson intel_finish_reset(i915); 1152112ed2d3SChris Wilson } 1153112ed2d3SChris Wilson 1154112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 1155112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1156112ed2d3SChris Wilson } 1157112ed2d3SChris Wilson 1158112ed2d3SChris Wilson static void clear_register(struct intel_uncore *uncore, i915_reg_t reg) 1159112ed2d3SChris Wilson { 1160112ed2d3SChris Wilson intel_uncore_rmw(uncore, reg, 0, 0); 1161112ed2d3SChris Wilson } 1162112ed2d3SChris Wilson 1163112ed2d3SChris Wilson void i915_clear_error_registers(struct drm_i915_private *i915) 1164112ed2d3SChris Wilson { 1165112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 1166112ed2d3SChris Wilson u32 eir; 1167112ed2d3SChris Wilson 1168112ed2d3SChris Wilson if (!IS_GEN(i915, 2)) 1169112ed2d3SChris Wilson clear_register(uncore, PGTBL_ER); 1170112ed2d3SChris Wilson 1171112ed2d3SChris Wilson if (INTEL_GEN(i915) < 4) 1172112ed2d3SChris Wilson clear_register(uncore, IPEIR(RENDER_RING_BASE)); 1173112ed2d3SChris Wilson else 
1174112ed2d3SChris Wilson clear_register(uncore, IPEIR_I965); 1175112ed2d3SChris Wilson 1176112ed2d3SChris Wilson clear_register(uncore, EIR); 1177112ed2d3SChris Wilson eir = intel_uncore_read(uncore, EIR); 1178112ed2d3SChris Wilson if (eir) { 1179112ed2d3SChris Wilson /* 1180112ed2d3SChris Wilson * some errors might have become stuck, 1181112ed2d3SChris Wilson * mask them. 1182112ed2d3SChris Wilson */ 1183112ed2d3SChris Wilson DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir); 1184112ed2d3SChris Wilson rmw_set(uncore, EMR, eir); 1185112ed2d3SChris Wilson intel_uncore_write(uncore, GEN2_IIR, 1186112ed2d3SChris Wilson I915_MASTER_ERROR_INTERRUPT); 1187112ed2d3SChris Wilson } 1188112ed2d3SChris Wilson 1189112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 8) { 1190112ed2d3SChris Wilson rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID); 1191112ed2d3SChris Wilson intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG); 1192112ed2d3SChris Wilson } else if (INTEL_GEN(i915) >= 6) { 1193112ed2d3SChris Wilson struct intel_engine_cs *engine; 1194112ed2d3SChris Wilson enum intel_engine_id id; 1195112ed2d3SChris Wilson 1196112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 1197112ed2d3SChris Wilson rmw_clear(uncore, 1198112ed2d3SChris Wilson RING_FAULT_REG(engine), RING_FAULT_VALID); 1199112ed2d3SChris Wilson intel_uncore_posting_read(uncore, 1200112ed2d3SChris Wilson RING_FAULT_REG(engine)); 1201112ed2d3SChris Wilson } 1202112ed2d3SChris Wilson } 1203112ed2d3SChris Wilson } 1204112ed2d3SChris Wilson 1205112ed2d3SChris Wilson /** 1206112ed2d3SChris Wilson * i915_handle_error - handle a gpu error 1207112ed2d3SChris Wilson * @i915: i915 device private 1208112ed2d3SChris Wilson * @engine_mask: mask representing engines that are hung 1209112ed2d3SChris Wilson * @flags: control flags 1210112ed2d3SChris Wilson * @fmt: Error message format string 1211112ed2d3SChris Wilson * 1212112ed2d3SChris Wilson * Do some basic checking of register state at error time and 
1213112ed2d3SChris Wilson * dump it to the syslog. Also call i915_capture_error_state() to make 1214112ed2d3SChris Wilson * sure we get a record and make it available in debugfs. Fire a uevent 1215112ed2d3SChris Wilson * so userspace knows something bad happened (should trigger collection 1216112ed2d3SChris Wilson * of a ring dump etc.). 1217112ed2d3SChris Wilson */ 1218112ed2d3SChris Wilson void i915_handle_error(struct drm_i915_private *i915, 1219112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 1220112ed2d3SChris Wilson unsigned long flags, 1221112ed2d3SChris Wilson const char *fmt, ...) 1222112ed2d3SChris Wilson { 1223112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1224112ed2d3SChris Wilson struct intel_engine_cs *engine; 1225112ed2d3SChris Wilson intel_wakeref_t wakeref; 1226112ed2d3SChris Wilson intel_engine_mask_t tmp; 1227112ed2d3SChris Wilson char error_msg[80]; 1228112ed2d3SChris Wilson char *msg = NULL; 1229112ed2d3SChris Wilson 1230112ed2d3SChris Wilson if (fmt) { 1231112ed2d3SChris Wilson va_list args; 1232112ed2d3SChris Wilson 1233112ed2d3SChris Wilson va_start(args, fmt); 1234112ed2d3SChris Wilson vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1235112ed2d3SChris Wilson va_end(args); 1236112ed2d3SChris Wilson 1237112ed2d3SChris Wilson msg = error_msg; 1238112ed2d3SChris Wilson } 1239112ed2d3SChris Wilson 1240112ed2d3SChris Wilson /* 1241112ed2d3SChris Wilson * In most cases it's guaranteed that we get here with an RPM 1242112ed2d3SChris Wilson * reference held, for example because there is a pending GPU 1243112ed2d3SChris Wilson * request that won't finish until the reset is done. This 1244112ed2d3SChris Wilson * isn't the case at least when we get here by doing a 1245112ed2d3SChris Wilson * simulated reset via debugfs, so get an RPM reference. 
1246112ed2d3SChris Wilson */ 1247112ed2d3SChris Wilson wakeref = intel_runtime_pm_get(i915); 1248112ed2d3SChris Wilson 1249112ed2d3SChris Wilson engine_mask &= INTEL_INFO(i915)->engine_mask; 1250112ed2d3SChris Wilson 1251112ed2d3SChris Wilson if (flags & I915_ERROR_CAPTURE) { 1252112ed2d3SChris Wilson i915_capture_error_state(i915, engine_mask, msg); 1253112ed2d3SChris Wilson i915_clear_error_registers(i915); 1254112ed2d3SChris Wilson } 1255112ed2d3SChris Wilson 1256112ed2d3SChris Wilson /* 1257112ed2d3SChris Wilson * Try engine reset when available. We fall back to full reset if 1258112ed2d3SChris Wilson * single reset fails. 1259112ed2d3SChris Wilson */ 1260112ed2d3SChris Wilson if (intel_has_reset_engine(i915) && !__i915_wedged(error)) { 1261112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 1262112ed2d3SChris Wilson BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); 1263112ed2d3SChris Wilson if (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1264112ed2d3SChris Wilson &error->flags)) 1265112ed2d3SChris Wilson continue; 1266112ed2d3SChris Wilson 1267112ed2d3SChris Wilson if (i915_reset_engine(engine, msg) == 0) 1268112ed2d3SChris Wilson engine_mask &= ~engine->mask; 1269112ed2d3SChris Wilson 1270112ed2d3SChris Wilson clear_bit(I915_RESET_ENGINE + engine->id, 1271112ed2d3SChris Wilson &error->flags); 1272112ed2d3SChris Wilson wake_up_bit(&error->flags, 1273112ed2d3SChris Wilson I915_RESET_ENGINE + engine->id); 1274112ed2d3SChris Wilson } 1275112ed2d3SChris Wilson } 1276112ed2d3SChris Wilson 1277112ed2d3SChris Wilson if (!engine_mask) 1278112ed2d3SChris Wilson goto out; 1279112ed2d3SChris Wilson 1280112ed2d3SChris Wilson /* Full reset needs the mutex, stop any other user trying to do so. 
*/ 1281112ed2d3SChris Wilson if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) { 1282112ed2d3SChris Wilson wait_event(error->reset_queue, 1283112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, &error->flags)); 1284112ed2d3SChris Wilson goto out; /* piggy-back on the other reset */ 1285112ed2d3SChris Wilson } 1286112ed2d3SChris Wilson 1287112ed2d3SChris Wilson /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 1288112ed2d3SChris Wilson synchronize_rcu_expedited(); 1289112ed2d3SChris Wilson 1290112ed2d3SChris Wilson /* Prevent any other reset-engine attempt. */ 1291112ed2d3SChris Wilson for_each_engine(engine, i915, tmp) { 1292112ed2d3SChris Wilson while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1293112ed2d3SChris Wilson &error->flags)) 1294112ed2d3SChris Wilson wait_on_bit(&error->flags, 1295112ed2d3SChris Wilson I915_RESET_ENGINE + engine->id, 1296112ed2d3SChris Wilson TASK_UNINTERRUPTIBLE); 1297112ed2d3SChris Wilson } 1298112ed2d3SChris Wilson 1299112ed2d3SChris Wilson i915_reset_device(i915, engine_mask, msg); 1300112ed2d3SChris Wilson 1301112ed2d3SChris Wilson for_each_engine(engine, i915, tmp) { 1302112ed2d3SChris Wilson clear_bit(I915_RESET_ENGINE + engine->id, 1303112ed2d3SChris Wilson &error->flags); 1304112ed2d3SChris Wilson } 1305112ed2d3SChris Wilson 1306112ed2d3SChris Wilson clear_bit(I915_RESET_BACKOFF, &error->flags); 1307112ed2d3SChris Wilson wake_up_all(&error->reset_queue); 1308112ed2d3SChris Wilson 1309112ed2d3SChris Wilson out: 1310112ed2d3SChris Wilson intel_runtime_pm_put(i915, wakeref); 1311112ed2d3SChris Wilson } 1312112ed2d3SChris Wilson 1313112ed2d3SChris Wilson int i915_reset_trylock(struct drm_i915_private *i915) 1314112ed2d3SChris Wilson { 1315112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1316112ed2d3SChris Wilson int srcu; 1317112ed2d3SChris Wilson 1318112ed2d3SChris Wilson might_lock(&error->reset_backoff_srcu); 1319112ed2d3SChris Wilson might_sleep(); 1320112ed2d3SChris Wilson 
/*
 * i915_reset_trylock() body: the RCU read section pairs with the
 * synchronize_rcu_expedited() in the full-reset path above — while
 * I915_RESET_BACKOFF is set we drop the RCU lock and sleep
 * (interruptibly, hence the -EINTR return) on reset_queue; once clear,
 * an SRCU read lock on reset_backoff_srcu is taken and its tag returned
 * to the caller.  i915_reset_unlock() releases that tag (the
 * __releases() annotation is for static lock checking).  Then
 * i915_terminally_wedged() begins: returns 0 if not wedged, and checks
 * whether a reset is still in progress (BACKOFF set) before declaring
 * -EIO, continued on the next line.
 */
1321112ed2d3SChris Wilson rcu_read_lock(); 1322112ed2d3SChris Wilson while (test_bit(I915_RESET_BACKOFF, &error->flags)) { 1323112ed2d3SChris Wilson rcu_read_unlock(); 1324112ed2d3SChris Wilson 1325112ed2d3SChris Wilson if (wait_event_interruptible(error->reset_queue, 1326112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, 1327112ed2d3SChris Wilson &error->flags))) 1328112ed2d3SChris Wilson return -EINTR; 1329112ed2d3SChris Wilson 1330112ed2d3SChris Wilson rcu_read_lock(); 1331112ed2d3SChris Wilson } 1332112ed2d3SChris Wilson srcu = srcu_read_lock(&error->reset_backoff_srcu); 1333112ed2d3SChris Wilson rcu_read_unlock(); 1334112ed2d3SChris Wilson 1335112ed2d3SChris Wilson return srcu; 1336112ed2d3SChris Wilson } 1337112ed2d3SChris Wilson 1338112ed2d3SChris Wilson void i915_reset_unlock(struct drm_i915_private *i915, int tag) 1339112ed2d3SChris Wilson __releases(&i915->gpu_error.reset_backoff_srcu) 1340112ed2d3SChris Wilson { 1341112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1342112ed2d3SChris Wilson 1343112ed2d3SChris Wilson srcu_read_unlock(&error->reset_backoff_srcu, tag); 1344112ed2d3SChris Wilson } 1345112ed2d3SChris Wilson 1346112ed2d3SChris Wilson int i915_terminally_wedged(struct drm_i915_private *i915) 1347112ed2d3SChris Wilson { 1348112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1349112ed2d3SChris Wilson 1350112ed2d3SChris Wilson might_sleep(); 1351112ed2d3SChris Wilson 1352112ed2d3SChris Wilson if (!__i915_wedged(error)) 1353112ed2d3SChris Wilson return 0; 1354112ed2d3SChris Wilson 1355112ed2d3SChris Wilson /* Reset still in progress? Maybe we will recover? */ 1356112ed2d3SChris Wilson if (!test_bit(I915_RESET_BACKOFF, &error->flags)) 1357112ed2d3SChris Wilson return -EIO; 1358112ed2d3SChris Wilson 1359112ed2d3SChris Wilson /* XXX intel_reset_finish() still takes struct_mutex!!! 
/*
 * Tail of i915_terminally_wedged(): if struct_mutex is held we bail with
 * -EAGAIN rather than sleep (the in-flight reset may need that mutex, per
 * the XXX above); otherwise wait interruptibly for the reset to finish
 * and re-check the wedged state — -EIO if still wedged, 0 if recovered.
 * Below that is the on-stack "wedge-me" watchdog: i915_wedge_me() is the
 * delayed-work callback that declares the GPU wedged and logs which named
 * operation timed out; __i915_init_wedge() arms it (ONSTACK delayed work,
 * fires after 'timeout' jiffies — presumably; unit not visible here) and
 * __i915_fini_wedge() disarms it, synchronously cancelling and
 * destroying the work item.  Callers must pair init/fini on every path.
 */
*/ 1360112ed2d3SChris Wilson if (mutex_is_locked(&i915->drm.struct_mutex)) 1361112ed2d3SChris Wilson return -EAGAIN; 1362112ed2d3SChris Wilson 1363112ed2d3SChris Wilson if (wait_event_interruptible(error->reset_queue, 1364112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, 1365112ed2d3SChris Wilson &error->flags))) 1366112ed2d3SChris Wilson return -EINTR; 1367112ed2d3SChris Wilson 1368112ed2d3SChris Wilson return __i915_wedged(error) ? -EIO : 0; 1369112ed2d3SChris Wilson } 1370112ed2d3SChris Wilson 1371112ed2d3SChris Wilson static void i915_wedge_me(struct work_struct *work) 1372112ed2d3SChris Wilson { 1373112ed2d3SChris Wilson struct i915_wedge_me *w = container_of(work, typeof(*w), work.work); 1374112ed2d3SChris Wilson 1375112ed2d3SChris Wilson dev_err(w->i915->drm.dev, 1376112ed2d3SChris Wilson "%s timed out, cancelling all in-flight rendering.\n", 1377112ed2d3SChris Wilson w->name); 1378112ed2d3SChris Wilson i915_gem_set_wedged(w->i915); 1379112ed2d3SChris Wilson } 1380112ed2d3SChris Wilson 1381112ed2d3SChris Wilson void __i915_init_wedge(struct i915_wedge_me *w, 1382112ed2d3SChris Wilson struct drm_i915_private *i915, 1383112ed2d3SChris Wilson long timeout, 1384112ed2d3SChris Wilson const char *name) 1385112ed2d3SChris Wilson { 1386112ed2d3SChris Wilson w->i915 = i915; 1387112ed2d3SChris Wilson w->name = name; 1388112ed2d3SChris Wilson 1389112ed2d3SChris Wilson INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me); 1390112ed2d3SChris Wilson schedule_delayed_work(&w->work, timeout); 1391112ed2d3SChris Wilson } 1392112ed2d3SChris Wilson 1393112ed2d3SChris Wilson void __i915_fini_wedge(struct i915_wedge_me *w) 1394112ed2d3SChris Wilson { 1395112ed2d3SChris Wilson cancel_delayed_work_sync(&w->work); 1396112ed2d3SChris Wilson destroy_delayed_work_on_stack(&w->work); 1397112ed2d3SChris Wilson w->i915 = NULL; 1398112ed2d3SChris Wilson } 1399932309fbSMichal Wajdeczko 1400932309fbSMichal Wajdeczko #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1401932309fbSMichal Wajdeczko 
/*
 * Kernel selftest convention: when CONFIG_DRM_I915_SELFTEST is enabled,
 * the selftest source is compiled directly into this translation unit so
 * it can exercise the static functions above.
 */
#include "selftest_reset.c" 1402932309fbSMichal Wajdeczko #endif 1403