1112ed2d3SChris Wilson /* 2112ed2d3SChris Wilson * SPDX-License-Identifier: MIT 3112ed2d3SChris Wilson * 4112ed2d3SChris Wilson * Copyright © 2008-2018 Intel Corporation 5112ed2d3SChris Wilson */ 6112ed2d3SChris Wilson 7112ed2d3SChris Wilson #include <linux/sched/mm.h> 8112ed2d3SChris Wilson #include <linux/stop_machine.h> 9112ed2d3SChris Wilson 10df0566a6SJani Nikula #include "display/intel_overlay.h" 11df0566a6SJani Nikula 1210be98a7SChris Wilson #include "gem/i915_gem_context.h" 1310be98a7SChris Wilson 14112ed2d3SChris Wilson #include "i915_drv.h" 15112ed2d3SChris Wilson #include "i915_gpu_error.h" 16440e2b3dSJani Nikula #include "i915_irq.h" 1779ffac85SChris Wilson #include "intel_engine_pm.h" 18eaf522f6STvrtko Ursulin #include "intel_gt.h" 1979ffac85SChris Wilson #include "intel_gt_pm.h" 20112ed2d3SChris Wilson #include "intel_reset.h" 21112ed2d3SChris Wilson 22112ed2d3SChris Wilson #include "intel_guc.h" 23112ed2d3SChris Wilson 24112ed2d3SChris Wilson #define RESET_MAX_RETRIES 3 25112ed2d3SChris Wilson 26112ed2d3SChris Wilson /* XXX How to handle concurrent GGTT updates using tiling registers? */ 27112ed2d3SChris Wilson #define RESET_UNDER_STOP_MACHINE 0 28112ed2d3SChris Wilson 29112ed2d3SChris Wilson static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 30112ed2d3SChris Wilson { 31112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, 0, set); 32112ed2d3SChris Wilson } 33112ed2d3SChris Wilson 34112ed2d3SChris Wilson static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 35112ed2d3SChris Wilson { 36112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, clr, 0); 37112ed2d3SChris Wilson } 38112ed2d3SChris Wilson 39112ed2d3SChris Wilson static void engine_skip_context(struct i915_request *rq) 40112ed2d3SChris Wilson { 41112ed2d3SChris Wilson struct intel_engine_cs *engine = rq->engine; 42112ed2d3SChris Wilson struct i915_gem_context *hung_ctx = rq->gem_context; 43112ed2d3SChris Wilson 44422d7df4SChris Wilson lockdep_assert_held(&engine->active.lock); 45112ed2d3SChris Wilson 46112ed2d3SChris Wilson if (!i915_request_is_active(rq)) 47112ed2d3SChris Wilson return; 48112ed2d3SChris Wilson 49422d7df4SChris Wilson list_for_each_entry_continue(rq, &engine->active.requests, sched.link) 50112ed2d3SChris Wilson if (rq->gem_context == hung_ctx) 51112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 52112ed2d3SChris Wilson } 53112ed2d3SChris Wilson 54112ed2d3SChris Wilson static void client_mark_guilty(struct drm_i915_file_private *file_priv, 55112ed2d3SChris Wilson const struct i915_gem_context *ctx) 56112ed2d3SChris Wilson { 57112ed2d3SChris Wilson unsigned int score; 58112ed2d3SChris Wilson unsigned long prev_hang; 59112ed2d3SChris Wilson 60112ed2d3SChris Wilson if (i915_gem_context_is_banned(ctx)) 61112ed2d3SChris Wilson score = I915_CLIENT_SCORE_CONTEXT_BAN; 62112ed2d3SChris Wilson else 63112ed2d3SChris Wilson score = 0; 64112ed2d3SChris Wilson 65112ed2d3SChris Wilson prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 66112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 67112ed2d3SChris Wilson score += I915_CLIENT_SCORE_HANG_FAST; 68112ed2d3SChris Wilson 69112ed2d3SChris Wilson if (score) { 70112ed2d3SChris Wilson atomic_add(score, &file_priv->ban_score); 71112ed2d3SChris Wilson 72112ed2d3SChris Wilson DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n", 73112ed2d3SChris Wilson ctx->name, score, 74112ed2d3SChris Wilson atomic_read(&file_priv->ban_score)); 75112ed2d3SChris Wilson } 76112ed2d3SChris Wilson } 77112ed2d3SChris Wilson 78112ed2d3SChris Wilson static bool context_mark_guilty(struct i915_gem_context *ctx) 79112ed2d3SChris Wilson { 80112ed2d3SChris Wilson unsigned long prev_hang; 81112ed2d3SChris Wilson bool banned; 82112ed2d3SChris Wilson int i; 83112ed2d3SChris Wilson 84112ed2d3SChris Wilson atomic_inc(&ctx->guilty_count); 85112ed2d3SChris Wilson 86112ed2d3SChris Wilson /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 87112ed2d3SChris Wilson if (!i915_gem_context_is_bannable(ctx)) 88112ed2d3SChris Wilson return false; 89112ed2d3SChris Wilson 90112ed2d3SChris Wilson /* Record the timestamp for the last N hangs */ 91112ed2d3SChris Wilson prev_hang = ctx->hang_timestamp[0]; 92112ed2d3SChris Wilson for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 93112ed2d3SChris Wilson ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 94112ed2d3SChris Wilson ctx->hang_timestamp[i] = jiffies; 95112ed2d3SChris Wilson 96112ed2d3SChris Wilson /* If we have hung N+1 times in rapid succession, we ban the context! */ 97112ed2d3SChris Wilson banned = !i915_gem_context_is_recoverable(ctx); 98112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 99112ed2d3SChris Wilson banned = true; 100112ed2d3SChris Wilson if (banned) { 101112ed2d3SChris Wilson DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", 102112ed2d3SChris Wilson ctx->name, atomic_read(&ctx->guilty_count)); 103112ed2d3SChris Wilson i915_gem_context_set_banned(ctx); 104112ed2d3SChris Wilson } 105112ed2d3SChris Wilson 106112ed2d3SChris Wilson if (!IS_ERR_OR_NULL(ctx->file_priv)) 107112ed2d3SChris Wilson client_mark_guilty(ctx->file_priv, ctx); 108112ed2d3SChris Wilson 109112ed2d3SChris Wilson return banned; 110112ed2d3SChris Wilson } 111112ed2d3SChris Wilson 112112ed2d3SChris Wilson static void context_mark_innocent(struct i915_gem_context *ctx) 113112ed2d3SChris Wilson { 114112ed2d3SChris Wilson atomic_inc(&ctx->active_count); 115112ed2d3SChris Wilson } 116112ed2d3SChris Wilson 117112ed2d3SChris Wilson void i915_reset_request(struct i915_request *rq, bool guilty) 118112ed2d3SChris Wilson { 119112ed2d3SChris Wilson GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n", 120112ed2d3SChris Wilson rq->engine->name, 121112ed2d3SChris Wilson rq->fence.context, 122112ed2d3SChris Wilson rq->fence.seqno, 123112ed2d3SChris Wilson yesno(guilty)); 124112ed2d3SChris Wilson 125422d7df4SChris Wilson lockdep_assert_held(&rq->engine->active.lock); 126112ed2d3SChris Wilson GEM_BUG_ON(i915_request_completed(rq)); 127112ed2d3SChris Wilson 128112ed2d3SChris Wilson if (guilty) { 129112ed2d3SChris Wilson i915_request_skip(rq, -EIO); 130112ed2d3SChris Wilson if (context_mark_guilty(rq->gem_context)) 131112ed2d3SChris Wilson engine_skip_context(rq); 132112ed2d3SChris Wilson } else { 133112ed2d3SChris Wilson dma_fence_set_error(&rq->fence, -EAGAIN); 134112ed2d3SChris Wilson context_mark_innocent(rq->gem_context); 135112ed2d3SChris Wilson } 136112ed2d3SChris Wilson } 137112ed2d3SChris Wilson 138112ed2d3SChris Wilson static void gen3_stop_engine(struct intel_engine_cs *engine) 139112ed2d3SChris Wilson { 140112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 141112ed2d3SChris Wilson const u32 base = engine->mmio_base; 142112ed2d3SChris Wilson 143112ed2d3SChris Wilson GEM_TRACE("%s\n", engine->name); 144112ed2d3SChris Wilson 145112ed2d3SChris Wilson if (intel_engine_stop_cs(engine)) 146112ed2d3SChris Wilson GEM_TRACE("%s: timed out on STOP_RING\n", engine->name); 147112ed2d3SChris Wilson 148112ed2d3SChris Wilson intel_uncore_write_fw(uncore, 149112ed2d3SChris Wilson RING_HEAD(base), 150112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_TAIL(base))); 151112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */ 152112ed2d3SChris Wilson 153112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_HEAD(base), 0); 154112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_TAIL(base), 0); 155112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, RING_TAIL(base)); 156112ed2d3SChris Wilson 157112ed2d3SChris Wilson /* The ring must be empty before it is disabled */ 158112ed2d3SChris Wilson intel_uncore_write_fw(uncore, RING_CTL(base), 0); 159112ed2d3SChris Wilson 160112ed2d3SChris Wilson /* Check acts as a post */ 161112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, RING_HEAD(base))) 162112ed2d3SChris Wilson GEM_TRACE("%s: ring head [%x] not parked\n", 163112ed2d3SChris Wilson engine->name, 164112ed2d3SChris Wilson intel_uncore_read_fw(uncore, RING_HEAD(base))); 165112ed2d3SChris Wilson } 166112ed2d3SChris Wilson 167112ed2d3SChris Wilson static void i915_stop_engines(struct drm_i915_private *i915, 168112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 169112ed2d3SChris Wilson { 170112ed2d3SChris Wilson struct intel_engine_cs *engine; 171112ed2d3SChris Wilson intel_engine_mask_t tmp; 172112ed2d3SChris Wilson 173112ed2d3SChris Wilson if (INTEL_GEN(i915) < 3) 174112ed2d3SChris Wilson return; 175112ed2d3SChris Wilson 176112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 177112ed2d3SChris Wilson gen3_stop_engine(engine); 178112ed2d3SChris Wilson } 179112ed2d3SChris Wilson 180112ed2d3SChris Wilson static bool i915_in_reset(struct pci_dev *pdev) 181112ed2d3SChris Wilson { 182112ed2d3SChris Wilson u8 gdrst; 183112ed2d3SChris Wilson 184112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 185112ed2d3SChris Wilson return gdrst & GRDOM_RESET_STATUS; 186112ed2d3SChris Wilson } 187112ed2d3SChris Wilson 188112ed2d3SChris Wilson static int i915_do_reset(struct drm_i915_private *i915, 189112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 190112ed2d3SChris Wilson unsigned int retry) 191112ed2d3SChris Wilson { 192112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 193112ed2d3SChris Wilson int err; 194112ed2d3SChris Wilson 195112ed2d3SChris Wilson /* Assert reset for at least 20 usec, and wait for acknowledgement. */ 196112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 197112ed2d3SChris Wilson udelay(50); 198112ed2d3SChris Wilson err = wait_for_atomic(i915_in_reset(pdev), 50); 199112ed2d3SChris Wilson 200112ed2d3SChris Wilson /* Clear the reset request. */ 201112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 202112ed2d3SChris Wilson udelay(50); 203112ed2d3SChris Wilson if (!err) 204112ed2d3SChris Wilson err = wait_for_atomic(!i915_in_reset(pdev), 50); 205112ed2d3SChris Wilson 206112ed2d3SChris Wilson return err; 207112ed2d3SChris Wilson } 208112ed2d3SChris Wilson 209112ed2d3SChris Wilson static bool g4x_reset_complete(struct pci_dev *pdev) 210112ed2d3SChris Wilson { 211112ed2d3SChris Wilson u8 gdrst; 212112ed2d3SChris Wilson 213112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 214112ed2d3SChris Wilson return (gdrst & GRDOM_RESET_ENABLE) == 0; 215112ed2d3SChris Wilson } 216112ed2d3SChris Wilson 217112ed2d3SChris Wilson static int g33_do_reset(struct drm_i915_private *i915, 218112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 219112ed2d3SChris Wilson unsigned int retry) 220112ed2d3SChris Wilson { 221112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 222112ed2d3SChris Wilson 223112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 224112ed2d3SChris Wilson return wait_for_atomic(g4x_reset_complete(pdev), 50); 225112ed2d3SChris Wilson } 226112ed2d3SChris Wilson 227112ed2d3SChris Wilson static int g4x_do_reset(struct drm_i915_private *i915, 228112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 229112ed2d3SChris Wilson unsigned int retry) 230112ed2d3SChris Wilson { 231112ed2d3SChris Wilson struct pci_dev *pdev = i915->drm.pdev; 232112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 233112ed2d3SChris Wilson int ret; 234112ed2d3SChris Wilson 235112ed2d3SChris Wilson /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 236112ed2d3SChris Wilson rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 237112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 238112ed2d3SChris Wilson 239112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 240112ed2d3SChris Wilson GRDOM_MEDIA | GRDOM_RESET_ENABLE); 241112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 242112ed2d3SChris Wilson if (ret) { 243112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 244112ed2d3SChris Wilson goto out; 245112ed2d3SChris Wilson } 246112ed2d3SChris Wilson 247112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 248112ed2d3SChris Wilson GRDOM_RENDER | GRDOM_RESET_ENABLE); 249112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 250112ed2d3SChris Wilson if (ret) { 251112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 252112ed2d3SChris Wilson goto out; 253112ed2d3SChris Wilson } 254112ed2d3SChris Wilson 255112ed2d3SChris Wilson out: 256112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 257112ed2d3SChris Wilson 258112ed2d3SChris Wilson rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 259112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 260112ed2d3SChris Wilson 261112ed2d3SChris Wilson return ret; 262112ed2d3SChris Wilson } 263112ed2d3SChris Wilson 264112ed2d3SChris Wilson static int ironlake_do_reset(struct drm_i915_private *i915, 265112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 266112ed2d3SChris Wilson unsigned int retry) 267112ed2d3SChris Wilson { 268112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 269112ed2d3SChris Wilson int ret; 270112ed2d3SChris Wilson 271112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 272112ed2d3SChris Wilson ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 273112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 274112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 275112ed2d3SChris Wilson 5000, 0, 276112ed2d3SChris Wilson NULL); 277112ed2d3SChris Wilson if (ret) { 278112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for render reset failed\n"); 279112ed2d3SChris Wilson goto out; 280112ed2d3SChris Wilson } 281112ed2d3SChris Wilson 282112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 283112ed2d3SChris Wilson ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 284112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 285112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 286112ed2d3SChris Wilson 5000, 0, 287112ed2d3SChris Wilson NULL); 288112ed2d3SChris Wilson if (ret) { 289112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for media reset failed\n"); 290112ed2d3SChris Wilson goto out; 291112ed2d3SChris Wilson } 292112ed2d3SChris Wilson 293112ed2d3SChris Wilson out: 294112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 0); 295112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, ILK_GDSR); 296112ed2d3SChris Wilson return ret; 297112ed2d3SChris Wilson } 298112ed2d3SChris Wilson 299112ed2d3SChris Wilson /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 300112ed2d3SChris Wilson static int gen6_hw_domain_reset(struct drm_i915_private *i915, 301112ed2d3SChris Wilson u32 hw_domain_mask) 302112ed2d3SChris Wilson { 303112ed2d3SChris Wilson struct intel_uncore *uncore = &i915->uncore; 304112ed2d3SChris Wilson int err; 305112ed2d3SChris Wilson 306112ed2d3SChris Wilson /* 307112ed2d3SChris Wilson * GEN6_GDRST is not in the gt power well, no need to check 308112ed2d3SChris Wilson * for fifo space for the write or forcewake the chip for 309112ed2d3SChris Wilson * the read 310112ed2d3SChris Wilson */ 311112ed2d3SChris Wilson intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 312112ed2d3SChris Wilson 313112ed2d3SChris Wilson /* Wait for the device to ack the reset requests */ 314112ed2d3SChris Wilson err = __intel_wait_for_register_fw(uncore, 315112ed2d3SChris Wilson GEN6_GDRST, hw_domain_mask, 0, 316112ed2d3SChris Wilson 500, 0, 317112ed2d3SChris Wilson NULL); 318112ed2d3SChris Wilson if (err) 319112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n", 320112ed2d3SChris Wilson hw_domain_mask); 321112ed2d3SChris Wilson 322112ed2d3SChris Wilson return err; 323112ed2d3SChris Wilson } 324112ed2d3SChris Wilson 325112ed2d3SChris Wilson static int gen6_reset_engines(struct drm_i915_private *i915, 326112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 327112ed2d3SChris Wilson unsigned int retry) 328112ed2d3SChris Wilson { 329112ed2d3SChris Wilson struct intel_engine_cs *engine; 330112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 331112ed2d3SChris Wilson [RCS0] = GEN6_GRDOM_RENDER, 332112ed2d3SChris Wilson [BCS0] = GEN6_GRDOM_BLT, 333112ed2d3SChris Wilson [VCS0] = GEN6_GRDOM_MEDIA, 334112ed2d3SChris Wilson [VCS1] = GEN8_GRDOM_MEDIA2, 335112ed2d3SChris Wilson [VECS0] = GEN6_GRDOM_VECS, 336112ed2d3SChris Wilson }; 337112ed2d3SChris Wilson u32 hw_mask; 338112ed2d3SChris Wilson 339112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 340112ed2d3SChris Wilson hw_mask = GEN6_GRDOM_FULL; 341112ed2d3SChris Wilson } else { 342112ed2d3SChris Wilson intel_engine_mask_t tmp; 343112ed2d3SChris Wilson 344112ed2d3SChris Wilson hw_mask = 0; 345112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 346112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 347112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 348112ed2d3SChris Wilson } 349112ed2d3SChris Wilson } 350112ed2d3SChris Wilson 351112ed2d3SChris Wilson return gen6_hw_domain_reset(i915, hw_mask); 352112ed2d3SChris Wilson } 353112ed2d3SChris Wilson 354112ed2d3SChris Wilson static u32 gen11_lock_sfc(struct intel_engine_cs *engine) 355112ed2d3SChris Wilson { 356112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 357112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 358112ed2d3SChris Wilson i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; 359112ed2d3SChris Wilson u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; 360112ed2d3SChris Wilson i915_reg_t sfc_usage; 361112ed2d3SChris Wilson u32 sfc_usage_bit; 362112ed2d3SChris Wilson u32 sfc_reset_bit; 363112ed2d3SChris Wilson 364112ed2d3SChris Wilson switch (engine->class) { 365112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 366112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 367112ed2d3SChris Wilson return 0; 368112ed2d3SChris Wilson 369112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 370112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 371112ed2d3SChris Wilson 372112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); 373112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 374112ed2d3SChris Wilson 375112ed2d3SChris Wilson sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); 376112ed2d3SChris Wilson sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; 377112ed2d3SChris Wilson sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 378112ed2d3SChris Wilson break; 379112ed2d3SChris Wilson 380112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 381112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 382112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 383112ed2d3SChris Wilson 384112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); 385112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 386112ed2d3SChris Wilson 387112ed2d3SChris Wilson sfc_usage = GEN11_VECS_SFC_USAGE(engine); 388112ed2d3SChris Wilson sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; 389112ed2d3SChris Wilson sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 390112ed2d3SChris Wilson break; 391112ed2d3SChris Wilson 392112ed2d3SChris Wilson default: 393112ed2d3SChris Wilson return 0; 394112ed2d3SChris Wilson } 395112ed2d3SChris Wilson 396112ed2d3SChris Wilson /* 397112ed2d3SChris Wilson * Tell the engine that a software reset is going to happen. The engine 398112ed2d3SChris Wilson * will then try to force lock the SFC (if currently locked, it will 399112ed2d3SChris Wilson * remain so until we tell the engine it is safe to unlock; if currently 400112ed2d3SChris Wilson * unlocked, it will ignore this and all new lock requests). If SFC 401112ed2d3SChris Wilson * ends up being locked to the engine we want to reset, we have to reset 402112ed2d3SChris Wilson * it as well (we will unlock it once the reset sequence is completed). 403112ed2d3SChris Wilson */ 404112ed2d3SChris Wilson rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 405112ed2d3SChris Wilson 406112ed2d3SChris Wilson if (__intel_wait_for_register_fw(uncore, 407112ed2d3SChris Wilson sfc_forced_lock_ack, 408112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 409112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 410112ed2d3SChris Wilson 1000, 0, NULL)) { 411112ed2d3SChris Wilson DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n"); 412112ed2d3SChris Wilson return 0; 413112ed2d3SChris Wilson } 414112ed2d3SChris Wilson 415112ed2d3SChris Wilson if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit) 416112ed2d3SChris Wilson return sfc_reset_bit; 417112ed2d3SChris Wilson 418112ed2d3SChris Wilson return 0; 419112ed2d3SChris Wilson } 420112ed2d3SChris Wilson 421112ed2d3SChris Wilson static void gen11_unlock_sfc(struct intel_engine_cs *engine) 422112ed2d3SChris Wilson { 423112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 424112ed2d3SChris Wilson u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; 425112ed2d3SChris Wilson i915_reg_t sfc_forced_lock; 426112ed2d3SChris Wilson u32 sfc_forced_lock_bit; 427112ed2d3SChris Wilson 428112ed2d3SChris Wilson switch (engine->class) { 429112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 430112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 431112ed2d3SChris Wilson return; 432112ed2d3SChris Wilson 433112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 434112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 435112ed2d3SChris Wilson break; 436112ed2d3SChris Wilson 437112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 438112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 439112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 440112ed2d3SChris Wilson break; 441112ed2d3SChris Wilson 442112ed2d3SChris Wilson default: 443112ed2d3SChris Wilson return; 444112ed2d3SChris Wilson } 445112ed2d3SChris Wilson 446112ed2d3SChris Wilson rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 447112ed2d3SChris Wilson } 448112ed2d3SChris Wilson 449112ed2d3SChris Wilson static int gen11_reset_engines(struct drm_i915_private *i915, 450112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 451112ed2d3SChris Wilson unsigned int retry) 452112ed2d3SChris Wilson { 453112ed2d3SChris Wilson const u32 hw_engine_mask[] = { 454112ed2d3SChris Wilson [RCS0] = GEN11_GRDOM_RENDER, 455112ed2d3SChris Wilson [BCS0] = GEN11_GRDOM_BLT, 456112ed2d3SChris Wilson [VCS0] = GEN11_GRDOM_MEDIA, 457112ed2d3SChris Wilson [VCS1] = GEN11_GRDOM_MEDIA2, 458112ed2d3SChris Wilson [VCS2] = GEN11_GRDOM_MEDIA3, 459112ed2d3SChris Wilson [VCS3] = GEN11_GRDOM_MEDIA4, 460112ed2d3SChris Wilson [VECS0] = GEN11_GRDOM_VECS, 461112ed2d3SChris Wilson [VECS1] = GEN11_GRDOM_VECS2, 462112ed2d3SChris Wilson }; 463112ed2d3SChris Wilson struct intel_engine_cs *engine; 464112ed2d3SChris Wilson intel_engine_mask_t tmp; 465112ed2d3SChris Wilson u32 hw_mask; 466112ed2d3SChris Wilson int ret; 467112ed2d3SChris Wilson 468112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 469112ed2d3SChris Wilson hw_mask = GEN11_GRDOM_FULL; 470112ed2d3SChris Wilson } else { 471112ed2d3SChris Wilson hw_mask = 0; 472112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 473112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 474112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 475112ed2d3SChris Wilson hw_mask |= gen11_lock_sfc(engine); 476112ed2d3SChris Wilson } 477112ed2d3SChris Wilson } 478112ed2d3SChris Wilson 479112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, hw_mask); 480112ed2d3SChris Wilson 481112ed2d3SChris Wilson if (engine_mask != ALL_ENGINES) 482112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 483112ed2d3SChris Wilson gen11_unlock_sfc(engine); 484112ed2d3SChris Wilson 485112ed2d3SChris Wilson return ret; 486112ed2d3SChris Wilson } 487112ed2d3SChris Wilson 488112ed2d3SChris Wilson static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 489112ed2d3SChris Wilson { 490112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 491112ed2d3SChris Wilson const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 492112ed2d3SChris Wilson u32 request, mask, ack; 493112ed2d3SChris Wilson int ret; 494112ed2d3SChris Wilson 495112ed2d3SChris Wilson ack = intel_uncore_read_fw(uncore, reg); 496112ed2d3SChris Wilson if (ack & RESET_CTL_CAT_ERROR) { 497112ed2d3SChris Wilson /* 498112ed2d3SChris Wilson * For catastrophic errors, ready-for-reset sequence 499112ed2d3SChris Wilson * needs to be bypassed: HAS#396813 500112ed2d3SChris Wilson */ 501112ed2d3SChris Wilson request = RESET_CTL_CAT_ERROR; 502112ed2d3SChris Wilson mask = RESET_CTL_CAT_ERROR; 503112ed2d3SChris Wilson 504112ed2d3SChris Wilson /* Catastrophic errors need to be cleared by HW */ 505112ed2d3SChris Wilson ack = 0; 506112ed2d3SChris Wilson } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 507112ed2d3SChris Wilson request = RESET_CTL_REQUEST_RESET; 508112ed2d3SChris Wilson mask = RESET_CTL_READY_TO_RESET; 509112ed2d3SChris Wilson ack = RESET_CTL_READY_TO_RESET; 510112ed2d3SChris Wilson } else { 511112ed2d3SChris Wilson return 0; 512112ed2d3SChris Wilson } 513112ed2d3SChris Wilson 514112ed2d3SChris Wilson intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 515112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 516112ed2d3SChris Wilson 700, 0, NULL); 517112ed2d3SChris Wilson if (ret) 518112ed2d3SChris Wilson DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 519112ed2d3SChris Wilson engine->name, request, 520112ed2d3SChris Wilson intel_uncore_read_fw(uncore, reg)); 521112ed2d3SChris Wilson 522112ed2d3SChris Wilson return ret; 523112ed2d3SChris Wilson } 524112ed2d3SChris Wilson 525112ed2d3SChris Wilson static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 526112ed2d3SChris Wilson { 527112ed2d3SChris Wilson intel_uncore_write_fw(engine->uncore, 528112ed2d3SChris Wilson RING_RESET_CTL(engine->mmio_base), 529112ed2d3SChris Wilson _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 530112ed2d3SChris Wilson } 531112ed2d3SChris Wilson 532112ed2d3SChris Wilson static int gen8_reset_engines(struct drm_i915_private *i915, 533112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 534112ed2d3SChris Wilson unsigned int retry) 535112ed2d3SChris Wilson { 536112ed2d3SChris Wilson struct intel_engine_cs *engine; 537112ed2d3SChris Wilson const bool reset_non_ready = retry >= 1; 538112ed2d3SChris Wilson intel_engine_mask_t tmp; 539112ed2d3SChris Wilson int ret; 540112ed2d3SChris Wilson 541112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 542112ed2d3SChris Wilson ret = gen8_engine_reset_prepare(engine); 543112ed2d3SChris Wilson if (ret && !reset_non_ready) 544112ed2d3SChris Wilson goto skip_reset; 545112ed2d3SChris Wilson 546112ed2d3SChris Wilson /* 547112ed2d3SChris Wilson * If this is not the first failed attempt to prepare, 548112ed2d3SChris Wilson * we decide to proceed anyway. 549112ed2d3SChris Wilson * 550112ed2d3SChris Wilson * By doing so we risk context corruption and with 551112ed2d3SChris Wilson * some gens (kbl), possible system hang if reset 552112ed2d3SChris Wilson * happens during active bb execution. 553112ed2d3SChris Wilson * 554112ed2d3SChris Wilson * We rather take context corruption instead of 555112ed2d3SChris Wilson * failed reset with a wedged driver/gpu. And 556112ed2d3SChris Wilson * active bb execution case should be covered by 557112ed2d3SChris Wilson * i915_stop_engines we have before the reset. 558112ed2d3SChris Wilson */ 559112ed2d3SChris Wilson } 560112ed2d3SChris Wilson 561112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 11) 562112ed2d3SChris Wilson ret = gen11_reset_engines(i915, engine_mask, retry); 563112ed2d3SChris Wilson else 564112ed2d3SChris Wilson ret = gen6_reset_engines(i915, engine_mask, retry); 565112ed2d3SChris Wilson 566112ed2d3SChris Wilson skip_reset: 567112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) 568112ed2d3SChris Wilson gen8_engine_reset_cancel(engine); 569112ed2d3SChris Wilson 570112ed2d3SChris Wilson return ret; 571112ed2d3SChris Wilson } 572112ed2d3SChris Wilson 573112ed2d3SChris Wilson typedef int (*reset_func)(struct drm_i915_private *, 574112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 575112ed2d3SChris Wilson unsigned int retry); 576112ed2d3SChris Wilson 577112ed2d3SChris Wilson static reset_func intel_get_gpu_reset(struct drm_i915_private *i915) 578112ed2d3SChris Wilson { 579112ed2d3SChris Wilson if (INTEL_GEN(i915) >= 8) 580112ed2d3SChris Wilson return gen8_reset_engines; 581112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 6) 582112ed2d3SChris Wilson return gen6_reset_engines; 583112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 5) 584112ed2d3SChris Wilson return ironlake_do_reset; 585112ed2d3SChris Wilson else if (IS_G4X(i915)) 586112ed2d3SChris Wilson return g4x_do_reset; 587112ed2d3SChris Wilson else if (IS_G33(i915) || IS_PINEVIEW(i915)) 588112ed2d3SChris Wilson return g33_do_reset; 589112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 3) 590112ed2d3SChris Wilson return i915_do_reset; 591112ed2d3SChris Wilson else 592112ed2d3SChris Wilson return NULL; 593112ed2d3SChris Wilson } 594112ed2d3SChris Wilson 595112ed2d3SChris Wilson int intel_gpu_reset(struct drm_i915_private *i915, 596112ed2d3SChris Wilson intel_engine_mask_t engine_mask) 597112ed2d3SChris Wilson { 598112ed2d3SChris Wilson const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 599112ed2d3SChris Wilson reset_func reset; 600112ed2d3SChris Wilson int ret = -ETIMEDOUT; 601112ed2d3SChris Wilson int retry; 602112ed2d3SChris Wilson 603112ed2d3SChris Wilson reset = intel_get_gpu_reset(i915); 604112ed2d3SChris Wilson if (!reset) 605112ed2d3SChris Wilson return -ENODEV; 606112ed2d3SChris Wilson 607112ed2d3SChris Wilson /* 608112ed2d3SChris Wilson * If the power well sleeps during the reset, the reset 609112ed2d3SChris Wilson * request may be dropped and never completes (causing -EIO). 610112ed2d3SChris Wilson */ 611112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 612112ed2d3SChris Wilson for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 613112ed2d3SChris Wilson /* 614112ed2d3SChris Wilson * We stop engines, otherwise we might get failed reset and a 615112ed2d3SChris Wilson * dead gpu (on elk). Also as modern gpu as kbl can suffer 616112ed2d3SChris Wilson * from system hang if batchbuffer is progressing when 617112ed2d3SChris Wilson * the reset is issued, regardless of READY_TO_RESET ack. 618112ed2d3SChris Wilson * Thus assume it is best to stop engines on all gens 619112ed2d3SChris Wilson * where we have a gpu reset. 620112ed2d3SChris Wilson * 621112ed2d3SChris Wilson * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 622112ed2d3SChris Wilson * 623112ed2d3SChris Wilson * WaMediaResetMainRingCleanup:ctg,elk (presumably) 624112ed2d3SChris Wilson * 625112ed2d3SChris Wilson * FIXME: Wa for more modern gens needs to be validated 626112ed2d3SChris Wilson */ 627112ed2d3SChris Wilson if (retry) 628112ed2d3SChris Wilson i915_stop_engines(i915, engine_mask); 629112ed2d3SChris Wilson 630112ed2d3SChris Wilson GEM_TRACE("engine_mask=%x\n", engine_mask); 631112ed2d3SChris Wilson preempt_disable(); 632112ed2d3SChris Wilson ret = reset(i915, engine_mask, retry); 633112ed2d3SChris Wilson preempt_enable(); 634112ed2d3SChris Wilson } 635112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 636112ed2d3SChris Wilson 637112ed2d3SChris Wilson return ret; 638112ed2d3SChris Wilson } 639112ed2d3SChris Wilson 640112ed2d3SChris Wilson bool intel_has_gpu_reset(struct drm_i915_private *i915) 641112ed2d3SChris Wilson { 642112ed2d3SChris Wilson if (!i915_modparams.reset) 643112ed2d3SChris Wilson return NULL; 644112ed2d3SChris Wilson 645112ed2d3SChris Wilson return intel_get_gpu_reset(i915); 646112ed2d3SChris Wilson } 647112ed2d3SChris Wilson 648112ed2d3SChris Wilson bool intel_has_reset_engine(struct drm_i915_private *i915) 649112ed2d3SChris Wilson { 650112ed2d3SChris Wilson return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2; 651112ed2d3SChris Wilson } 652112ed2d3SChris Wilson 653112ed2d3SChris Wilson int intel_reset_guc(struct drm_i915_private *i915) 654112ed2d3SChris Wilson { 655112ed2d3SChris Wilson u32 guc_domain = 656112ed2d3SChris Wilson INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 657112ed2d3SChris Wilson int ret; 658112ed2d3SChris Wilson 659112ed2d3SChris Wilson GEM_BUG_ON(!HAS_GUC(i915)); 660112ed2d3SChris Wilson 661112ed2d3SChris Wilson intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); 662112ed2d3SChris Wilson ret = gen6_hw_domain_reset(i915, guc_domain); 663112ed2d3SChris Wilson intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); 664112ed2d3SChris Wilson 665112ed2d3SChris Wilson return ret; 666112ed2d3SChris Wilson } 667112ed2d3SChris Wilson 668112ed2d3SChris Wilson /* 669112ed2d3SChris Wilson * Ensure irq handler finishes, and not run again. 670112ed2d3SChris Wilson * Also return the active request so that we only search for it once. 671112ed2d3SChris Wilson */ 672112ed2d3SChris Wilson static void reset_prepare_engine(struct intel_engine_cs *engine) 673112ed2d3SChris Wilson { 674112ed2d3SChris Wilson /* 675112ed2d3SChris Wilson * During the reset sequence, we must prevent the engine from 676112ed2d3SChris Wilson * entering RC6. As the context state is undefined until we restart 677112ed2d3SChris Wilson * the engine, if it does enter RC6 during the reset, the state 678112ed2d3SChris Wilson * written to the powercontext is undefined and so we may lose 679112ed2d3SChris Wilson * GPU state upon resume, i.e. fail to restart after a reset. 680112ed2d3SChris Wilson */ 68179ffac85SChris Wilson intel_engine_pm_get(engine); 682112ed2d3SChris Wilson intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 683112ed2d3SChris Wilson engine->reset.prepare(engine); 684112ed2d3SChris Wilson } 685112ed2d3SChris Wilson 686112ed2d3SChris Wilson static void revoke_mmaps(struct drm_i915_private *i915) 687112ed2d3SChris Wilson { 688112ed2d3SChris Wilson int i; 689112ed2d3SChris Wilson 6900cf289bdSChris Wilson for (i = 0; i < i915->ggtt.num_fences; i++) { 691112ed2d3SChris Wilson struct drm_vma_offset_node *node; 692112ed2d3SChris Wilson struct i915_vma *vma; 693112ed2d3SChris Wilson u64 vma_offset; 694112ed2d3SChris Wilson 6950cf289bdSChris Wilson vma = READ_ONCE(i915->ggtt.fence_regs[i].vma); 696112ed2d3SChris Wilson if (!vma) 697112ed2d3SChris Wilson continue; 698112ed2d3SChris Wilson 699112ed2d3SChris Wilson if (!i915_vma_has_userfault(vma)) 700112ed2d3SChris Wilson continue; 701112ed2d3SChris Wilson 7020cf289bdSChris Wilson GEM_BUG_ON(vma->fence != &i915->ggtt.fence_regs[i]); 703112ed2d3SChris Wilson node = &vma->obj->base.vma_node; 704112ed2d3SChris Wilson vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 705112ed2d3SChris Wilson unmap_mapping_range(i915->drm.anon_inode->i_mapping, 706112ed2d3SChris Wilson drm_vma_node_offset_addr(node) + vma_offset, 707112ed2d3SChris Wilson vma->size, 708112ed2d3SChris Wilson 1); 709112ed2d3SChris Wilson } 710112ed2d3SChris Wilson } 711112ed2d3SChris Wilson 712112ed2d3SChris Wilson static void reset_prepare(struct drm_i915_private *i915) 713112ed2d3SChris Wilson { 714112ed2d3SChris Wilson struct intel_engine_cs *engine; 715112ed2d3SChris Wilson enum intel_engine_id id; 716112ed2d3SChris Wilson 71779ffac85SChris Wilson intel_gt_pm_get(i915); 718112ed2d3SChris Wilson for_each_engine(engine, i915, id) 719112ed2d3SChris Wilson reset_prepare_engine(engine); 720112ed2d3SChris Wilson 721112ed2d3SChris Wilson intel_uc_reset_prepare(i915); 722112ed2d3SChris Wilson } 723112ed2d3SChris Wilson 724112ed2d3SChris Wilson static void gt_revoke(struct drm_i915_private *i915) 725112ed2d3SChris Wilson { 726112ed2d3SChris Wilson revoke_mmaps(i915); 727112ed2d3SChris Wilson } 728112ed2d3SChris Wilson 729112ed2d3SChris Wilson static int gt_reset(struct drm_i915_private *i915, 730112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 731112ed2d3SChris Wilson { 732112ed2d3SChris Wilson struct intel_engine_cs *engine; 733112ed2d3SChris Wilson enum intel_engine_id id; 734112ed2d3SChris Wilson int err; 735112ed2d3SChris Wilson 736112ed2d3SChris Wilson /* 737112ed2d3SChris Wilson * Everything depends on having the GTT running, so we need to start 738112ed2d3SChris Wilson * there. 739112ed2d3SChris Wilson */ 740112ed2d3SChris Wilson err = i915_ggtt_enable_hw(i915); 741112ed2d3SChris Wilson if (err) 742112ed2d3SChris Wilson return err; 743112ed2d3SChris Wilson 744112ed2d3SChris Wilson for_each_engine(engine, i915, id) 745112ed2d3SChris Wilson intel_engine_reset(engine, stalled_mask & engine->mask); 746112ed2d3SChris Wilson 747112ed2d3SChris Wilson i915_gem_restore_fences(i915); 748112ed2d3SChris Wilson 749112ed2d3SChris Wilson return err; 750112ed2d3SChris Wilson } 751112ed2d3SChris Wilson 752112ed2d3SChris Wilson static void reset_finish_engine(struct intel_engine_cs *engine) 753112ed2d3SChris Wilson { 754112ed2d3SChris Wilson engine->reset.finish(engine); 75579ffac85SChris Wilson intel_engine_pm_put(engine); 756112ed2d3SChris Wilson intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 757112ed2d3SChris Wilson } 758112ed2d3SChris Wilson 759112ed2d3SChris Wilson static void reset_finish(struct drm_i915_private *i915) 760112ed2d3SChris Wilson { 761112ed2d3SChris Wilson struct intel_engine_cs *engine; 762112ed2d3SChris Wilson enum intel_engine_id id; 763112ed2d3SChris Wilson 764112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 765112ed2d3SChris Wilson reset_finish_engine(engine); 766112ed2d3SChris Wilson intel_engine_signal_breadcrumbs(engine); 767112ed2d3SChris Wilson } 76879ffac85SChris Wilson intel_gt_pm_put(i915); 769112ed2d3SChris Wilson } 770112ed2d3SChris Wilson 771112ed2d3SChris Wilson static void nop_submit_request(struct i915_request *request) 772112ed2d3SChris Wilson { 773112ed2d3SChris Wilson struct intel_engine_cs *engine = request->engine; 774112ed2d3SChris Wilson unsigned long flags; 775112ed2d3SChris Wilson 776112ed2d3SChris Wilson GEM_TRACE("%s fence %llx:%lld -> -EIO\n", 777112ed2d3SChris Wilson engine->name, request->fence.context, request->fence.seqno); 778112ed2d3SChris Wilson dma_fence_set_error(&request->fence, -EIO); 779112ed2d3SChris Wilson 780422d7df4SChris Wilson spin_lock_irqsave(&engine->active.lock, flags); 781112ed2d3SChris Wilson __i915_request_submit(request); 782112ed2d3SChris Wilson i915_request_mark_complete(request); 783422d7df4SChris Wilson spin_unlock_irqrestore(&engine->active.lock, flags); 784112ed2d3SChris Wilson 785112ed2d3SChris Wilson intel_engine_queue_breadcrumbs(engine); 786112ed2d3SChris Wilson } 787112ed2d3SChris Wilson 788112ed2d3SChris Wilson static void __i915_gem_set_wedged(struct drm_i915_private *i915) 789112ed2d3SChris Wilson { 790112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 791112ed2d3SChris Wilson struct intel_engine_cs *engine; 792112ed2d3SChris Wilson enum intel_engine_id id; 793112ed2d3SChris Wilson 794112ed2d3SChris Wilson if (test_bit(I915_WEDGED, &error->flags)) 795112ed2d3SChris Wilson return; 796112ed2d3SChris Wilson 797112ed2d3SChris Wilson if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) { 798112ed2d3SChris Wilson struct drm_printer p = drm_debug_printer(__func__); 799112ed2d3SChris Wilson 800112ed2d3SChris Wilson for_each_engine(engine, i915, id) 801112ed2d3SChris Wilson intel_engine_dump(engine, &p, "%s\n", engine->name); 802112ed2d3SChris Wilson } 803112ed2d3SChris Wilson 804112ed2d3SChris Wilson GEM_TRACE("start\n"); 805112ed2d3SChris Wilson 806112ed2d3SChris Wilson /* 807112ed2d3SChris Wilson * First, stop submission to hw, but do not yet complete requests by 808112ed2d3SChris Wilson * rolling the global seqno forward (since this would complete requests 809112ed2d3SChris Wilson * for which we haven't set the fence error to EIO yet). 810112ed2d3SChris Wilson */ 811112ed2d3SChris Wilson reset_prepare(i915); 812112ed2d3SChris Wilson 813112ed2d3SChris Wilson /* Even if the GPU reset fails, it should still stop the engines */ 814112ed2d3SChris Wilson if (!INTEL_INFO(i915)->gpu_reset_clobbers_display) 815112ed2d3SChris Wilson intel_gpu_reset(i915, ALL_ENGINES); 816112ed2d3SChris Wilson 817112ed2d3SChris Wilson for_each_engine(engine, i915, id) { 818112ed2d3SChris Wilson engine->submit_request = nop_submit_request; 819112ed2d3SChris Wilson engine->schedule = NULL; 820112ed2d3SChris Wilson } 821112ed2d3SChris Wilson i915->caps.scheduler = 0; 822112ed2d3SChris Wilson 823112ed2d3SChris Wilson /* 824112ed2d3SChris Wilson * Make sure no request can slip through without getting completed by 825112ed2d3SChris Wilson * either this call here to intel_engine_write_global_seqno, or the one 826112ed2d3SChris Wilson * in nop_submit_request. 827112ed2d3SChris Wilson */ 828112ed2d3SChris Wilson synchronize_rcu_expedited(); 82979ffac85SChris Wilson set_bit(I915_WEDGED, &error->flags); 830112ed2d3SChris Wilson 831112ed2d3SChris Wilson /* Mark all executing requests as skipped */ 832112ed2d3SChris Wilson for_each_engine(engine, i915, id) 833112ed2d3SChris Wilson engine->cancel_requests(engine); 834112ed2d3SChris Wilson 835112ed2d3SChris Wilson reset_finish(i915); 836112ed2d3SChris Wilson 837112ed2d3SChris Wilson GEM_TRACE("end\n"); 838112ed2d3SChris Wilson } 839112ed2d3SChris Wilson 840112ed2d3SChris Wilson void i915_gem_set_wedged(struct drm_i915_private *i915) 841112ed2d3SChris Wilson { 842112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 843112ed2d3SChris Wilson intel_wakeref_t wakeref; 844112ed2d3SChris Wilson 845112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 846c447ff7dSDaniele Ceraolo Spurio with_intel_runtime_pm(&i915->runtime_pm, wakeref) 847112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 848112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 849112ed2d3SChris Wilson } 850112ed2d3SChris Wilson 851112ed2d3SChris Wilson static bool __i915_gem_unset_wedged(struct drm_i915_private *i915) 852112ed2d3SChris Wilson { 853112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 854112ed2d3SChris Wilson struct i915_timeline *tl; 855112ed2d3SChris Wilson 856112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 857112ed2d3SChris Wilson return true; 858112ed2d3SChris Wilson 859112ed2d3SChris Wilson if (!i915->gt.scratch) /* Never full initialised, recovery impossible */ 860112ed2d3SChris Wilson return false; 861112ed2d3SChris Wilson 862112ed2d3SChris Wilson GEM_TRACE("start\n"); 863112ed2d3SChris Wilson 864112ed2d3SChris Wilson /* 865112ed2d3SChris Wilson * Before unwedging, make sure that all pending operations 866112ed2d3SChris Wilson * are flushed and errored out - we may have requests waiting upon 867112ed2d3SChris Wilson * third party fences. We marked all inflight requests as EIO, and 868112ed2d3SChris Wilson * every execbuf since returned EIO, for consistency we want all 869112ed2d3SChris Wilson * the currently pending requests to also be marked as EIO, which 870112ed2d3SChris Wilson * is done inside our nop_submit_request - and so we must wait. 871112ed2d3SChris Wilson * 872112ed2d3SChris Wilson * No more can be submitted until we reset the wedged bit. 873112ed2d3SChris Wilson */ 874112ed2d3SChris Wilson mutex_lock(&i915->gt.timelines.mutex); 875112ed2d3SChris Wilson list_for_each_entry(tl, &i915->gt.timelines.active_list, link) { 876112ed2d3SChris Wilson struct i915_request *rq; 877112ed2d3SChris Wilson 878112ed2d3SChris Wilson rq = i915_active_request_get_unlocked(&tl->last_request); 879112ed2d3SChris Wilson if (!rq) 880112ed2d3SChris Wilson continue; 881112ed2d3SChris Wilson 882112ed2d3SChris Wilson /* 883112ed2d3SChris Wilson * All internal dependencies (i915_requests) will have 884112ed2d3SChris Wilson * been flushed by the set-wedge, but we may be stuck waiting 885112ed2d3SChris Wilson * for external fences. These should all be capped to 10s 886112ed2d3SChris Wilson * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 887112ed2d3SChris Wilson * in the worst case. 888112ed2d3SChris Wilson */ 889112ed2d3SChris Wilson dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT); 890112ed2d3SChris Wilson i915_request_put(rq); 891112ed2d3SChris Wilson } 892112ed2d3SChris Wilson mutex_unlock(&i915->gt.timelines.mutex); 893112ed2d3SChris Wilson 89479ffac85SChris Wilson intel_gt_sanitize(i915, false); 895112ed2d3SChris Wilson 896112ed2d3SChris Wilson /* 897112ed2d3SChris Wilson * Undo nop_submit_request. We prevent all new i915 requests from 898112ed2d3SChris Wilson * being queued (by disallowing execbuf whilst wedged) so having 899112ed2d3SChris Wilson * waited for all active requests above, we know the system is idle 900112ed2d3SChris Wilson * and do not have to worry about a thread being inside 901112ed2d3SChris Wilson * engine->submit_request() as we swap over. So unlike installing 902112ed2d3SChris Wilson * the nop_submit_request on reset, we can do this from normal 903112ed2d3SChris Wilson * context and do not require stop_machine(). 904112ed2d3SChris Wilson */ 905112ed2d3SChris Wilson intel_engines_reset_default_submission(i915); 906112ed2d3SChris Wilson 907112ed2d3SChris Wilson GEM_TRACE("end\n"); 908112ed2d3SChris Wilson 909112ed2d3SChris Wilson smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 910112ed2d3SChris Wilson clear_bit(I915_WEDGED, &i915->gpu_error.flags); 911112ed2d3SChris Wilson 912112ed2d3SChris Wilson return true; 913112ed2d3SChris Wilson } 914112ed2d3SChris Wilson 915112ed2d3SChris Wilson bool i915_gem_unset_wedged(struct drm_i915_private *i915) 916112ed2d3SChris Wilson { 917112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 918112ed2d3SChris Wilson bool result; 919112ed2d3SChris Wilson 920112ed2d3SChris Wilson mutex_lock(&error->wedge_mutex); 921112ed2d3SChris Wilson result = __i915_gem_unset_wedged(i915); 922112ed2d3SChris Wilson mutex_unlock(&error->wedge_mutex); 923112ed2d3SChris Wilson 924112ed2d3SChris Wilson return result; 925112ed2d3SChris Wilson } 926112ed2d3SChris Wilson 927112ed2d3SChris Wilson static int do_reset(struct drm_i915_private *i915, 928112ed2d3SChris Wilson intel_engine_mask_t stalled_mask) 929112ed2d3SChris Wilson { 930112ed2d3SChris Wilson int err, i; 931112ed2d3SChris Wilson 932112ed2d3SChris Wilson gt_revoke(i915); 933112ed2d3SChris Wilson 934112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 935112ed2d3SChris Wilson for (i = 0; err && i < RESET_MAX_RETRIES; i++) { 936112ed2d3SChris Wilson msleep(10 * (i + 1)); 937112ed2d3SChris Wilson err = intel_gpu_reset(i915, ALL_ENGINES); 938112ed2d3SChris Wilson } 939112ed2d3SChris Wilson if (err) 940112ed2d3SChris Wilson return err; 941112ed2d3SChris Wilson 942112ed2d3SChris Wilson return gt_reset(i915, stalled_mask); 943112ed2d3SChris Wilson } 944112ed2d3SChris Wilson 945112ed2d3SChris Wilson /** 946112ed2d3SChris Wilson * i915_reset - reset chip after a hang 947112ed2d3SChris Wilson * @i915: #drm_i915_private to reset 948112ed2d3SChris Wilson * @stalled_mask: mask of the stalled engines with the guilty requests 949112ed2d3SChris Wilson * @reason: user error message for why we are resetting 950112ed2d3SChris Wilson * 951112ed2d3SChris Wilson * Reset the chip. Useful if a hang is detected. Marks the device as wedged 952112ed2d3SChris Wilson * on failure. 953112ed2d3SChris Wilson * 954112ed2d3SChris Wilson * Procedure is fairly simple: 955112ed2d3SChris Wilson * - reset the chip using the reset reg 956112ed2d3SChris Wilson * - re-init context state 957112ed2d3SChris Wilson * - re-init hardware status page 958112ed2d3SChris Wilson * - re-init ring buffer 959112ed2d3SChris Wilson * - re-init interrupt state 960112ed2d3SChris Wilson * - re-init display 961112ed2d3SChris Wilson */ 962112ed2d3SChris Wilson void i915_reset(struct drm_i915_private *i915, 963112ed2d3SChris Wilson intel_engine_mask_t stalled_mask, 964112ed2d3SChris Wilson const char *reason) 965112ed2d3SChris Wilson { 966112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 967112ed2d3SChris Wilson int ret; 968112ed2d3SChris Wilson 969112ed2d3SChris Wilson GEM_TRACE("flags=%lx\n", error->flags); 970112ed2d3SChris Wilson 971112ed2d3SChris Wilson might_sleep(); 972112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); 97384383d2eSChris Wilson mutex_lock(&error->wedge_mutex); 974112ed2d3SChris Wilson 975112ed2d3SChris Wilson /* Clear any previous failed attempts at recovery. Time to try again. */ 976112ed2d3SChris Wilson if (!__i915_gem_unset_wedged(i915)) 97733df8a76SChris Wilson goto unlock; 978112ed2d3SChris Wilson 979112ed2d3SChris Wilson if (reason) 980112ed2d3SChris Wilson dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); 981112ed2d3SChris Wilson error->reset_count++; 982112ed2d3SChris Wilson 983112ed2d3SChris Wilson reset_prepare(i915); 984112ed2d3SChris Wilson 985112ed2d3SChris Wilson if (!intel_has_gpu_reset(i915)) { 986112ed2d3SChris Wilson if (i915_modparams.reset) 987112ed2d3SChris Wilson dev_err(i915->drm.dev, "GPU reset not supported\n"); 988112ed2d3SChris Wilson else 989112ed2d3SChris Wilson DRM_DEBUG_DRIVER("GPU reset disabled\n"); 990112ed2d3SChris Wilson goto error; 991112ed2d3SChris Wilson } 992112ed2d3SChris Wilson 993112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 994112ed2d3SChris Wilson intel_runtime_pm_disable_interrupts(i915); 995112ed2d3SChris Wilson 996112ed2d3SChris Wilson if (do_reset(i915, stalled_mask)) { 997112ed2d3SChris Wilson dev_err(i915->drm.dev, "Failed to reset chip\n"); 998112ed2d3SChris Wilson goto taint; 999112ed2d3SChris Wilson } 1000112ed2d3SChris Wilson 1001112ed2d3SChris Wilson if (INTEL_INFO(i915)->gpu_reset_clobbers_display) 1002112ed2d3SChris Wilson intel_runtime_pm_enable_interrupts(i915); 1003112ed2d3SChris Wilson 1004112ed2d3SChris Wilson intel_overlay_reset(i915); 1005112ed2d3SChris Wilson 1006112ed2d3SChris Wilson /* 1007112ed2d3SChris Wilson * Next we need to restore the context, but we don't use those 1008112ed2d3SChris Wilson * yet either... 1009112ed2d3SChris Wilson * 1010112ed2d3SChris Wilson * Ring buffer needs to be re-initialized in the KMS case, or if X 1011112ed2d3SChris Wilson * was running at the time of the reset (i.e. we weren't VT 1012112ed2d3SChris Wilson * switched away). 1013112ed2d3SChris Wilson */ 1014112ed2d3SChris Wilson ret = i915_gem_init_hw(i915); 1015112ed2d3SChris Wilson if (ret) { 1016112ed2d3SChris Wilson DRM_ERROR("Failed to initialise HW following reset (%d)\n", 1017112ed2d3SChris Wilson ret); 1018112ed2d3SChris Wilson goto error; 1019112ed2d3SChris Wilson } 1020112ed2d3SChris Wilson 1021112ed2d3SChris Wilson i915_queue_hangcheck(i915); 1022112ed2d3SChris Wilson 1023112ed2d3SChris Wilson finish: 1024112ed2d3SChris Wilson reset_finish(i915); 102533df8a76SChris Wilson unlock: 102684383d2eSChris Wilson mutex_unlock(&error->wedge_mutex); 1027112ed2d3SChris Wilson return; 1028112ed2d3SChris Wilson 1029112ed2d3SChris Wilson taint: 1030112ed2d3SChris Wilson /* 1031112ed2d3SChris Wilson * History tells us that if we cannot reset the GPU now, we 1032112ed2d3SChris Wilson * never will. This then impacts everything that is run 1033112ed2d3SChris Wilson * subsequently. On failing the reset, we mark the driver 1034112ed2d3SChris Wilson * as wedged, preventing further execution on the GPU. 1035112ed2d3SChris Wilson * We also want to go one step further and add a taint to the 1036112ed2d3SChris Wilson * kernel so that any subsequent faults can be traced back to 1037112ed2d3SChris Wilson * this failure. This is important for CI, where if the 1038112ed2d3SChris Wilson * GPU/driver fails we would like to reboot and restart testing 1039112ed2d3SChris Wilson * rather than continue on into oblivion. For everyone else, 1040112ed2d3SChris Wilson * the system should still plod along, but they have been warned! 1041112ed2d3SChris Wilson */ 104218ecc6c5SChris Wilson add_taint_for_CI(TAINT_WARN); 1043112ed2d3SChris Wilson error: 1044112ed2d3SChris Wilson __i915_gem_set_wedged(i915); 1045112ed2d3SChris Wilson goto finish; 1046112ed2d3SChris Wilson } 1047112ed2d3SChris Wilson 1048112ed2d3SChris Wilson static inline int intel_gt_reset_engine(struct drm_i915_private *i915, 1049112ed2d3SChris Wilson struct intel_engine_cs *engine) 1050112ed2d3SChris Wilson { 1051112ed2d3SChris Wilson return intel_gpu_reset(i915, engine->mask); 1052112ed2d3SChris Wilson } 1053112ed2d3SChris Wilson 1054112ed2d3SChris Wilson /** 1055112ed2d3SChris Wilson * i915_reset_engine - reset GPU engine to recover from a hang 1056112ed2d3SChris Wilson * @engine: engine to reset 1057112ed2d3SChris Wilson * @msg: reason for GPU reset; or NULL for no dev_notice() 1058112ed2d3SChris Wilson * 1059112ed2d3SChris Wilson * Reset a specific GPU engine. Useful if a hang is detected. 1060112ed2d3SChris Wilson * Returns zero on successful reset or otherwise an error code. 1061112ed2d3SChris Wilson * 1062112ed2d3SChris Wilson * Procedure is: 1063112ed2d3SChris Wilson * - identifies the request that caused the hang and it is dropped 1064112ed2d3SChris Wilson * - reset engine (which will force the engine to idle) 1065112ed2d3SChris Wilson * - re-init/configure engine 1066112ed2d3SChris Wilson */ 1067112ed2d3SChris Wilson int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) 1068112ed2d3SChris Wilson { 1069112ed2d3SChris Wilson struct i915_gpu_error *error = &engine->i915->gpu_error; 1070112ed2d3SChris Wilson int ret; 1071112ed2d3SChris Wilson 1072112ed2d3SChris Wilson GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); 1073112ed2d3SChris Wilson GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); 1074112ed2d3SChris Wilson 107579ffac85SChris Wilson if (!intel_wakeref_active(&engine->wakeref)) 107679ffac85SChris Wilson return 0; 107779ffac85SChris Wilson 1078112ed2d3SChris Wilson reset_prepare_engine(engine); 1079112ed2d3SChris Wilson 1080112ed2d3SChris Wilson if (msg) 1081112ed2d3SChris Wilson dev_notice(engine->i915->drm.dev, 1082112ed2d3SChris Wilson "Resetting %s for %s\n", engine->name, msg); 1083112ed2d3SChris Wilson error->reset_engine_count[engine->id]++; 1084112ed2d3SChris Wilson 1085112ed2d3SChris Wilson if (!engine->i915->guc.execbuf_client) 1086112ed2d3SChris Wilson ret = intel_gt_reset_engine(engine->i915, engine); 1087112ed2d3SChris Wilson else 1088112ed2d3SChris Wilson ret = intel_guc_reset_engine(&engine->i915->guc, engine); 1089112ed2d3SChris Wilson if (ret) { 1090112ed2d3SChris Wilson /* If we fail here, we expect to fallback to a global reset */ 1091112ed2d3SChris Wilson DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n", 1092112ed2d3SChris Wilson engine->i915->guc.execbuf_client ? "GuC " : "", 1093112ed2d3SChris Wilson engine->name, ret); 1094112ed2d3SChris Wilson goto out; 1095112ed2d3SChris Wilson } 1096112ed2d3SChris Wilson 1097112ed2d3SChris Wilson /* 1098112ed2d3SChris Wilson * The request that caused the hang is stuck on elsp, we know the 1099112ed2d3SChris Wilson * active request and can drop it, adjust head to skip the offending 1100112ed2d3SChris Wilson * request to resume executing remaining requests in the queue. 1101112ed2d3SChris Wilson */ 1102112ed2d3SChris Wilson intel_engine_reset(engine, true); 1103112ed2d3SChris Wilson 1104112ed2d3SChris Wilson /* 1105112ed2d3SChris Wilson * The engine and its registers (and workarounds in case of render) 1106112ed2d3SChris Wilson * have been reset to their default values. Follow the init_ring 1107112ed2d3SChris Wilson * process to program RING_MODE, HWSP and re-enable submission. 1108112ed2d3SChris Wilson */ 110979ffac85SChris Wilson ret = engine->resume(engine); 1110112ed2d3SChris Wilson if (ret) 1111112ed2d3SChris Wilson goto out; 1112112ed2d3SChris Wilson 1113112ed2d3SChris Wilson out: 1114112ed2d3SChris Wilson intel_engine_cancel_stop_cs(engine); 1115112ed2d3SChris Wilson reset_finish_engine(engine); 1116112ed2d3SChris Wilson return ret; 1117112ed2d3SChris Wilson } 1118112ed2d3SChris Wilson 1119112ed2d3SChris Wilson static void i915_reset_device(struct drm_i915_private *i915, 1120112ed2d3SChris Wilson u32 engine_mask, 1121112ed2d3SChris Wilson const char *reason) 1122112ed2d3SChris Wilson { 1123112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1124112ed2d3SChris Wilson struct kobject *kobj = &i915->drm.primary->kdev->kobj; 1125112ed2d3SChris Wilson char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1126112ed2d3SChris Wilson char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1127112ed2d3SChris Wilson char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1128112ed2d3SChris Wilson struct i915_wedge_me w; 1129112ed2d3SChris Wilson 1130112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1131112ed2d3SChris Wilson 1132112ed2d3SChris Wilson DRM_DEBUG_DRIVER("resetting chip\n"); 1133112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1134112ed2d3SChris Wilson 1135112ed2d3SChris Wilson /* Use a watchdog to ensure that our reset completes */ 1136112ed2d3SChris Wilson i915_wedge_on_timeout(&w, i915, 5 * HZ) { 1137112ed2d3SChris Wilson intel_prepare_reset(i915); 1138112ed2d3SChris Wilson 1139112ed2d3SChris Wilson /* Flush everyone using a resource about to be clobbered */ 1140112ed2d3SChris Wilson synchronize_srcu_expedited(&error->reset_backoff_srcu); 1141112ed2d3SChris Wilson 1142112ed2d3SChris Wilson i915_reset(i915, engine_mask, reason); 1143112ed2d3SChris Wilson 1144112ed2d3SChris Wilson intel_finish_reset(i915); 1145112ed2d3SChris Wilson } 1146112ed2d3SChris Wilson 1147112ed2d3SChris Wilson if (!test_bit(I915_WEDGED, &error->flags)) 1148112ed2d3SChris Wilson kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1149112ed2d3SChris Wilson } 1150112ed2d3SChris Wilson 1151112ed2d3SChris Wilson /** 1152112ed2d3SChris Wilson * i915_handle_error - handle a gpu error 1153112ed2d3SChris Wilson * @i915: i915 device private 1154112ed2d3SChris Wilson * @engine_mask: mask representing engines that are hung 1155112ed2d3SChris Wilson * @flags: control flags 1156112ed2d3SChris Wilson * @fmt: Error message format string 1157112ed2d3SChris Wilson * 1158112ed2d3SChris Wilson * Do some basic checking of register state at error time and 1159112ed2d3SChris Wilson * dump it to the syslog. Also call i915_capture_error_state() to make 1160112ed2d3SChris Wilson * sure we get a record and make it available in debugfs. Fire a uevent 1161112ed2d3SChris Wilson * so userspace knows something bad happened (should trigger collection 1162112ed2d3SChris Wilson * of a ring dump etc.). 1163112ed2d3SChris Wilson */ 1164112ed2d3SChris Wilson void i915_handle_error(struct drm_i915_private *i915, 1165112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 1166112ed2d3SChris Wilson unsigned long flags, 1167112ed2d3SChris Wilson const char *fmt, ...) 1168112ed2d3SChris Wilson { 1169112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1170112ed2d3SChris Wilson struct intel_engine_cs *engine; 1171112ed2d3SChris Wilson intel_wakeref_t wakeref; 1172112ed2d3SChris Wilson intel_engine_mask_t tmp; 1173112ed2d3SChris Wilson char error_msg[80]; 1174112ed2d3SChris Wilson char *msg = NULL; 1175112ed2d3SChris Wilson 1176112ed2d3SChris Wilson if (fmt) { 1177112ed2d3SChris Wilson va_list args; 1178112ed2d3SChris Wilson 1179112ed2d3SChris Wilson va_start(args, fmt); 1180112ed2d3SChris Wilson vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1181112ed2d3SChris Wilson va_end(args); 1182112ed2d3SChris Wilson 1183112ed2d3SChris Wilson msg = error_msg; 1184112ed2d3SChris Wilson } 1185112ed2d3SChris Wilson 1186112ed2d3SChris Wilson /* 1187112ed2d3SChris Wilson * In most cases it's guaranteed that we get here with an RPM 1188112ed2d3SChris Wilson * reference held, for example because there is a pending GPU 1189112ed2d3SChris Wilson * request that won't finish until the reset is done. This 1190112ed2d3SChris Wilson * isn't the case at least when we get here by doing a 1191112ed2d3SChris Wilson * simulated reset via debugfs, so get an RPM reference. 1192112ed2d3SChris Wilson */ 1193d858d569SDaniele Ceraolo Spurio wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1194112ed2d3SChris Wilson 1195112ed2d3SChris Wilson engine_mask &= INTEL_INFO(i915)->engine_mask; 1196112ed2d3SChris Wilson 1197112ed2d3SChris Wilson if (flags & I915_ERROR_CAPTURE) { 1198112ed2d3SChris Wilson i915_capture_error_state(i915, engine_mask, msg); 1199eaf522f6STvrtko Ursulin intel_gt_clear_error_registers(&i915->gt, engine_mask); 1200112ed2d3SChris Wilson } 1201112ed2d3SChris Wilson 1202112ed2d3SChris Wilson /* 1203112ed2d3SChris Wilson * Try engine reset when available. We fall back to full reset if 1204112ed2d3SChris Wilson * single reset fails. 1205112ed2d3SChris Wilson */ 1206112ed2d3SChris Wilson if (intel_has_reset_engine(i915) && !__i915_wedged(error)) { 1207112ed2d3SChris Wilson for_each_engine_masked(engine, i915, engine_mask, tmp) { 1208112ed2d3SChris Wilson BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); 1209112ed2d3SChris Wilson if (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1210112ed2d3SChris Wilson &error->flags)) 1211112ed2d3SChris Wilson continue; 1212112ed2d3SChris Wilson 1213112ed2d3SChris Wilson if (i915_reset_engine(engine, msg) == 0) 1214112ed2d3SChris Wilson engine_mask &= ~engine->mask; 1215112ed2d3SChris Wilson 1216112ed2d3SChris Wilson clear_bit(I915_RESET_ENGINE + engine->id, 1217112ed2d3SChris Wilson &error->flags); 1218112ed2d3SChris Wilson wake_up_bit(&error->flags, 1219112ed2d3SChris Wilson I915_RESET_ENGINE + engine->id); 1220112ed2d3SChris Wilson } 1221112ed2d3SChris Wilson } 1222112ed2d3SChris Wilson 1223112ed2d3SChris Wilson if (!engine_mask) 1224112ed2d3SChris Wilson goto out; 1225112ed2d3SChris Wilson 1226112ed2d3SChris Wilson /* Full reset needs the mutex, stop any other user trying to do so. */ 1227112ed2d3SChris Wilson if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) { 1228112ed2d3SChris Wilson wait_event(error->reset_queue, 1229112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, &error->flags)); 1230112ed2d3SChris Wilson goto out; /* piggy-back on the other reset */ 1231112ed2d3SChris Wilson } 1232112ed2d3SChris Wilson 1233112ed2d3SChris Wilson /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 1234112ed2d3SChris Wilson synchronize_rcu_expedited(); 1235112ed2d3SChris Wilson 1236112ed2d3SChris Wilson /* Prevent any other reset-engine attempt. */ 1237112ed2d3SChris Wilson for_each_engine(engine, i915, tmp) { 1238112ed2d3SChris Wilson while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1239112ed2d3SChris Wilson &error->flags)) 1240112ed2d3SChris Wilson wait_on_bit(&error->flags, 1241112ed2d3SChris Wilson I915_RESET_ENGINE + engine->id, 1242112ed2d3SChris Wilson TASK_UNINTERRUPTIBLE); 1243112ed2d3SChris Wilson } 1244112ed2d3SChris Wilson 1245112ed2d3SChris Wilson i915_reset_device(i915, engine_mask, msg); 1246112ed2d3SChris Wilson 1247112ed2d3SChris Wilson for_each_engine(engine, i915, tmp) { 1248112ed2d3SChris Wilson clear_bit(I915_RESET_ENGINE + engine->id, 1249112ed2d3SChris Wilson &error->flags); 1250112ed2d3SChris Wilson } 1251112ed2d3SChris Wilson 1252112ed2d3SChris Wilson clear_bit(I915_RESET_BACKOFF, &error->flags); 1253112ed2d3SChris Wilson wake_up_all(&error->reset_queue); 1254112ed2d3SChris Wilson 1255112ed2d3SChris Wilson out: 1256d858d569SDaniele Ceraolo Spurio intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1257112ed2d3SChris Wilson } 1258112ed2d3SChris Wilson 1259112ed2d3SChris Wilson int i915_reset_trylock(struct drm_i915_private *i915) 1260112ed2d3SChris Wilson { 1261112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1262112ed2d3SChris Wilson int srcu; 1263112ed2d3SChris Wilson 1264112ed2d3SChris Wilson might_lock(&error->reset_backoff_srcu); 1265112ed2d3SChris Wilson might_sleep(); 1266112ed2d3SChris Wilson 1267112ed2d3SChris Wilson rcu_read_lock(); 1268112ed2d3SChris Wilson while (test_bit(I915_RESET_BACKOFF, &error->flags)) { 1269112ed2d3SChris Wilson rcu_read_unlock(); 1270112ed2d3SChris Wilson 1271112ed2d3SChris Wilson if (wait_event_interruptible(error->reset_queue, 1272112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, 1273112ed2d3SChris Wilson &error->flags))) 1274112ed2d3SChris Wilson return -EINTR; 1275112ed2d3SChris Wilson 1276112ed2d3SChris Wilson rcu_read_lock(); 1277112ed2d3SChris Wilson } 1278112ed2d3SChris Wilson srcu = srcu_read_lock(&error->reset_backoff_srcu); 1279112ed2d3SChris Wilson rcu_read_unlock(); 1280112ed2d3SChris Wilson 1281112ed2d3SChris Wilson return srcu; 1282112ed2d3SChris Wilson } 1283112ed2d3SChris Wilson 1284112ed2d3SChris Wilson void i915_reset_unlock(struct drm_i915_private *i915, int tag) 1285112ed2d3SChris Wilson __releases(&i915->gpu_error.reset_backoff_srcu) 1286112ed2d3SChris Wilson { 1287112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1288112ed2d3SChris Wilson 1289112ed2d3SChris Wilson srcu_read_unlock(&error->reset_backoff_srcu, tag); 1290112ed2d3SChris Wilson } 1291112ed2d3SChris Wilson 1292112ed2d3SChris Wilson int i915_terminally_wedged(struct drm_i915_private *i915) 1293112ed2d3SChris Wilson { 1294112ed2d3SChris Wilson struct i915_gpu_error *error = &i915->gpu_error; 1295112ed2d3SChris Wilson 1296112ed2d3SChris Wilson might_sleep(); 1297112ed2d3SChris Wilson 1298112ed2d3SChris Wilson if (!__i915_wedged(error)) 1299112ed2d3SChris Wilson return 0; 1300112ed2d3SChris Wilson 1301112ed2d3SChris Wilson /* Reset still in progress? Maybe we will recover? */ 1302112ed2d3SChris Wilson if (!test_bit(I915_RESET_BACKOFF, &error->flags)) 1303112ed2d3SChris Wilson return -EIO; 1304112ed2d3SChris Wilson 1305112ed2d3SChris Wilson /* XXX intel_reset_finish() still takes struct_mutex!!! */ 1306112ed2d3SChris Wilson if (mutex_is_locked(&i915->drm.struct_mutex)) 1307112ed2d3SChris Wilson return -EAGAIN; 1308112ed2d3SChris Wilson 1309112ed2d3SChris Wilson if (wait_event_interruptible(error->reset_queue, 1310112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, 1311112ed2d3SChris Wilson &error->flags))) 1312112ed2d3SChris Wilson return -EINTR; 1313112ed2d3SChris Wilson 1314112ed2d3SChris Wilson return __i915_wedged(error) ? -EIO : 0; 1315112ed2d3SChris Wilson } 1316112ed2d3SChris Wilson 1317112ed2d3SChris Wilson static void i915_wedge_me(struct work_struct *work) 1318112ed2d3SChris Wilson { 1319112ed2d3SChris Wilson struct i915_wedge_me *w = container_of(work, typeof(*w), work.work); 1320112ed2d3SChris Wilson 1321112ed2d3SChris Wilson dev_err(w->i915->drm.dev, 1322112ed2d3SChris Wilson "%s timed out, cancelling all in-flight rendering.\n", 1323112ed2d3SChris Wilson w->name); 1324112ed2d3SChris Wilson i915_gem_set_wedged(w->i915); 1325112ed2d3SChris Wilson } 1326112ed2d3SChris Wilson 1327112ed2d3SChris Wilson void __i915_init_wedge(struct i915_wedge_me *w, 1328112ed2d3SChris Wilson struct drm_i915_private *i915, 1329112ed2d3SChris Wilson long timeout, 1330112ed2d3SChris Wilson const char *name) 1331112ed2d3SChris Wilson { 1332112ed2d3SChris Wilson w->i915 = i915; 1333112ed2d3SChris Wilson w->name = name; 1334112ed2d3SChris Wilson 1335112ed2d3SChris Wilson INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me); 1336112ed2d3SChris Wilson schedule_delayed_work(&w->work, timeout); 1337112ed2d3SChris Wilson } 1338112ed2d3SChris Wilson 1339112ed2d3SChris Wilson void __i915_fini_wedge(struct i915_wedge_me *w) 1340112ed2d3SChris Wilson { 1341112ed2d3SChris Wilson cancel_delayed_work_sync(&w->work); 1342112ed2d3SChris Wilson destroy_delayed_work_on_stack(&w->work); 1343112ed2d3SChris Wilson w->i915 = NULL; 1344112ed2d3SChris Wilson } 1345932309fbSMichal Wajdeczko 1346932309fbSMichal Wajdeczko #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1347932309fbSMichal Wajdeczko #include "selftest_reset.c" 1348932309fbSMichal Wajdeczko #endif 1349