1112ed2d3SChris Wilson /* 2112ed2d3SChris Wilson * SPDX-License-Identifier: MIT 3112ed2d3SChris Wilson * 4112ed2d3SChris Wilson * Copyright © 2008-2018 Intel Corporation 5112ed2d3SChris Wilson */ 6112ed2d3SChris Wilson 7112ed2d3SChris Wilson #include <linux/sched/mm.h> 8112ed2d3SChris Wilson #include <linux/stop_machine.h> 9112ed2d3SChris Wilson 101d455f8dSJani Nikula #include "display/intel_display_types.h" 11df0566a6SJani Nikula #include "display/intel_overlay.h" 12df0566a6SJani Nikula 1310be98a7SChris Wilson #include "gem/i915_gem_context.h" 1410be98a7SChris Wilson 15112ed2d3SChris Wilson #include "i915_drv.h" 16112ed2d3SChris Wilson #include "i915_gpu_error.h" 17440e2b3dSJani Nikula #include "i915_irq.h" 18b3786b29SChris Wilson #include "intel_breadcrumbs.h" 1979ffac85SChris Wilson #include "intel_engine_pm.h" 20eaf522f6STvrtko Ursulin #include "intel_gt.h" 2179ffac85SChris Wilson #include "intel_gt_pm.h" 22b0573472SChris Wilson #include "intel_gt_requests.h" 23112ed2d3SChris Wilson #include "intel_reset.h" 24112ed2d3SChris Wilson 250f261b24SDaniele Ceraolo Spurio #include "uc/intel_guc.h" 263c9abe88SDaniele Ceraolo Spurio #include "uc/intel_guc_submission.h" 27112ed2d3SChris Wilson 28112ed2d3SChris Wilson #define RESET_MAX_RETRIES 3 29112ed2d3SChris Wilson 30112ed2d3SChris Wilson /* XXX How to handle concurrent GGTT updates using tiling registers? 
*/ 31112ed2d3SChris Wilson #define RESET_UNDER_STOP_MACHINE 0 32112ed2d3SChris Wilson 33112ed2d3SChris Wilson static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 34112ed2d3SChris Wilson { 35112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, 0, set); 36112ed2d3SChris Wilson } 37112ed2d3SChris Wilson 38112ed2d3SChris Wilson static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 39112ed2d3SChris Wilson { 40112ed2d3SChris Wilson intel_uncore_rmw_fw(uncore, reg, clr, 0); 41112ed2d3SChris Wilson } 42112ed2d3SChris Wilson 4316f2941aSChris Wilson static void skip_context(struct i915_request *rq) 44112ed2d3SChris Wilson { 459f3ccd40SChris Wilson struct intel_context *hung_ctx = rq->context; 46112ed2d3SChris Wilson 4716f2941aSChris Wilson list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) { 48112ed2d3SChris Wilson if (!i915_request_is_active(rq)) 49112ed2d3SChris Wilson return; 50112ed2d3SChris Wilson 5136e191f0SChris Wilson if (rq->context == hung_ctx) { 5236e191f0SChris Wilson i915_request_set_error_once(rq, -EIO); 5336e191f0SChris Wilson __i915_request_skip(rq); 5436e191f0SChris Wilson } 55112ed2d3SChris Wilson } 5616f2941aSChris Wilson } 57112ed2d3SChris Wilson 58e6ba7648SChris Wilson static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) 59112ed2d3SChris Wilson { 609f3ccd40SChris Wilson struct drm_i915_file_private *file_priv = ctx->file_priv; 61112ed2d3SChris Wilson unsigned long prev_hang; 629f3ccd40SChris Wilson unsigned int score; 63112ed2d3SChris Wilson 649f3ccd40SChris Wilson if (IS_ERR_OR_NULL(file_priv)) 659f3ccd40SChris Wilson return; 669f3ccd40SChris Wilson 67112ed2d3SChris Wilson score = 0; 689f3ccd40SChris Wilson if (banned) 699f3ccd40SChris Wilson score = I915_CLIENT_SCORE_CONTEXT_BAN; 70112ed2d3SChris Wilson 71112ed2d3SChris Wilson prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 72112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + 
I915_CLIENT_FAST_HANG_JIFFIES)) 73112ed2d3SChris Wilson score += I915_CLIENT_SCORE_HANG_FAST; 74112ed2d3SChris Wilson 75112ed2d3SChris Wilson if (score) { 76112ed2d3SChris Wilson atomic_add(score, &file_priv->ban_score); 77112ed2d3SChris Wilson 78f8474622SWambui Karuga drm_dbg(&ctx->i915->drm, 79f8474622SWambui Karuga "client %s: gained %u ban score, now %u\n", 80112ed2d3SChris Wilson ctx->name, score, 81112ed2d3SChris Wilson atomic_read(&file_priv->ban_score)); 82112ed2d3SChris Wilson } 83112ed2d3SChris Wilson } 84112ed2d3SChris Wilson 859f3ccd40SChris Wilson static bool mark_guilty(struct i915_request *rq) 86112ed2d3SChris Wilson { 87e6ba7648SChris Wilson struct i915_gem_context *ctx; 88112ed2d3SChris Wilson unsigned long prev_hang; 89112ed2d3SChris Wilson bool banned; 90112ed2d3SChris Wilson int i; 91112ed2d3SChris Wilson 928e37d699SChris Wilson if (intel_context_is_closed(rq->context)) { 938e37d699SChris Wilson intel_context_set_banned(rq->context); 948e37d699SChris Wilson return true; 958e37d699SChris Wilson } 968e37d699SChris Wilson 976a8679c0SChris Wilson rcu_read_lock(); 986a8679c0SChris Wilson ctx = rcu_dereference(rq->context->gem_context); 996a8679c0SChris Wilson if (ctx && !kref_get_unless_zero(&ctx->ref)) 1006a8679c0SChris Wilson ctx = NULL; 1016a8679c0SChris Wilson rcu_read_unlock(); 102e6ba7648SChris Wilson if (!ctx) 103be90e344SChris Wilson return intel_context_is_banned(rq->context); 104e8887bb3SChris Wilson 105112ed2d3SChris Wilson atomic_inc(&ctx->guilty_count); 106112ed2d3SChris Wilson 107112ed2d3SChris Wilson /* Cool contexts are too cool to be banned! (Used for reset testing.) 
*/ 1086a8679c0SChris Wilson if (!i915_gem_context_is_bannable(ctx)) { 1096a8679c0SChris Wilson banned = false; 1106a8679c0SChris Wilson goto out; 1116a8679c0SChris Wilson } 112112ed2d3SChris Wilson 113dc483ba5SJani Nikula drm_notice(&ctx->i915->drm, 114dfd9c1b4SChris Wilson "%s context reset due to GPU hang\n", 115dfd9c1b4SChris Wilson ctx->name); 116dfd9c1b4SChris Wilson 117112ed2d3SChris Wilson /* Record the timestamp for the last N hangs */ 118112ed2d3SChris Wilson prev_hang = ctx->hang_timestamp[0]; 119112ed2d3SChris Wilson for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 120112ed2d3SChris Wilson ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 121112ed2d3SChris Wilson ctx->hang_timestamp[i] = jiffies; 122112ed2d3SChris Wilson 123112ed2d3SChris Wilson /* If we have hung N+1 times in rapid succession, we ban the context! */ 124112ed2d3SChris Wilson banned = !i915_gem_context_is_recoverable(ctx); 125112ed2d3SChris Wilson if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 126112ed2d3SChris Wilson banned = true; 127112ed2d3SChris Wilson if (banned) { 128f8474622SWambui Karuga drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", 129112ed2d3SChris Wilson ctx->name, atomic_read(&ctx->guilty_count)); 1309f3ccd40SChris Wilson intel_context_set_banned(rq->context); 131112ed2d3SChris Wilson } 132112ed2d3SChris Wilson 133e6ba7648SChris Wilson client_mark_guilty(ctx, banned); 134112ed2d3SChris Wilson 1356a8679c0SChris Wilson out: 1366a8679c0SChris Wilson i915_gem_context_put(ctx); 137112ed2d3SChris Wilson return banned; 138112ed2d3SChris Wilson } 139112ed2d3SChris Wilson 1409f3ccd40SChris Wilson static void mark_innocent(struct i915_request *rq) 141112ed2d3SChris Wilson { 1426a8679c0SChris Wilson struct i915_gem_context *ctx; 1436a8679c0SChris Wilson 1446a8679c0SChris Wilson rcu_read_lock(); 1456a8679c0SChris Wilson ctx = rcu_dereference(rq->context->gem_context); 1466a8679c0SChris Wilson if (ctx) 1476a8679c0SChris Wilson 
atomic_inc(&ctx->active_count); 1486a8679c0SChris Wilson rcu_read_unlock(); 149112ed2d3SChris Wilson } 150112ed2d3SChris Wilson 151cb823ed9SChris Wilson void __i915_request_reset(struct i915_request *rq, bool guilty) 152112ed2d3SChris Wilson { 1533fbbbef4SChris Wilson RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); 154112ed2d3SChris Wilson 155112ed2d3SChris Wilson GEM_BUG_ON(i915_request_completed(rq)); 156112ed2d3SChris Wilson 157e8887bb3SChris Wilson rcu_read_lock(); /* protect the GEM context */ 158112ed2d3SChris Wilson if (guilty) { 15936e191f0SChris Wilson i915_request_set_error_once(rq, -EIO); 16036e191f0SChris Wilson __i915_request_skip(rq); 1619f3ccd40SChris Wilson if (mark_guilty(rq)) 16216f2941aSChris Wilson skip_context(rq); 163112ed2d3SChris Wilson } else { 16436e191f0SChris Wilson i915_request_set_error_once(rq, -EAGAIN); 1659f3ccd40SChris Wilson mark_innocent(rq); 166112ed2d3SChris Wilson } 167e8887bb3SChris Wilson rcu_read_unlock(); 168112ed2d3SChris Wilson } 169112ed2d3SChris Wilson 170112ed2d3SChris Wilson static bool i915_in_reset(struct pci_dev *pdev) 171112ed2d3SChris Wilson { 172112ed2d3SChris Wilson u8 gdrst; 173112ed2d3SChris Wilson 174112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 175112ed2d3SChris Wilson return gdrst & GRDOM_RESET_STATUS; 176112ed2d3SChris Wilson } 177112ed2d3SChris Wilson 178cb823ed9SChris Wilson static int i915_do_reset(struct intel_gt *gt, 179112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 180112ed2d3SChris Wilson unsigned int retry) 181112ed2d3SChris Wilson { 182cb823ed9SChris Wilson struct pci_dev *pdev = gt->i915->drm.pdev; 183112ed2d3SChris Wilson int err; 184112ed2d3SChris Wilson 185112ed2d3SChris Wilson /* Assert reset for at least 20 usec, and wait for acknowledgement. 
*/ 186112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 187112ed2d3SChris Wilson udelay(50); 188112ed2d3SChris Wilson err = wait_for_atomic(i915_in_reset(pdev), 50); 189112ed2d3SChris Wilson 190112ed2d3SChris Wilson /* Clear the reset request. */ 191112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 192112ed2d3SChris Wilson udelay(50); 193112ed2d3SChris Wilson if (!err) 194112ed2d3SChris Wilson err = wait_for_atomic(!i915_in_reset(pdev), 50); 195112ed2d3SChris Wilson 196112ed2d3SChris Wilson return err; 197112ed2d3SChris Wilson } 198112ed2d3SChris Wilson 199112ed2d3SChris Wilson static bool g4x_reset_complete(struct pci_dev *pdev) 200112ed2d3SChris Wilson { 201112ed2d3SChris Wilson u8 gdrst; 202112ed2d3SChris Wilson 203112ed2d3SChris Wilson pci_read_config_byte(pdev, I915_GDRST, &gdrst); 204112ed2d3SChris Wilson return (gdrst & GRDOM_RESET_ENABLE) == 0; 205112ed2d3SChris Wilson } 206112ed2d3SChris Wilson 207cb823ed9SChris Wilson static int g33_do_reset(struct intel_gt *gt, 208112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 209112ed2d3SChris Wilson unsigned int retry) 210112ed2d3SChris Wilson { 211cb823ed9SChris Wilson struct pci_dev *pdev = gt->i915->drm.pdev; 212112ed2d3SChris Wilson 213112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 214112ed2d3SChris Wilson return wait_for_atomic(g4x_reset_complete(pdev), 50); 215112ed2d3SChris Wilson } 216112ed2d3SChris Wilson 217cb823ed9SChris Wilson static int g4x_do_reset(struct intel_gt *gt, 218112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 219112ed2d3SChris Wilson unsigned int retry) 220112ed2d3SChris Wilson { 221cb823ed9SChris Wilson struct pci_dev *pdev = gt->i915->drm.pdev; 222cb823ed9SChris Wilson struct intel_uncore *uncore = gt->uncore; 223112ed2d3SChris Wilson int ret; 224112ed2d3SChris Wilson 225112ed2d3SChris Wilson /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 226112ed2d3SChris Wilson rmw_set_fw(uncore, 
VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 227112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 228112ed2d3SChris Wilson 229112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 230112ed2d3SChris Wilson GRDOM_MEDIA | GRDOM_RESET_ENABLE); 231112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 232112ed2d3SChris Wilson if (ret) { 233cb56a07dSChris Wilson GT_TRACE(gt, "Wait for media reset failed\n"); 234112ed2d3SChris Wilson goto out; 235112ed2d3SChris Wilson } 236112ed2d3SChris Wilson 237112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 238112ed2d3SChris Wilson GRDOM_RENDER | GRDOM_RESET_ENABLE); 239112ed2d3SChris Wilson ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 240112ed2d3SChris Wilson if (ret) { 241cb56a07dSChris Wilson GT_TRACE(gt, "Wait for render reset failed\n"); 242112ed2d3SChris Wilson goto out; 243112ed2d3SChris Wilson } 244112ed2d3SChris Wilson 245112ed2d3SChris Wilson out: 246112ed2d3SChris Wilson pci_write_config_byte(pdev, I915_GDRST, 0); 247112ed2d3SChris Wilson 248112ed2d3SChris Wilson rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 249112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 250112ed2d3SChris Wilson 251112ed2d3SChris Wilson return ret; 252112ed2d3SChris Wilson } 253112ed2d3SChris Wilson 2549eae5e27SLucas De Marchi static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 255112ed2d3SChris Wilson unsigned int retry) 256112ed2d3SChris Wilson { 257cb823ed9SChris Wilson struct intel_uncore *uncore = gt->uncore; 258112ed2d3SChris Wilson int ret; 259112ed2d3SChris Wilson 260112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 261112ed2d3SChris Wilson ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 262112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 263112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 264112ed2d3SChris Wilson 5000, 0, 265112ed2d3SChris Wilson NULL); 
266112ed2d3SChris Wilson if (ret) { 267cb56a07dSChris Wilson GT_TRACE(gt, "Wait for render reset failed\n"); 268112ed2d3SChris Wilson goto out; 269112ed2d3SChris Wilson } 270112ed2d3SChris Wilson 271112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 272112ed2d3SChris Wilson ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 273112ed2d3SChris Wilson ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 274112ed2d3SChris Wilson ILK_GRDOM_RESET_ENABLE, 0, 275112ed2d3SChris Wilson 5000, 0, 276112ed2d3SChris Wilson NULL); 277112ed2d3SChris Wilson if (ret) { 278cb56a07dSChris Wilson GT_TRACE(gt, "Wait for media reset failed\n"); 279112ed2d3SChris Wilson goto out; 280112ed2d3SChris Wilson } 281112ed2d3SChris Wilson 282112ed2d3SChris Wilson out: 283112ed2d3SChris Wilson intel_uncore_write_fw(uncore, ILK_GDSR, 0); 284112ed2d3SChris Wilson intel_uncore_posting_read_fw(uncore, ILK_GDSR); 285112ed2d3SChris Wilson return ret; 286112ed2d3SChris Wilson } 287112ed2d3SChris Wilson 288112ed2d3SChris Wilson /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 289cb823ed9SChris Wilson static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) 290112ed2d3SChris Wilson { 291cb823ed9SChris Wilson struct intel_uncore *uncore = gt->uncore; 292112ed2d3SChris Wilson int err; 293112ed2d3SChris Wilson 294112ed2d3SChris Wilson /* 295112ed2d3SChris Wilson * GEN6_GDRST is not in the gt power well, no need to check 296112ed2d3SChris Wilson * for fifo space for the write or forcewake the chip for 297112ed2d3SChris Wilson * the read 298112ed2d3SChris Wilson */ 299112ed2d3SChris Wilson intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 300112ed2d3SChris Wilson 301112ed2d3SChris Wilson /* Wait for the device to ack the reset requests */ 302112ed2d3SChris Wilson err = __intel_wait_for_register_fw(uncore, 303112ed2d3SChris Wilson GEN6_GDRST, hw_domain_mask, 0, 304112ed2d3SChris Wilson 500, 0, 305112ed2d3SChris Wilson NULL); 306112ed2d3SChris Wilson if (err) 
307cb56a07dSChris Wilson GT_TRACE(gt, 308f8474622SWambui Karuga "Wait for 0x%08x engines reset failed\n", 309112ed2d3SChris Wilson hw_domain_mask); 310112ed2d3SChris Wilson 311112ed2d3SChris Wilson return err; 312112ed2d3SChris Wilson } 313112ed2d3SChris Wilson 314cb823ed9SChris Wilson static int gen6_reset_engines(struct intel_gt *gt, 315112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 316112ed2d3SChris Wilson unsigned int retry) 317112ed2d3SChris Wilson { 318b9dcb97bSColin Ian King static const u32 hw_engine_mask[] = { 319112ed2d3SChris Wilson [RCS0] = GEN6_GRDOM_RENDER, 320112ed2d3SChris Wilson [BCS0] = GEN6_GRDOM_BLT, 321112ed2d3SChris Wilson [VCS0] = GEN6_GRDOM_MEDIA, 322112ed2d3SChris Wilson [VCS1] = GEN8_GRDOM_MEDIA2, 323112ed2d3SChris Wilson [VECS0] = GEN6_GRDOM_VECS, 324112ed2d3SChris Wilson }; 325b9dcb97bSColin Ian King struct intel_engine_cs *engine; 326112ed2d3SChris Wilson u32 hw_mask; 327112ed2d3SChris Wilson 328112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 329112ed2d3SChris Wilson hw_mask = GEN6_GRDOM_FULL; 330112ed2d3SChris Wilson } else { 331112ed2d3SChris Wilson intel_engine_mask_t tmp; 332112ed2d3SChris Wilson 333112ed2d3SChris Wilson hw_mask = 0; 334a50134b1STvrtko Ursulin for_each_engine_masked(engine, gt, engine_mask, tmp) { 335112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 336112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 337112ed2d3SChris Wilson } 338112ed2d3SChris Wilson } 339112ed2d3SChris Wilson 340cb823ed9SChris Wilson return gen6_hw_domain_reset(gt, hw_mask); 341112ed2d3SChris Wilson } 342112ed2d3SChris Wilson 3430d333ac7SDaniele Ceraolo Spurio static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask) 344112ed2d3SChris Wilson { 345112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 346792592e7SDaniele Ceraolo Spurio u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 347112ed2d3SChris Wilson i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; 
348112ed2d3SChris Wilson u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; 349112ed2d3SChris Wilson i915_reg_t sfc_usage; 350112ed2d3SChris Wilson u32 sfc_usage_bit; 351112ed2d3SChris Wilson u32 sfc_reset_bit; 3520d333ac7SDaniele Ceraolo Spurio int ret; 353112ed2d3SChris Wilson 354112ed2d3SChris Wilson switch (engine->class) { 355112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 356112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 357112ed2d3SChris Wilson return 0; 358112ed2d3SChris Wilson 359112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 360112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 361112ed2d3SChris Wilson 362112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); 363112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 364112ed2d3SChris Wilson 365112ed2d3SChris Wilson sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); 366112ed2d3SChris Wilson sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; 367112ed2d3SChris Wilson sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 368112ed2d3SChris Wilson break; 369112ed2d3SChris Wilson 370112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 371112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 372112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 373112ed2d3SChris Wilson 374112ed2d3SChris Wilson sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); 375112ed2d3SChris Wilson sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 376112ed2d3SChris Wilson 377112ed2d3SChris Wilson sfc_usage = GEN11_VECS_SFC_USAGE(engine); 378112ed2d3SChris Wilson sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; 379112ed2d3SChris Wilson sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 380112ed2d3SChris Wilson break; 381112ed2d3SChris Wilson 382112ed2d3SChris Wilson default: 383112ed2d3SChris Wilson return 0; 384112ed2d3SChris Wilson } 385112ed2d3SChris Wilson 
386112ed2d3SChris Wilson /* 3870d333ac7SDaniele Ceraolo Spurio * If the engine is using a SFC, tell the engine that a software reset 3880d333ac7SDaniele Ceraolo Spurio * is going to happen. The engine will then try to force lock the SFC. 3890d333ac7SDaniele Ceraolo Spurio * If SFC ends up being locked to the engine we want to reset, we have 3900d333ac7SDaniele Ceraolo Spurio * to reset it as well (we will unlock it once the reset sequence is 3910d333ac7SDaniele Ceraolo Spurio * completed). 392112ed2d3SChris Wilson */ 3930d333ac7SDaniele Ceraolo Spurio if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) 3940d333ac7SDaniele Ceraolo Spurio return 0; 3950d333ac7SDaniele Ceraolo Spurio 396112ed2d3SChris Wilson rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 397112ed2d3SChris Wilson 3980d333ac7SDaniele Ceraolo Spurio ret = __intel_wait_for_register_fw(uncore, 399112ed2d3SChris Wilson sfc_forced_lock_ack, 400112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 401112ed2d3SChris Wilson sfc_forced_lock_ack_bit, 4020d333ac7SDaniele Ceraolo Spurio 1000, 0, NULL); 4030d333ac7SDaniele Ceraolo Spurio 4040d333ac7SDaniele Ceraolo Spurio /* Was the SFC released while we were trying to lock it? 
*/ 4050d333ac7SDaniele Ceraolo Spurio if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) 406112ed2d3SChris Wilson return 0; 4070d333ac7SDaniele Ceraolo Spurio 4080d333ac7SDaniele Ceraolo Spurio if (ret) { 409cb56a07dSChris Wilson ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n"); 4100d333ac7SDaniele Ceraolo Spurio return ret; 411112ed2d3SChris Wilson } 412112ed2d3SChris Wilson 4130d333ac7SDaniele Ceraolo Spurio *hw_mask |= sfc_reset_bit; 414112ed2d3SChris Wilson return 0; 415112ed2d3SChris Wilson } 416112ed2d3SChris Wilson 417112ed2d3SChris Wilson static void gen11_unlock_sfc(struct intel_engine_cs *engine) 418112ed2d3SChris Wilson { 419112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 420792592e7SDaniele Ceraolo Spurio u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 421112ed2d3SChris Wilson i915_reg_t sfc_forced_lock; 422112ed2d3SChris Wilson u32 sfc_forced_lock_bit; 423112ed2d3SChris Wilson 424112ed2d3SChris Wilson switch (engine->class) { 425112ed2d3SChris Wilson case VIDEO_DECODE_CLASS: 426112ed2d3SChris Wilson if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 427112ed2d3SChris Wilson return; 428112ed2d3SChris Wilson 429112ed2d3SChris Wilson sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); 430112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 431112ed2d3SChris Wilson break; 432112ed2d3SChris Wilson 433112ed2d3SChris Wilson case VIDEO_ENHANCEMENT_CLASS: 434112ed2d3SChris Wilson sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); 435112ed2d3SChris Wilson sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 436112ed2d3SChris Wilson break; 437112ed2d3SChris Wilson 438112ed2d3SChris Wilson default: 439112ed2d3SChris Wilson return; 440112ed2d3SChris Wilson } 441112ed2d3SChris Wilson 442112ed2d3SChris Wilson rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); 443112ed2d3SChris Wilson } 444112ed2d3SChris Wilson 445cb823ed9SChris Wilson static int gen11_reset_engines(struct 
intel_gt *gt, 446112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 447112ed2d3SChris Wilson unsigned int retry) 448112ed2d3SChris Wilson { 449b9dcb97bSColin Ian King static const u32 hw_engine_mask[] = { 450112ed2d3SChris Wilson [RCS0] = GEN11_GRDOM_RENDER, 451112ed2d3SChris Wilson [BCS0] = GEN11_GRDOM_BLT, 452112ed2d3SChris Wilson [VCS0] = GEN11_GRDOM_MEDIA, 453112ed2d3SChris Wilson [VCS1] = GEN11_GRDOM_MEDIA2, 454112ed2d3SChris Wilson [VCS2] = GEN11_GRDOM_MEDIA3, 455112ed2d3SChris Wilson [VCS3] = GEN11_GRDOM_MEDIA4, 456112ed2d3SChris Wilson [VECS0] = GEN11_GRDOM_VECS, 457112ed2d3SChris Wilson [VECS1] = GEN11_GRDOM_VECS2, 458112ed2d3SChris Wilson }; 459112ed2d3SChris Wilson struct intel_engine_cs *engine; 460112ed2d3SChris Wilson intel_engine_mask_t tmp; 461112ed2d3SChris Wilson u32 hw_mask; 462112ed2d3SChris Wilson int ret; 463112ed2d3SChris Wilson 464112ed2d3SChris Wilson if (engine_mask == ALL_ENGINES) { 465112ed2d3SChris Wilson hw_mask = GEN11_GRDOM_FULL; 466112ed2d3SChris Wilson } else { 467112ed2d3SChris Wilson hw_mask = 0; 468a50134b1STvrtko Ursulin for_each_engine_masked(engine, gt, engine_mask, tmp) { 469112ed2d3SChris Wilson GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); 470112ed2d3SChris Wilson hw_mask |= hw_engine_mask[engine->id]; 4710d333ac7SDaniele Ceraolo Spurio ret = gen11_lock_sfc(engine, &hw_mask); 4720d333ac7SDaniele Ceraolo Spurio if (ret) 4730d333ac7SDaniele Ceraolo Spurio goto sfc_unlock; 474112ed2d3SChris Wilson } 475112ed2d3SChris Wilson } 476112ed2d3SChris Wilson 477cb823ed9SChris Wilson ret = gen6_hw_domain_reset(gt, hw_mask); 478112ed2d3SChris Wilson 4790d333ac7SDaniele Ceraolo Spurio sfc_unlock: 4800d333ac7SDaniele Ceraolo Spurio /* 4810d333ac7SDaniele Ceraolo Spurio * We unlock the SFC based on the lock status and not the result of 4820d333ac7SDaniele Ceraolo Spurio * gen11_lock_sfc to make sure that we clean properly if something 4830d333ac7SDaniele Ceraolo Spurio * wrong happened during the lock (e.g. 
lock acquired after timeout 4840d333ac7SDaniele Ceraolo Spurio * expiration). 4850d333ac7SDaniele Ceraolo Spurio */ 486112ed2d3SChris Wilson if (engine_mask != ALL_ENGINES) 487a50134b1STvrtko Ursulin for_each_engine_masked(engine, gt, engine_mask, tmp) 488112ed2d3SChris Wilson gen11_unlock_sfc(engine); 489112ed2d3SChris Wilson 490112ed2d3SChris Wilson return ret; 491112ed2d3SChris Wilson } 492112ed2d3SChris Wilson 493112ed2d3SChris Wilson static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 494112ed2d3SChris Wilson { 495112ed2d3SChris Wilson struct intel_uncore *uncore = engine->uncore; 496112ed2d3SChris Wilson const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 497112ed2d3SChris Wilson u32 request, mask, ack; 498112ed2d3SChris Wilson int ret; 499112ed2d3SChris Wilson 500*0a7d355eSChris Wilson if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1))) 501*0a7d355eSChris Wilson return -ETIMEDOUT; 502*0a7d355eSChris Wilson 503112ed2d3SChris Wilson ack = intel_uncore_read_fw(uncore, reg); 504112ed2d3SChris Wilson if (ack & RESET_CTL_CAT_ERROR) { 505112ed2d3SChris Wilson /* 506112ed2d3SChris Wilson * For catastrophic errors, ready-for-reset sequence 507112ed2d3SChris Wilson * needs to be bypassed: HAS#396813 508112ed2d3SChris Wilson */ 509112ed2d3SChris Wilson request = RESET_CTL_CAT_ERROR; 510112ed2d3SChris Wilson mask = RESET_CTL_CAT_ERROR; 511112ed2d3SChris Wilson 512112ed2d3SChris Wilson /* Catastrophic errors need to be cleared by HW */ 513112ed2d3SChris Wilson ack = 0; 514112ed2d3SChris Wilson } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 515112ed2d3SChris Wilson request = RESET_CTL_REQUEST_RESET; 516112ed2d3SChris Wilson mask = RESET_CTL_READY_TO_RESET; 517112ed2d3SChris Wilson ack = RESET_CTL_READY_TO_RESET; 518112ed2d3SChris Wilson } else { 519112ed2d3SChris Wilson return 0; 520112ed2d3SChris Wilson } 521112ed2d3SChris Wilson 522112ed2d3SChris Wilson intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 523112ed2d3SChris 
Wilson ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 524112ed2d3SChris Wilson 700, 0, NULL); 525112ed2d3SChris Wilson if (ret) 526f8474622SWambui Karuga drm_err(&engine->i915->drm, 527f8474622SWambui Karuga "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 528112ed2d3SChris Wilson engine->name, request, 529112ed2d3SChris Wilson intel_uncore_read_fw(uncore, reg)); 530112ed2d3SChris Wilson 531112ed2d3SChris Wilson return ret; 532112ed2d3SChris Wilson } 533112ed2d3SChris Wilson 534112ed2d3SChris Wilson static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 535112ed2d3SChris Wilson { 536112ed2d3SChris Wilson intel_uncore_write_fw(engine->uncore, 537112ed2d3SChris Wilson RING_RESET_CTL(engine->mmio_base), 538112ed2d3SChris Wilson _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 539112ed2d3SChris Wilson } 540112ed2d3SChris Wilson 541cb823ed9SChris Wilson static int gen8_reset_engines(struct intel_gt *gt, 542112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 543112ed2d3SChris Wilson unsigned int retry) 544112ed2d3SChris Wilson { 545112ed2d3SChris Wilson struct intel_engine_cs *engine; 546112ed2d3SChris Wilson const bool reset_non_ready = retry >= 1; 547112ed2d3SChris Wilson intel_engine_mask_t tmp; 548112ed2d3SChris Wilson int ret; 549112ed2d3SChris Wilson 550a50134b1STvrtko Ursulin for_each_engine_masked(engine, gt, engine_mask, tmp) { 551112ed2d3SChris Wilson ret = gen8_engine_reset_prepare(engine); 552112ed2d3SChris Wilson if (ret && !reset_non_ready) 553112ed2d3SChris Wilson goto skip_reset; 554112ed2d3SChris Wilson 555112ed2d3SChris Wilson /* 556112ed2d3SChris Wilson * If this is not the first failed attempt to prepare, 557112ed2d3SChris Wilson * we decide to proceed anyway. 558112ed2d3SChris Wilson * 559112ed2d3SChris Wilson * By doing so we risk context corruption and with 560112ed2d3SChris Wilson * some gens (kbl), possible system hang if reset 561112ed2d3SChris Wilson * happens during active bb execution. 
562112ed2d3SChris Wilson * 563112ed2d3SChris Wilson * We rather take context corruption instead of 564112ed2d3SChris Wilson * failed reset with a wedged driver/gpu. And 565112ed2d3SChris Wilson * active bb execution case should be covered by 566cb823ed9SChris Wilson * stop_engines() we have before the reset. 567112ed2d3SChris Wilson */ 568112ed2d3SChris Wilson } 569112ed2d3SChris Wilson 570cb823ed9SChris Wilson if (INTEL_GEN(gt->i915) >= 11) 571cb823ed9SChris Wilson ret = gen11_reset_engines(gt, engine_mask, retry); 572112ed2d3SChris Wilson else 573cb823ed9SChris Wilson ret = gen6_reset_engines(gt, engine_mask, retry); 574112ed2d3SChris Wilson 575112ed2d3SChris Wilson skip_reset: 576a50134b1STvrtko Ursulin for_each_engine_masked(engine, gt, engine_mask, tmp) 577112ed2d3SChris Wilson gen8_engine_reset_cancel(engine); 578112ed2d3SChris Wilson 579112ed2d3SChris Wilson return ret; 580112ed2d3SChris Wilson } 581112ed2d3SChris Wilson 5824abc6e7cSChris Wilson static int mock_reset(struct intel_gt *gt, 5834abc6e7cSChris Wilson intel_engine_mask_t mask, 5844abc6e7cSChris Wilson unsigned int retry) 5854abc6e7cSChris Wilson { 5864abc6e7cSChris Wilson return 0; 5874abc6e7cSChris Wilson } 5884abc6e7cSChris Wilson 589cb823ed9SChris Wilson typedef int (*reset_func)(struct intel_gt *, 590112ed2d3SChris Wilson intel_engine_mask_t engine_mask, 591112ed2d3SChris Wilson unsigned int retry); 592112ed2d3SChris Wilson 593260e6b71SChris Wilson static reset_func intel_get_gpu_reset(const struct intel_gt *gt) 594112ed2d3SChris Wilson { 595260e6b71SChris Wilson struct drm_i915_private *i915 = gt->i915; 596260e6b71SChris Wilson 5974abc6e7cSChris Wilson if (is_mock_gt(gt)) 5984abc6e7cSChris Wilson return mock_reset; 5994abc6e7cSChris Wilson else if (INTEL_GEN(i915) >= 8) 600112ed2d3SChris Wilson return gen8_reset_engines; 601112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 6) 602112ed2d3SChris Wilson return gen6_reset_engines; 603112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 5) 
6049eae5e27SLucas De Marchi return ilk_do_reset; 605112ed2d3SChris Wilson else if (IS_G4X(i915)) 606112ed2d3SChris Wilson return g4x_do_reset; 607112ed2d3SChris Wilson else if (IS_G33(i915) || IS_PINEVIEW(i915)) 608112ed2d3SChris Wilson return g33_do_reset; 609112ed2d3SChris Wilson else if (INTEL_GEN(i915) >= 3) 610112ed2d3SChris Wilson return i915_do_reset; 611112ed2d3SChris Wilson else 612112ed2d3SChris Wilson return NULL; 613112ed2d3SChris Wilson } 614112ed2d3SChris Wilson 615cb823ed9SChris Wilson int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) 616112ed2d3SChris Wilson { 617112ed2d3SChris Wilson const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 618112ed2d3SChris Wilson reset_func reset; 619112ed2d3SChris Wilson int ret = -ETIMEDOUT; 620112ed2d3SChris Wilson int retry; 621112ed2d3SChris Wilson 622260e6b71SChris Wilson reset = intel_get_gpu_reset(gt); 623112ed2d3SChris Wilson if (!reset) 624112ed2d3SChris Wilson return -ENODEV; 625112ed2d3SChris Wilson 626112ed2d3SChris Wilson /* 627112ed2d3SChris Wilson * If the power well sleeps during the reset, the reset 628112ed2d3SChris Wilson * request may be dropped and never completes (causing -EIO). 
629112ed2d3SChris Wilson */ 630cb823ed9SChris Wilson intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 631112ed2d3SChris Wilson for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 6323fbbbef4SChris Wilson GT_TRACE(gt, "engine_mask=%x\n", engine_mask); 633112ed2d3SChris Wilson preempt_disable(); 634cb823ed9SChris Wilson ret = reset(gt, engine_mask, retry); 635112ed2d3SChris Wilson preempt_enable(); 636112ed2d3SChris Wilson } 637cb823ed9SChris Wilson intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 638112ed2d3SChris Wilson 639112ed2d3SChris Wilson return ret; 640112ed2d3SChris Wilson } 641112ed2d3SChris Wilson 642260e6b71SChris Wilson bool intel_has_gpu_reset(const struct intel_gt *gt) 643112ed2d3SChris Wilson { 6448a25c4beSJani Nikula if (!gt->i915->params.reset) 645112ed2d3SChris Wilson return NULL; 646112ed2d3SChris Wilson 647260e6b71SChris Wilson return intel_get_gpu_reset(gt); 648112ed2d3SChris Wilson } 649112ed2d3SChris Wilson 650260e6b71SChris Wilson bool intel_has_reset_engine(const struct intel_gt *gt) 651112ed2d3SChris Wilson { 6528a25c4beSJani Nikula if (gt->i915->params.reset < 2) 653260e6b71SChris Wilson return false; 654260e6b71SChris Wilson 655260e6b71SChris Wilson return INTEL_INFO(gt->i915)->has_reset_engine; 656112ed2d3SChris Wilson } 657112ed2d3SChris Wilson 658cb823ed9SChris Wilson int intel_reset_guc(struct intel_gt *gt) 659112ed2d3SChris Wilson { 660112ed2d3SChris Wilson u32 guc_domain = 661cb823ed9SChris Wilson INTEL_GEN(gt->i915) >= 11 ? 
GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 662112ed2d3SChris Wilson int ret; 663112ed2d3SChris Wilson 664702668e6SDaniele Ceraolo Spurio GEM_BUG_ON(!HAS_GT_UC(gt->i915)); 665112ed2d3SChris Wilson 666cb823ed9SChris Wilson intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 667cb823ed9SChris Wilson ret = gen6_hw_domain_reset(gt, guc_domain); 668cb823ed9SChris Wilson intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 669112ed2d3SChris Wilson 670112ed2d3SChris Wilson return ret; 671112ed2d3SChris Wilson } 672112ed2d3SChris Wilson 673112ed2d3SChris Wilson /* 674112ed2d3SChris Wilson * Ensure irq handler finishes, and not run again. 675112ed2d3SChris Wilson * Also return the active request so that we only search for it once. 676112ed2d3SChris Wilson */ 677112ed2d3SChris Wilson static void reset_prepare_engine(struct intel_engine_cs *engine) 678112ed2d3SChris Wilson { 679112ed2d3SChris Wilson /* 680112ed2d3SChris Wilson * During the reset sequence, we must prevent the engine from 681112ed2d3SChris Wilson * entering RC6. As the context state is undefined until we restart 682112ed2d3SChris Wilson * the engine, if it does enter RC6 during the reset, the state 683112ed2d3SChris Wilson * written to the powercontext is undefined and so we may lose 684112ed2d3SChris Wilson * GPU state upon resume, i.e. fail to restart after a reset. 
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

/*
 * Zap all userspace CPU mmaps of fenced (tiled) GGTT ranges, forcing a
 * fresh fault after the reset so the (reset-clobbered) tiling registers
 * are revalidated before the next access.
 */
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		/* Only vmas with live userspace faults need revoking */
		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

/*
 * Quiesce every engine (and the GuC) ahead of the reset.  Returns a mask
 * of the engines that were awake, each holding an extra pm reference that
 * reset_finish() must release.
 */
static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	intel_uc_reset_prepare(&gt->uc);

	return awake;
}

/* Revoke userspace access that a reset will invalidate (GGTT mmaps) */
static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

/*
 * Post-hardware-reset recovery: re-enable the GGTT, replay/skip requests
 * on each engine per @stalled_mask, and restore the fence registers the
 * reset clobbered.
 */
static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	local_bh_disable();
	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);
	local_bh_enable();

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

/*
 * Undo reset_prepare_engine(): run the backend's finish hook, release
 * forcewake, and kick the breadcrumb signaler so completed requests are
 * retired.
 */
static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

/*
 * Undo reset_prepare(): finish each engine and drop the pm references
 * taken for the engines recorded in @awake.
 */
static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}
}

/*
 * Replacement submit_request used once the device is wedged: mark the
 * request as failed (-EIO) and immediately complete it without touching
 * the hardware.
 */
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	RQ_TRACE(request, "-EIO\n");
	i915_request_set_error_once(request,
				    -EIO);

	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	intel_engine_signal_breadcrumbs(engine);
}

/*
 * Declare the device terminally broken: stop the engines, redirect all
 * submission to nop_submit_request (completing everything with -EIO) and
 * set I915_WEDGED.  Caller holds gt->reset.mutex (see intel_gt_set_wedged).
 */
static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	local_bh_disable();
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);
	local_bh_enable();

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

/*
 * Public entry point for wedging: takes runtime-pm and the reset mutex,
 * optionally dumps the non-idle engines for debugging, then calls
 * __intel_gt_set_wedged().
 */
void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

/*
 * Attempt to recover from a wedged state: wait out all pending fences,
 * reset the hardware and restore real submission.  Returns true if the
 * device is usable again.  Caller holds gt->reset.mutex.
 */
static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

/* Locked wrapper around __intel_gt_unset_wedged() */
bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

/*
 * Full-chip reset with retries (escalating back-off delays), preceded by
 * mmap revocation and followed by per-engine recovery via gt_reset().
 */
static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	gt_revoke(gt);

	err = __intel_gt_reset(gt, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = __intel_gt_reset(gt, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}

/* Restart every engine after the reset; stops at the first failure */
static int resume(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	/* Caller must have claimed the reset via I915_RESET_BACKOFF */
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	/* Display may be clobbered by the reset; quiesce interrupts first */
	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

/* Hardware reset of just this engine's domain */
static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

/*
 * Per-engine reset with bottom halves already disabled (callers use
 * local_bh_disable(), see intel_engine_reset()).  Caller must hold the
 * engine's I915_RESET_ENGINE bit.
 */
int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	/* An idle engine has nothing to reset */
	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	/* Under GuC submission the reset request goes via the GuC */
	if (!uses_guc)
		ret = intel_gt_reset_engine(engine);
	else
		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		ENGINE_TRACE(engine, "Failed to reset, err: %d\n", ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}

/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identifies the request that caused the hang and it is dropped
 *  - reset engine (which will force the engine to idle)
 *  - re-init/configure engine
 */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	int err;

	/* Serialise against the engine's submission tasklet */
	local_bh_disable();
	err = __intel_engine_reset_bh(engine, msg);
	local_bh_enable();

	return err;
}

/*
 * Full-chip reset wrapped in userspace notification: emits error/reset
 * uevents, flushes SRCU readers of the reset, and runs the reset under a
 * wedge-on-timeout watchdog so a stuck reset degrades to wedging.
 */
static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_prepare_reset(gt->i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_finish_reset(gt->i915);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt, engine_mask);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		local_bh_disable();
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			/* Drop engines that were recovered individually */
			if (__intel_engine_reset_bh(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
		local_bh_enable();
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, gt, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&gt->reset.flags))
			wait_on_bit(&gt->reset.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	intel_gt_reset_global(gt, engine_mask, msg);

	for_each_engine(engine, gt, tmp)
		clear_bit_unlock(I915_RESET_ENGINE + engine->id,
				 &gt->reset.flags);
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic();
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

/*
 * Enter an SRCU read-side section that excludes a concurrent full reset.
 * Sleeps (interruptibly) while a reset is in backoff; on success stores
 * the SRCU tag in *srcu for intel_gt_reset_unlock().  Returns 0 or -EINTR.
 */
int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	might_lock(&gt->reset.backoff_srcu);
	might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
Wilson } 1350112ed2d3SChris Wilson 1351cb823ed9SChris Wilson void intel_gt_reset_unlock(struct intel_gt *gt, int tag) 1352cb823ed9SChris Wilson __releases(>->reset.backoff_srcu) 1353112ed2d3SChris Wilson { 1354cb823ed9SChris Wilson srcu_read_unlock(>->reset.backoff_srcu, tag); 1355112ed2d3SChris Wilson } 1356112ed2d3SChris Wilson 1357cb823ed9SChris Wilson int intel_gt_terminally_wedged(struct intel_gt *gt) 1358112ed2d3SChris Wilson { 1359112ed2d3SChris Wilson might_sleep(); 1360112ed2d3SChris Wilson 1361cb823ed9SChris Wilson if (!intel_gt_is_wedged(gt)) 1362112ed2d3SChris Wilson return 0; 1363112ed2d3SChris Wilson 13643f04bdceSMichał Winiarski if (intel_gt_has_unrecoverable_error(gt)) 1365112ed2d3SChris Wilson return -EIO; 1366112ed2d3SChris Wilson 1367b761a7b4SChris Wilson /* Reset still in progress? Maybe we will recover? */ 1368cb823ed9SChris Wilson if (wait_event_interruptible(gt->reset.queue, 1369112ed2d3SChris Wilson !test_bit(I915_RESET_BACKOFF, 1370cb823ed9SChris Wilson >->reset.flags))) 1371112ed2d3SChris Wilson return -EINTR; 1372112ed2d3SChris Wilson 1373cb823ed9SChris Wilson return intel_gt_is_wedged(gt) ? 
-EIO : 0; 1374112ed2d3SChris Wilson } 1375112ed2d3SChris Wilson 13765311f517SMichał Winiarski void intel_gt_set_wedged_on_init(struct intel_gt *gt) 13775311f517SMichał Winiarski { 13785311f517SMichał Winiarski BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > 13795311f517SMichał Winiarski I915_WEDGED_ON_INIT); 13805311f517SMichał Winiarski intel_gt_set_wedged(gt); 13815311f517SMichał Winiarski set_bit(I915_WEDGED_ON_INIT, >->reset.flags); 13823f04bdceSMichał Winiarski 13833f04bdceSMichał Winiarski /* Wedged on init is non-recoverable */ 138465706203SMichał Winiarski add_taint_for_CI(gt->i915, TAINT_WARN); 13853f04bdceSMichał Winiarski } 13863f04bdceSMichał Winiarski 13873f04bdceSMichał Winiarski void intel_gt_set_wedged_on_fini(struct intel_gt *gt) 13883f04bdceSMichał Winiarski { 13893f04bdceSMichał Winiarski intel_gt_set_wedged(gt); 13903f04bdceSMichał Winiarski set_bit(I915_WEDGED_ON_FINI, >->reset.flags); 1391b0573472SChris Wilson intel_gt_retire_requests(gt); /* cleanup any wedged requests */ 13925311f517SMichał Winiarski } 13935311f517SMichał Winiarski 1394cb823ed9SChris Wilson void intel_gt_init_reset(struct intel_gt *gt) 1395112ed2d3SChris Wilson { 1396cb823ed9SChris Wilson init_waitqueue_head(>->reset.queue); 1397cb823ed9SChris Wilson mutex_init(>->reset.mutex); 1398cb823ed9SChris Wilson init_srcu_struct(>->reset.backoff_srcu); 139945b152f7SChris Wilson 1400cecb2af4SChris Wilson /* 1401cecb2af4SChris Wilson * While undesirable to wait inside the shrinker, complain anyway. 1402cecb2af4SChris Wilson * 1403cecb2af4SChris Wilson * If we have to wait during shrinking, we guarantee forward progress 1404cecb2af4SChris Wilson * by forcing the reset. Therefore during the reset we must not 1405cecb2af4SChris Wilson * re-enter the shrinker. By declaring that we take the reset mutex 1406cecb2af4SChris Wilson * within the shrinker, we forbid ourselves from performing any 1407cecb2af4SChris Wilson * fs-reclaim or taking related locks during reset. 
1408cecb2af4SChris Wilson */ 1409cecb2af4SChris Wilson i915_gem_shrinker_taints_mutex(gt->i915, >->reset.mutex); 1410cecb2af4SChris Wilson 141145b152f7SChris Wilson /* no GPU until we are ready! */ 141245b152f7SChris Wilson __set_bit(I915_WEDGED, >->reset.flags); 1413cb823ed9SChris Wilson } 1414112ed2d3SChris Wilson 1415cb823ed9SChris Wilson void intel_gt_fini_reset(struct intel_gt *gt) 1416cb823ed9SChris Wilson { 1417cb823ed9SChris Wilson cleanup_srcu_struct(>->reset.backoff_srcu); 1418cb823ed9SChris Wilson } 1419cb823ed9SChris Wilson 1420cb823ed9SChris Wilson static void intel_wedge_me(struct work_struct *work) 1421cb823ed9SChris Wilson { 1422cb823ed9SChris Wilson struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); 1423cb823ed9SChris Wilson 1424dc483ba5SJani Nikula drm_err(&w->gt->i915->drm, 1425112ed2d3SChris Wilson "%s timed out, cancelling all in-flight rendering.\n", 1426112ed2d3SChris Wilson w->name); 1427cb823ed9SChris Wilson intel_gt_set_wedged(w->gt); 1428112ed2d3SChris Wilson } 1429112ed2d3SChris Wilson 1430cb823ed9SChris Wilson void __intel_init_wedge(struct intel_wedge_me *w, 1431cb823ed9SChris Wilson struct intel_gt *gt, 1432112ed2d3SChris Wilson long timeout, 1433112ed2d3SChris Wilson const char *name) 1434112ed2d3SChris Wilson { 1435cb823ed9SChris Wilson w->gt = gt; 1436112ed2d3SChris Wilson w->name = name; 1437112ed2d3SChris Wilson 1438cb823ed9SChris Wilson INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); 1439112ed2d3SChris Wilson schedule_delayed_work(&w->work, timeout); 1440112ed2d3SChris Wilson } 1441112ed2d3SChris Wilson 1442cb823ed9SChris Wilson void __intel_fini_wedge(struct intel_wedge_me *w) 1443112ed2d3SChris Wilson { 1444112ed2d3SChris Wilson cancel_delayed_work_sync(&w->work); 1445112ed2d3SChris Wilson destroy_delayed_work_on_stack(&w->work); 1446cb823ed9SChris Wilson w->gt = NULL; 1447112ed2d3SChris Wilson } 1448932309fbSMichal Wajdeczko 1449932309fbSMichal Wajdeczko #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 
1450932309fbSMichal Wajdeczko #include "selftest_reset.c" 1451058179e7SChris Wilson #include "selftest_hangcheck.c" 1452932309fbSMichal Wajdeczko #endif 1453