// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

static bool next_heartbeat(struct intel_engine_cs *engine)
{
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p = drm_debug_printer("heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = 0;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			reset_engine(engine, rq);
		}

		rq->emitted_jiffies = jiffies;
		goto out;
	}

	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL))
		return;

	next_heartbeat(engine);
}

void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	return 0;
}

static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif