1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2019 Intel Corporation 4 */ 5 6 #include "i915_drv.h" 7 #include "i915_request.h" 8 9 #include "intel_context.h" 10 #include "intel_engine_heartbeat.h" 11 #include "intel_engine_pm.h" 12 #include "intel_engine.h" 13 #include "intel_gt.h" 14 #include "intel_reset.h" 15 16 /* 17 * While the engine is active, we send a periodic pulse along the engine 18 * to check on its health and to flush any idle-barriers. If that request 19 * is stuck, and we fail to preempt it, we declare the engine hung and 20 * issue a reset -- in the hope that restores progress. 21 */ 22 23 static bool next_heartbeat(struct intel_engine_cs *engine) 24 { 25 long delay; 26 27 delay = READ_ONCE(engine->props.heartbeat_interval_ms); 28 if (!delay) 29 return false; 30 31 delay = msecs_to_jiffies_timeout(delay); 32 if (delay >= HZ) 33 delay = round_jiffies_up_relative(delay); 34 mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1); 35 36 return true; 37 } 38 39 static struct i915_request * 40 heartbeat_create(struct intel_context *ce, gfp_t gfp) 41 { 42 struct i915_request *rq; 43 44 intel_context_enter(ce); 45 rq = __i915_request_create(ce, gfp); 46 intel_context_exit(ce); 47 48 return rq; 49 } 50 51 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) 52 { 53 engine->wakeref_serial = READ_ONCE(engine->serial) + 1; 54 i915_request_add_active_barriers(rq); 55 if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine)) 56 engine->heartbeat.systole = i915_request_get(rq); 57 } 58 59 static void heartbeat_commit(struct i915_request *rq, 60 const struct i915_sched_attr *attr) 61 { 62 idle_pulse(rq->engine, rq); 63 64 __i915_request_commit(rq); 65 __i915_request_queue(rq, attr); 66 } 67 68 static void show_heartbeat(const struct i915_request *rq, 69 struct intel_engine_cs *engine) 70 { 71 struct drm_printer p = drm_debug_printer("heartbeat"); 72 73 intel_engine_dump(engine, &p, 74 "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", 75 engine->name, 76 rq->fence.context, 77 rq->fence.seqno, 78 rq->sched.attr.priority); 79 } 80 81 static void heartbeat(struct work_struct *wrk) 82 { 83 struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; 84 struct intel_engine_cs *engine = 85 container_of(wrk, typeof(*engine), heartbeat.work.work); 86 struct intel_context *ce = engine->kernel_context; 87 struct i915_request *rq; 88 unsigned long serial; 89 90 /* Just in case everything has gone horribly wrong, give it a kick */ 91 intel_engine_flush_submission(engine); 92 93 rq = engine->heartbeat.systole; 94 if (rq && i915_request_completed(rq)) { 95 i915_request_put(rq); 96 engine->heartbeat.systole = NULL; 97 } 98 99 if (!intel_engine_pm_get_if_awake(engine)) 100 return; 101 102 if (intel_gt_is_wedged(engine->gt)) 103 goto out; 104 105 if (engine->heartbeat.systole) { 106 long delay = READ_ONCE(engine->props.heartbeat_interval_ms); 107 108 /* Safeguard against too-fast worker invocations */ 109 if (!time_after(jiffies, 110 rq->emitted_jiffies + msecs_to_jiffies(delay))) 111 goto out; 112 113 if (!i915_sw_fence_signaled(&rq->submit)) { 114 /* 115 * Not yet submitted, system is stalled. 116 * 117 * This more often happens for ring submission, 118 * where all contexts are funnelled into a common 119 * ringbuffer. If one context is blocked on an 120 * external fence, not only is it not submitted, 121 * but all other contexts, including the kernel 122 * context are stuck waiting for the signal. 123 */ 124 } else if (engine->schedule && 125 rq->sched.attr.priority < I915_PRIORITY_BARRIER) { 126 /* 127 * Gradually raise the priority of the heartbeat to 128 * give high priority work [which presumably desires 129 * low latency and no jitter] the chance to naturally 130 * complete before being preempted. 131 */ 132 attr.priority = 0; 133 if (rq->sched.attr.priority >= attr.priority) 134 attr.priority = I915_PRIORITY_HEARTBEAT; 135 if (rq->sched.attr.priority >= attr.priority) 136 attr.priority = I915_PRIORITY_BARRIER; 137 138 local_bh_disable(); 139 engine->schedule(rq, &attr); 140 local_bh_enable(); 141 } else { 142 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 143 show_heartbeat(rq, engine); 144 145 intel_gt_handle_error(engine->gt, engine->mask, 146 I915_ERROR_CAPTURE, 147 "stopped heartbeat on %s", 148 engine->name); 149 } 150 151 rq->emitted_jiffies = jiffies; 152 goto out; 153 } 154 155 serial = READ_ONCE(engine->serial); 156 if (engine->wakeref_serial == serial) 157 goto out; 158 159 if (!mutex_trylock(&ce->timeline->mutex)) { 160 /* Unable to lock the kernel timeline, is the engine stuck? */ 161 if (xchg(&engine->heartbeat.blocked, serial) == serial) 162 intel_gt_handle_error(engine->gt, engine->mask, 163 I915_ERROR_CAPTURE, 164 "no heartbeat on %s", 165 engine->name); 166 goto out; 167 } 168 169 rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); 170 if (IS_ERR(rq)) 171 goto unlock; 172 173 heartbeat_commit(rq, &attr); 174 175 unlock: 176 mutex_unlock(&ce->timeline->mutex); 177 out: 178 if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) 179 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 180 intel_engine_pm_put(engine); 181 } 182 183 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) 184 { 185 if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) 186 return; 187 188 next_heartbeat(engine); 189 } 190 191 void intel_engine_park_heartbeat(struct intel_engine_cs *engine) 192 { 193 if (cancel_delayed_work(&engine->heartbeat.work)) 194 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 195 } 196 197 void intel_engine_init_heartbeat(struct intel_engine_cs *engine) 198 { 199 INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); 200 } 201 202 static int __intel_engine_pulse(struct intel_engine_cs *engine) 203 { 204 struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; 205 struct intel_context *ce = engine->kernel_context; 206 struct i915_request *rq; 207 208 lockdep_assert_held(&ce->timeline->mutex); 209 GEM_BUG_ON(!intel_engine_has_preemption(engine)); 210 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 211 212 rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); 213 if (IS_ERR(rq)) 214 return PTR_ERR(rq); 215 216 __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); 217 218 heartbeat_commit(rq, &attr); 219 GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); 220 221 return 0; 222 } 223 224 static unsigned long set_heartbeat(struct intel_engine_cs *engine, 225 unsigned long delay) 226 { 227 unsigned long old; 228 229 old = xchg(&engine->props.heartbeat_interval_ms, delay); 230 if (delay) 231 intel_engine_unpark_heartbeat(engine); 232 else 233 intel_engine_park_heartbeat(engine); 234 235 return old; 236 } 237 238 int intel_engine_set_heartbeat(struct intel_engine_cs *engine, 239 unsigned long delay) 240 { 241 struct intel_context *ce = engine->kernel_context; 242 int err = 0; 243 244 if (!delay && !intel_engine_has_preempt_reset(engine)) 245 return -ENODEV; 246 247 intel_engine_pm_get(engine); 248 249 err = mutex_lock_interruptible(&ce->timeline->mutex); 250 if (err) 251 goto out_rpm; 252 253 if (delay != engine->props.heartbeat_interval_ms) { 254 unsigned long saved = set_heartbeat(engine, delay); 255 256 /* recheck current execution */ 257 if (intel_engine_has_preemption(engine)) { 258 err = __intel_engine_pulse(engine); 259 if (err) 260 set_heartbeat(engine, saved); 261 } 262 } 263 264 mutex_unlock(&ce->timeline->mutex); 265 266 out_rpm: 267 intel_engine_pm_put(engine); 268 return err; 269 } 270 271 int intel_engine_pulse(struct intel_engine_cs *engine) 272 { 273 struct intel_context *ce = engine->kernel_context; 274 int err; 275 276 if (!intel_engine_has_preemption(engine)) 277 return -ENODEV; 278 279 if (!intel_engine_pm_get_if_awake(engine)) 280 return 0; 281 282 err = -EINTR; 283 if (!mutex_lock_interruptible(&ce->timeline->mutex)) { 284 err = __intel_engine_pulse(engine); 285 mutex_unlock(&ce->timeline->mutex); 286 } 287 288 intel_engine_flush_submission(engine); 289 intel_engine_pm_put(engine); 290 return err; 291 } 292 293 int intel_engine_flush_barriers(struct intel_engine_cs *engine) 294 { 295 struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN }; 296 struct intel_context *ce = engine->kernel_context; 297 struct i915_request *rq; 298 int err; 299 300 if (llist_empty(&engine->barrier_tasks)) 301 return 0; 302 303 if (!intel_engine_pm_get_if_awake(engine)) 304 return 0; 305 306 if (mutex_lock_interruptible(&ce->timeline->mutex)) { 307 err = -EINTR; 308 goto out_rpm; 309 } 310 311 rq = heartbeat_create(ce, GFP_KERNEL); 312 if (IS_ERR(rq)) { 313 err = PTR_ERR(rq); 314 goto out_unlock; 315 } 316 317 heartbeat_commit(rq, &attr); 318 319 err = 0; 320 out_unlock: 321 mutex_unlock(&ce->timeline->mutex); 322 out_rpm: 323 intel_engine_pm_put(engine); 324 return err; 325 } 326 327 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 328 #include "selftest_engine_heartbeat.c" 329 #endif 330