1 /* 2 * SPDX-License-Identifier: MIT 3 * 4 * Copyright © 2019 Intel Corporation 5 */ 6 7 #include "i915_drv.h" 8 #include "i915_request.h" 9 10 #include "intel_context.h" 11 #include "intel_engine_heartbeat.h" 12 #include "intel_engine_pm.h" 13 #include "intel_engine.h" 14 #include "intel_gt.h" 15 #include "intel_reset.h" 16 17 /* 18 * While the engine is active, we send a periodic pulse along the engine 19 * to check on its health and to flush any idle-barriers. If that request 20 * is stuck, and we fail to preempt it, we declare the engine hung and 21 * issue a reset -- in the hope that restores progress. 22 */ 23 24 static bool next_heartbeat(struct intel_engine_cs *engine) 25 { 26 long delay; 27 28 delay = READ_ONCE(engine->props.heartbeat_interval_ms); 29 if (!delay) 30 return false; 31 32 delay = msecs_to_jiffies_timeout(delay); 33 if (delay >= HZ) 34 delay = round_jiffies_up_relative(delay); 35 mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay); 36 37 return true; 38 } 39 40 static struct i915_request * 41 heartbeat_create(struct intel_context *ce, gfp_t gfp) 42 { 43 struct i915_request *rq; 44 45 intel_context_enter(ce); 46 rq = __i915_request_create(ce, gfp); 47 intel_context_exit(ce); 48 49 return rq; 50 } 51 52 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) 53 { 54 engine->wakeref_serial = READ_ONCE(engine->serial) + 1; 55 i915_request_add_active_barriers(rq); 56 if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine)) 57 engine->heartbeat.systole = i915_request_get(rq); 58 } 59 60 static void heartbeat_commit(struct i915_request *rq, 61 const struct i915_sched_attr *attr) 62 { 63 idle_pulse(rq->engine, rq); 64 65 __i915_request_commit(rq); 66 __i915_request_queue(rq, attr); 67 } 68 69 static void show_heartbeat(const struct i915_request *rq, 70 struct intel_engine_cs *engine) 71 { 72 struct drm_printer p = drm_debug_printer("heartbeat"); 73 74 intel_engine_dump(engine, &p, 75 "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", 76 engine->name, 77 rq->fence.context, 78 rq->fence.seqno, 79 rq->sched.attr.priority); 80 } 81 82 static void heartbeat(struct work_struct *wrk) 83 { 84 struct i915_sched_attr attr = { 85 .priority = I915_USER_PRIORITY(I915_PRIORITY_MIN), 86 }; 87 struct intel_engine_cs *engine = 88 container_of(wrk, typeof(*engine), heartbeat.work.work); 89 struct intel_context *ce = engine->kernel_context; 90 struct i915_request *rq; 91 unsigned long serial; 92 93 /* Just in case everything has gone horribly wrong, give it a kick */ 94 intel_engine_flush_submission(engine); 95 96 rq = engine->heartbeat.systole; 97 if (rq && i915_request_completed(rq)) { 98 i915_request_put(rq); 99 engine->heartbeat.systole = NULL; 100 } 101 102 if (!intel_engine_pm_get_if_awake(engine)) 103 return; 104 105 if (intel_gt_is_wedged(engine->gt)) 106 goto out; 107 108 if (engine->heartbeat.systole) { 109 if (!i915_sw_fence_signaled(&rq->submit)) { 110 /* 111 * Not yet submitted, system is stalled. 112 * 113 * This more often happens for ring submission, 114 * where all contexts are funnelled into a common 115 * ringbuffer. If one context is blocked on an 116 * external fence, not only is it not submitted, 117 * but all other contexts, including the kernel 118 * context are stuck waiting for the signal. 119 */ 120 } else if (engine->schedule && 121 rq->sched.attr.priority < I915_PRIORITY_BARRIER) { 122 /* 123 * Gradually raise the priority of the heartbeat to 124 * give high priority work [which presumably desires 125 * low latency and no jitter] the chance to naturally 126 * complete before being preempted. 127 */ 128 attr.priority = I915_PRIORITY_MASK; 129 if (rq->sched.attr.priority >= attr.priority) 130 attr.priority |= I915_USER_PRIORITY(I915_PRIORITY_HEARTBEAT); 131 if (rq->sched.attr.priority >= attr.priority) 132 attr.priority = I915_PRIORITY_BARRIER; 133 134 local_bh_disable(); 135 engine->schedule(rq, &attr); 136 local_bh_enable(); 137 } else { 138 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 139 show_heartbeat(rq, engine); 140 141 intel_gt_handle_error(engine->gt, engine->mask, 142 I915_ERROR_CAPTURE, 143 "stopped heartbeat on %s", 144 engine->name); 145 } 146 goto out; 147 } 148 149 serial = READ_ONCE(engine->serial); 150 if (engine->wakeref_serial == serial) 151 goto out; 152 153 if (!mutex_trylock(&ce->timeline->mutex)) { 154 /* Unable to lock the kernel timeline, is the engine stuck? */ 155 if (xchg(&engine->heartbeat.blocked, serial) == serial) 156 intel_gt_handle_error(engine->gt, engine->mask, 157 I915_ERROR_CAPTURE, 158 "no heartbeat on %s", 159 engine->name); 160 goto out; 161 } 162 163 rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); 164 if (IS_ERR(rq)) 165 goto unlock; 166 167 heartbeat_commit(rq, &attr); 168 169 unlock: 170 mutex_unlock(&ce->timeline->mutex); 171 out: 172 if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) 173 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 174 intel_engine_pm_put(engine); 175 } 176 177 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) 178 { 179 if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) 180 return; 181 182 next_heartbeat(engine); 183 } 184 185 void intel_engine_park_heartbeat(struct intel_engine_cs *engine) 186 { 187 if (cancel_delayed_work(&engine->heartbeat.work)) 188 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 189 } 190 191 void intel_engine_init_heartbeat(struct intel_engine_cs *engine) 192 { 193 INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); 194 } 195 196 static int __intel_engine_pulse(struct intel_engine_cs *engine) 197 { 198 struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; 199 struct intel_context *ce = engine->kernel_context; 200 struct i915_request *rq; 201 202 lockdep_assert_held(&ce->timeline->mutex); 203 GEM_BUG_ON(!intel_engine_has_preemption(engine)); 204 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 205 206 rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); 207 if (IS_ERR(rq)) 208 return PTR_ERR(rq); 209 210 __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); 211 212 heartbeat_commit(rq, &attr); 213 GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); 214 215 return 0; 216 } 217 218 static unsigned long set_heartbeat(struct intel_engine_cs *engine, 219 unsigned long delay) 220 { 221 unsigned long old; 222 223 old = xchg(&engine->props.heartbeat_interval_ms, delay); 224 if (delay) 225 intel_engine_unpark_heartbeat(engine); 226 else 227 intel_engine_park_heartbeat(engine); 228 229 return old; 230 } 231 232 int intel_engine_set_heartbeat(struct intel_engine_cs *engine, 233 unsigned long delay) 234 { 235 struct intel_context *ce = engine->kernel_context; 236 int err = 0; 237 238 if (!delay && !intel_engine_has_preempt_reset(engine)) 239 return -ENODEV; 240 241 intel_engine_pm_get(engine); 242 243 err = mutex_lock_interruptible(&ce->timeline->mutex); 244 if (err) 245 goto out_rpm; 246 247 if (delay != engine->props.heartbeat_interval_ms) { 248 unsigned long saved = set_heartbeat(engine, delay); 249 250 /* recheck current execution */ 251 if (intel_engine_has_preemption(engine)) { 252 err = __intel_engine_pulse(engine); 253 if (err) 254 set_heartbeat(engine, saved); 255 } 256 } 257 258 mutex_unlock(&ce->timeline->mutex); 259 260 out_rpm: 261 intel_engine_pm_put(engine); 262 return err; 263 } 264 265 int intel_engine_pulse(struct intel_engine_cs *engine) 266 { 267 struct intel_context *ce = engine->kernel_context; 268 int err; 269 270 if (!intel_engine_has_preemption(engine)) 271 return -ENODEV; 272 273 if (!intel_engine_pm_get_if_awake(engine)) 274 return 0; 275 276 err = -EINTR; 277 if (!mutex_lock_interruptible(&ce->timeline->mutex)) { 278 err = __intel_engine_pulse(engine); 279 mutex_unlock(&ce->timeline->mutex); 280 } 281 282 intel_engine_pm_put(engine); 283 return err; 284 } 285 286 int intel_engine_flush_barriers(struct intel_engine_cs *engine) 287 { 288 struct i915_sched_attr attr = { 289 .priority = I915_USER_PRIORITY(I915_PRIORITY_MIN), 290 }; 291 struct intel_context *ce = engine->kernel_context; 292 struct i915_request *rq; 293 int err; 294 295 if (llist_empty(&engine->barrier_tasks)) 296 return 0; 297 298 if (!intel_engine_pm_get_if_awake(engine)) 299 return 0; 300 301 if (mutex_lock_interruptible(&ce->timeline->mutex)) { 302 err = -EINTR; 303 goto out_rpm; 304 } 305 306 rq = heartbeat_create(ce, GFP_KERNEL); 307 if (IS_ERR(rq)) { 308 err = PTR_ERR(rq); 309 goto out_unlock; 310 } 311 312 heartbeat_commit(rq, &attr); 313 314 err = 0; 315 out_unlock: 316 mutex_unlock(&ce->timeline->mutex); 317 out_rpm: 318 intel_engine_pm_put(engine); 319 return err; 320 } 321 322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 323 #include "selftest_engine_heartbeat.c" 324 #endif 325