1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
3 
4 #include <linux/kernel.h>
5 #include <linux/percpu.h>
6 #include <linux/slab.h>
7 #include <linux/static_key.h>
8 #include <linux/interrupt.h>
9 #include <linux/idr.h>
10 #include <linux/irq.h>
11 #include <linux/math64.h>
12 #include <linux/log2.h>
13 
14 #include <trace/events/irq.h>
15 
16 #include "internals.h"
17 
18 DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
19 
20 DEFINE_PER_CPU(struct irq_timings, irq_timings);
21 
22 static DEFINE_IDR(irqt_stats);
23 
24 void irq_timings_enable(void)
25 {
26 	static_branch_enable(&irq_timing_enabled);
27 }
28 
29 void irq_timings_disable(void)
30 {
31 	static_branch_disable(&irq_timing_enabled);
32 }
33 
34 /*
35  * The main goal of this algorithm is to predict the next interrupt
36  * occurrence on the current CPU.
37  *
38  * Currently, the interrupt timings are stored in a circular array
39  * buffer every time there is an interrupt, as a tuple: the interrupt
40  * number and the associated timestamp when the event occurred <irq,
41  * timestamp>.
42  *
43  * For every interrupt occurring in a short period of time, we can
44  * measure the elapsed time between the occurrences for the same
45  * interrupt and we end up with a series of intervals. Experience
46  * has shown that interrupts often follow a periodic
47  * pattern.
48  *
49  * The objective of the algorithm is to find this periodic pattern
50  * as quickly as possible and use its period to predict the next irq event.
51  *
52  * When the next interrupt event is requested, we are in the situation
53  * where the interrupts are disabled and the circular buffer
54  * containing the timings is filled with the events which happened
55  * after the previous next-interrupt-event request.
56  *
57  * At this point, we read the circular buffer and we fill the irq
58  * related statistics structure. After this step, the circular array
59  * containing the timings is empty because all the values are
60  * dispatched in their corresponding buffers.
61  *
62  * Now, for each interrupt, we can predict the next event by using the
63  * suffix array, log interval and exponential moving average.
64  *
65  * 1. Suffix array
66  *
67  * A suffix array is an array of all the suffixes of a string. It is
68  * widely used as a data structure for compression, text search, ...
69  * For instance, for the word 'banana', the suffixes are: 'banana'
70  * 'anana' 'nana' 'ana' 'na' 'a'
71  *
72  * Usually the suffix array is sorted, but for our purpose it is
73  * not necessary and would not provide any improvement for the
74  * problem being solved, where we clearly bound the
75  * search with a max period and a min period.
76  *
77  * The suffix array builds a set of interval sequences of different
78  * lengths and looks for the repetition of each sequence. If the
79  * sequence repeats, then we have the period: it is the length of
80  * the sequence, whatever its position in the buffer.
81  *
82  * 2. Log interval
83  *
84  * We saw that the irq timings allow us to compute the interval of the
85  * occurrences for a specific interrupt. We can reasonably assume that
86  * the longer the interval, the higher the error for the next event,
87  * so we store those interval values into an array
88  * where each slot corresponds to an interval on the order of the power
89  * of 2 of the index. For example, index 12 will contain values
90  * between 2^11 and 2^12.
91  *
92  * At the end we have an array of values where each index defines a
93  * [2^(index - 1), 2^index] interval of values, allowing us to store a
94  * large number of values inside a small array.
95  *
96  * For example, if we have the value 1123, then we store it at
97  * index ilog2(1123) = 10.
98  *
99  * Storing those values at the specific index is done by computing an
100  * exponential moving average for this specific slot. For instance,
101  * the values 1800, 1123, 1453, ... all fall under the same slot (10) and
102  * the exponential moving average is recomputed every time a new value
103  * is stored at this slot.
104  *
105  * 3. Exponential Moving Average
106  *
107  * The EMA is widely used to track a signal for stocks or as a low
108  * pass filter. The appeal of the formula is that it is very simple and
109  * the reactivity of the average can be tuned with a factor called
110  * alpha.
111  *
112  * The higher alpha is, the faster the average responds to
113  * signal changes. In our case, if a slot in the array covers a big
114  * interval, we can have numbers with a big difference between
115  * them. The impact of those differences on the average computation
116  * can be tuned by changing the alpha value.
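 *
 * As an illustration of the computation done by irq_timings_ema_new()
 * below (a sketch with made-up slot values, using the EMA_ALPHA_VAL = 64
 * and EMA_ALPHA_SHIFT = 7 constants defined further down, i.e.
 * alpha = 64/128 = 0.5): a slot holding an average of 1400 that
 * receives a new interval of 1200 is updated to:
 *
 *	new_ema = ema_old + (((value - ema_old) * EMA_ALPHA_VAL)
 *	                     >> EMA_ALPHA_SHIFT)
 *	        = 1400 + (((1200 - 1400) * 64) >> 7)
 *	        = 1400 - 100
 *	        = 1300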
117  *
118  *
119  *  -- The algorithm --
120  *
121  * We saw the different processing steps above; now let's see how they
122  * are used together.
123  *
124  * For each interrupt:
125  *	For each interval:
126  *		Compute the index = ilog2(interval)
127  *		Compute a new_ema(buffer[index], interval)
128  *		Store the index in a circular buffer
129  *
130  *	Compute the suffix array of the indexes
131  *
132  *	For each suffix:
133  *		If the suffix is reverse-found 3 times
134  *			Return suffix
135  *
136  *	Return Not found
137  *
138  * However, we cannot build an endless suffix array; it would not
139  * make sense and it would add extra overhead, so we restrict
140  * this to a maximum suffix length of 5 and a minimum suffix length of
141  * 2. Experience has shown that 5 covers the majority of the maximum
142  * pattern periods found for different devices.
143  *
144  * The result is that finding the pattern takes less than 1us per interrupt.
145  *
146  * Example based on real values:
147  *
148  * Example 1 : MMC write/read interrupt interval:
149  *
150  *	223947, 1240, 1384, 1386, 1386,
151  *	217416, 1236, 1384, 1386, 1387,
152  *	214719, 1241, 1386, 1387, 1384,
153  *	213696, 1234, 1384, 1386, 1388,
154  *	219904, 1240, 1385, 1389, 1385,
155  *	212240, 1240, 1386, 1386, 1386,
156  *	214415, 1236, 1384, 1386, 1387,
157  *	214276, 1234, 1384, 1388, ?
158  *
159  * For each element, apply ilog2(value)
160  *
161  *	15, 8, 8, 8, 8,
162  *	15, 8, 8, 8, 8,
163  *	15, 8, 8, 8, 8,
164  *	15, 8, 8, 8, 8,
165  *	15, 8, 8, 8, 8,
166  *	15, 8, 8, 8, 8,
167  *	15, 8, 8, 8, 8,
168  *	15, 8, 8, 8, ?
169  *
170  * With a max period of 5, we take the last (max_period * 3) = 15 elements,
171  * as we can be confident that if the pattern repeats itself three times it
172  * is a repeating pattern.
173  *
174  *	             8,
175  *	15, 8, 8, 8, 8,
176  *	15, 8, 8, 8, 8,
177  *	15, 8, 8, 8, ?
178  *
179  * Suffixes are:
180  *
181  *  1) 8, 15, 8, 8, 8  <- max period
182  *  2) 8, 15, 8, 8
183  *  3) 8, 15, 8
184  *  4) 8, 15           <- min period
185  *
186  * From there we search the repeating pattern for each suffix.
187  *
188  * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
189  *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
190  *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
191  *                         8, 15, 8, 8, 8  |   |  |  |  |
192  *                                         8, 15, 8, 8, 8
193  *
194  * When moving the suffix, we found exactly 3 matches.
195  *
196  * The first suffix with period 5 is repeating.
197  *
198  * The index of the next event is (3 * max_period) % suffix_period
199  *
200  * In this example, the result is 0, so the next event is suffix[0] => 8
201  *
202  * However, 8 is the index in the array of exponential moving averages
203  * which was calculated on the fly when storing the values, so the
204  * predicted interval is ema[8] = 1366
205  *
206  *
207  * Example 2:
208  *
209  *	4, 3, 5, 100,
210  *	3, 3, 5, 117,
211  *	4, 4, 5, 112,
212  *	4, 3, 4, 110,
213  *	3, 5, 3, 117,
214  *	4, 4, 5, 112,
215  *	4, 3, 4, 110,
216  *	3, 4, 5, 112,
217  *	4, 3, 4, 110
218  *
219  * ilog2
220  *
221  *	0, 0, 0, 4,
222  *	0, 0, 0, 4,
223  *	0, 0, 0, 4,
224  *	0, 0, 0, 4,
225  *	0, 0, 0, 4,
226  *	0, 0, 0, 4,
227  *	0, 0, 0, 4,
228  *	0, 0, 0, 4,
229  *	0, 0, 0, 4
230  *
231  * Max period 5:
232  *	   0, 0, 4,
233  *	0, 0, 0, 4,
234  *	0, 0, 0, 4,
235  *	0, 0, 0, 4
236  *
237  * Suffixes:
238  *
239  *  1) 0, 0, 4, 0, 0
240  *  2) 0, 0, 4, 0
241  *  3) 0, 0, 4
242  *  4) 0, 0
243  *
244  * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
245  *         |  |  |  |  |  |  X
246  *         0, 0, 4, 0, 0, |  X
247  *                        0, 0
248  *
249  * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
250  *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
251  *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
252  *                     0, 0, 4, 0, |  |  |  |  |  |  |
253  *                                 0, 0, 4, 0, |  |  |
254  *                                             0  0  4
255  *
256  * The pattern is found 3 times; the remaining index, which results from
257  * (max_period * 3) % suffix_period = 15 % 4, is 3. This value is the
258  * index in the suffix array: the suffix for period 4 (0, 0, 4, 0) has
259  * the value 0 at index 3, so the next expected interval falls into slot 0.
260  */
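/* EMA weight: alpha = EMA_ALPHA_VAL / 2^EMA_ALPHA_SHIFT = 64/128 = 0.5 */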
261 #define EMA_ALPHA_VAL		64
262 #define EMA_ALPHA_SHIFT		7
263 
264 #define PREDICTION_PERIOD_MIN	3
265 #define PREDICTION_PERIOD_MAX	5
266 #define PREDICTION_FACTOR	4
267 #define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */
268 #define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */
269 
270 /*
271  * Number of elements in the circular buffer: if the buffer happened to
272  * be flushed before, then the number of elements could be smaller than
273  * IRQ_TIMINGS_SIZE, so the count is used; otherwise the array size is
274  * used, as we wrapped. The index begins from zero when we did not
275  * wrap. That could be done in a nicer way with a proper circular
276  * array structure type, but at the cost of extra computation in the
277  * interrupt handler hot path. We choose efficiency.
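 *
 * For example, assuming IRQ_TIMINGS_SIZE is 32 (1 << IRQ_TIMINGS_SHIFT
 * in internals.h), a count of 70 means the buffer wrapped: the
 * iteration starts at index 70 & IRQ_TIMINGS_MASK = 6, the count is
 * clamped to 32 and all 32 entries are visited, oldest first.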
278  */
279 #define for_each_irqts(i, irqts)					\
280 	for (i = irqts->count < IRQ_TIMINGS_SIZE ?			\
281 		     0 : irqts->count & IRQ_TIMINGS_MASK,		\
282 		     irqts->count = min(IRQ_TIMINGS_SIZE,		\
283 					irqts->count);			\
284 	     irqts->count > 0; irqts->count--,				\
285 		     i = (i + 1) & IRQ_TIMINGS_MASK)
286 
287 struct irqt_stat {
288 	u64	last_ts;
289 	u64	ema_time[PREDICTION_BUFFER_SIZE];
290 	int	timings[IRQ_TIMINGS_SIZE];
291 	int	circ_timings[IRQ_TIMINGS_SIZE];
292 	int	count;
293 };
294 
295 /*
296  * Exponential moving average computation
297  */
298 static u64 irq_timings_ema_new(u64 value, u64 ema_old)
299 {
300 	s64 diff;
301 
302 	if (unlikely(!ema_old))
303 		return value;
304 
305 	diff = (value - ema_old) * EMA_ALPHA_VAL;
306 	/*
307 	 * A s64 variable can safely be added to the u64 ema_old
308 	 * variable because ema_old will never have its topmost
309 	 * bit set: it will always be smaller than a 2^63 nanosec
310 	 * interrupt interval (292 years).
311 	 */
312 	return ema_old + (diff >> EMA_ALPHA_SHIFT);
313 }
314 
315 static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
316 {
317 	int period;
318 
319 	/*
320 	 * Move the beginning pointer to the end minus the max period x 3.
321 	 * From there we can begin searching for the pattern.
322 	 */
323 	buffer = &buffer[len - (period_max * 3)];
324 
325 	/* Adjust the length to the maximum allowed period x 3 */
326 	len = period_max * 3;
327 
328 	/*
329 	 * The buffer contains the sequence of intervals, in ilog2
330 	 * form, and we are looking for a repetition. The search
331 	 * starts three times the length of the maximum period
332 	 * from the end of the buffer. We do that for
333 	 * each suffix.
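	 *
	 * For instance, with the 15-element buffer from Example 1 above
	 * (8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8) and
	 * period_max = 5, the period-5 suffix matches three times and
	 * the function returns buffer[15 % 5] = buffer[0] = 8.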
334 	 */
335 	for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
336 
337 		/*
338 		 * The first comparison always succeeds because the
339 		 * suffix is deduced from the first 'period' elements of
340 		 * the buffer and we would be comparing the initial suffix
341 		 * with itself, so we can skip the first iteration.
342 		 */
343 		int idx = period;
344 		size_t size = period;
345 
346 		/*
347 		 * We check whether the sequence with period 'period'
348 		 * repeats itself. Even if it is truncated at the end, as it
349 		 * repeats we can use the period to find out the next
350 		 * element with the modulo.
351 		 */
352 		while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
353 
354 			/*
355 			 * Move the index forward one period at a time
356 			 */
357 			idx += size;
358 
359 			/*
360 			 * If this condition is reached, all previous
361 			 * memcmp were successful, so the period is
362 			 * found.
363 			 */
364 			if (idx == len)
365 				return buffer[len % period];
366 
367 			/*
368 			 * If the remaining elements to compare are
369 			 * smaller than the period, readjust the size
370 			 * of the comparison for the last iteration.
371 			 */
372 			if (len - idx < period)
373 				size = len - idx;
374 		}
375 	}
376 
377 	return -1;
378 }
379 
380 static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
381 {
382 	int index, i, period_max, count, start, min = INT_MAX;
383 
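	/*
	 * If no interrupt for this irq has been recorded within the
	 * last second, the data is stale: reset the statistics and
	 * tell the caller nothing can be predicted.
	 */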
384 	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
385 		irqs->count = irqs->last_ts = 0;
386 		return U64_MAX;
387 	}
388 
389 	/*
390 	 * As we want to find the repetition three times, we need a
391 	 * number of intervals greater than or equal to three times the
392 	 * maximum period; otherwise we truncate the max period.
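	 *
	 * With PREDICTION_PERIOD_MAX = 5 and PREDICTION_PERIOD_MIN = 3,
	 * this means at least 12 stored intervals are needed before a
	 * prediction is attempted: 12 / 3 = 4 > PREDICTION_PERIOD_MIN,
	 * while 9 / 3 = 3 would be rejected just below.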
393 	 */
394 	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
395 		PREDICTION_PERIOD_MAX : irqs->count / 3;
396 
397 	/*
398 	 * If we don't have enough irq timings for this prediction,
399 	 * just bail out.
400 	 */
401 	if (period_max <= PREDICTION_PERIOD_MIN)
402 		return U64_MAX;
403 
404 	/*
405 	 * 'count' depends on whether the circular buffer wrapped or not
406 	 */
407 	count = irqs->count < IRQ_TIMINGS_SIZE ?
408 		irqs->count : IRQ_TIMINGS_SIZE;
409 
410 	start = irqs->count < IRQ_TIMINGS_SIZE ?
411 		0 : (irqs->count & IRQ_TIMINGS_MASK);
412 
413 	/*
414 	 * Copy the content of the circular buffer into another buffer
415 	 * in order to linearize it instead of dealing with
416 	 * wrapping indexes and shifted arrays, which would be error
417 	 * prone and extremely difficult to debug.
418 	 */
419 	for (i = 0; i < count; i++) {
420 		int index = (start + i) & IRQ_TIMINGS_MASK;
421 
422 		irqs->timings[i] = irqs->circ_timings[index];
423 		min = min_t(int, irqs->timings[i], min);
424 	}
425 
426 	index = irq_timings_next_event_index(irqs->timings, count, period_max);
427 	if (index < 0)
428 		return irqs->last_ts + irqs->ema_time[min];
429 
430 	return irqs->last_ts + irqs->ema_time[index];
431 }
432 
433 static __always_inline int irq_timings_interval_index(u64 interval)
434 {
435 	/*
436 	 * The PREDICTION_FACTOR increases the interval size for the
437 	 * array of exponential averages.
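	 *
	 * As a worked example of the arithmetic (made-up interval, not
	 * measured data): an interval of 1200000 ns gives
	 * 1200000 >> 10 = 1171, divided by PREDICTION_FACTOR (4) gives
	 * 292, and ilog2(292) = 8, so the interval lands in slot 8.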
438 	 */
439 	u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
440 
441 	return likely(interval_us) ? ilog2(interval_us) : 0;
442 }
443 
444 static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
445 						u64 interval)
446 {
447 	int index;
448 
449 	/*
450 	 * Get the index in the ema table for this interrupt.
451 	 */
452 	index = irq_timings_interval_index(interval);
453 
454 	/*
455 	 * Store the index as an element of the pattern in another
456 	 * circular array.
457 	 */
458 	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
459 
460 	irqs->ema_time[index] = irq_timings_ema_new(interval,
461 						    irqs->ema_time[index]);
462 
463 	irqs->count++;
464 }
465 
466 static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
467 {
468 	u64 old_ts = irqs->last_ts;
469 	u64 interval;
470 
471 	/*
472 	 * The timestamps are absolute time values; we need to compute
473 	 * the timing interval between two interrupts.
474 	 */
475 	irqs->last_ts = ts;
476 
477 	/*
478 	 * The interval type is u64 in order to use the same
479 	 * type throughout the computation, which avoids tricky issues
480 	 * with overflow, sign and division.
481 	 */
482 	interval = ts - old_ts;
483 
484 	/*
485 	 * If the interrupts trigger more than one second apart, the
486 	 * sequence is no longer predictable for our purpose. In this
487 	 * case, assume we are at the beginning of a new sequence and
488 	 * this timestamp is its first value. As it is impossible to
489 	 * predict anything at this point, return.
490 	 *
491 	 * Note the first timestamp of a sequence will always trigger
492 	 * this test because old_ts is zero. That is what we
493 	 * want, as we need another timestamp to compute an interval.
494 	 */
495 	if (interval >= NSEC_PER_SEC) {
496 		irqs->count = 0;
497 		return;
498 	}
499 
500 	__irq_timings_store(irq, irqs, interval);
501 }
502 
503 /**
504  * irq_timings_next_event - Return when the next event is supposed to arrive
505  *
506  * During the last busy cycle, the number of interrupts is incremented
507  * and stored in the irq_timings structure. This information is
508  * necessary to:
509  *
510  * - know if the index in the table wrapped up:
511  *
512  *      If more interrupts than the array size happened during the
513  *      last busy/idle cycle, the index wrapped around and we have to
514  *      begin with the next element in the array, which is the oldest
515  *      one in the sequence; otherwise we begin at index 0.
516  *
517  * - have an indication of the interrupt activity on this CPU
518  *   (e.g. irqs/sec)
519  *
520  * The values are 'consumed' after being inserted into the statistical
521  * model, thus the count is reinitialized.
522  *
523  * The array of values **must** be browsed in time order: the
524  * timestamp must increase from one element to the next.
525  *
526  * Returns a nanosecond based estimation of the earliest next interrupt
527  * event, or U64_MAX if no prediction can be made.
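 *
 * A minimal usage sketch (hypothetical caller, e.g. a cpuidle
 * governor, with local irqs already disabled and 'now' taken from
 * the same clock as the recorded timestamps):
 *
 *	u64 now = local_clock();
 *	u64 next = irq_timings_next_event(now);
 *
 *	if (next != U64_MAX)
 *		sleep_length = min(sleep_length, next - now);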
528  */
529 u64 irq_timings_next_event(u64 now)
530 {
531 	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
532 	struct irqt_stat *irqs;
533 	struct irqt_stat __percpu *s;
534 	u64 ts, next_evt = U64_MAX;
535 	int i, irq = 0;
536 
537 	/*
538 	 * This function must be called with local irqs disabled in
539 	 * order to prevent the timings circular buffer from being
540 	 * updated while we are reading it.
541 	 */
542 	lockdep_assert_irqs_disabled();
543 
544 	if (!irqts->count)
545 		return next_evt;
546 
547 	/*
548 	 * See the comment above for_each_irqts() for how the number of
549 	 * elements and the start index are derived from the state of
550 	 * the circular buffer.
551 	 *
557 	 * Inject the measured irq/timestamp values into the pattern
558 	 * prediction model while decrementing the counter, because we
559 	 * consume the data from our circular buffer.
560 	 */
561 	for_each_irqts(i, irqts) {
562 		irq = irq_timing_decode(irqts->values[i], &ts);
563 		s = idr_find(&irqt_stats, irq);
564 		if (s)
565 			irq_timings_store(irq, this_cpu_ptr(s), ts);
566 	}
567 
568 	/*
569 	 * Look through the interrupts' statistics for the earliest
570 	 * next event.
571 	 */
572 	idr_for_each_entry(&irqt_stats, s, i) {
573 
574 		irqs = this_cpu_ptr(s);
575 
576 		ts = __irq_timings_next_event(irqs, i, now);
577 		if (ts <= now)
578 			return now;
579 
580 		if (ts < next_evt)
581 			next_evt = ts;
582 	}
583 
584 	return next_evt;
585 }
586 
587 void irq_timings_free(int irq)
588 {
589 	struct irqt_stat __percpu *s;
590 
591 	s = idr_find(&irqt_stats, irq);
592 	if (s) {
593 		free_percpu(s);
594 		idr_remove(&irqt_stats, irq);
595 	}
596 }
597 
598 int irq_timings_alloc(int irq)
599 {
600 	struct irqt_stat __percpu *s;
601 	int id;
602 
603 	/*
604 	 * Some platforms can have the same private interrupt per cpu,
605 	 * so this function may be called several times with the
606 	 * same interrupt number. Just bail out in case the per cpu
607 	 * stat structure is already allocated.
608 	 */
609 	s = idr_find(&irqt_stats, irq);
610 	if (s)
611 		return 0;
612 
613 	s = alloc_percpu(*s);
614 	if (!s)
615 		return -ENOMEM;
616 
617 	idr_preload(GFP_KERNEL);
618 	id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
619 	idr_preload_end();
620 
621 	if (id < 0) {
622 		free_percpu(s);
623 		return id;
624 	}
625 
626 	return 0;
627 }
628