1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> 3 4 #include <linux/kernel.h> 5 #include <linux/percpu.h> 6 #include <linux/slab.h> 7 #include <linux/static_key.h> 8 #include <linux/interrupt.h> 9 #include <linux/idr.h> 10 #include <linux/irq.h> 11 #include <linux/math64.h> 12 #include <linux/log2.h> 13 14 #include <trace/events/irq.h> 15 16 #include "internals.h" 17 18 DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); 19 20 DEFINE_PER_CPU(struct irq_timings, irq_timings); 21 22 static DEFINE_IDR(irqt_stats); 23 24 void irq_timings_enable(void) 25 { 26 static_branch_enable(&irq_timing_enabled); 27 } 28 29 void irq_timings_disable(void) 30 { 31 static_branch_disable(&irq_timing_enabled); 32 } 33 34 /* 35 * The main goal of this algorithm is to predict the next interrupt 36 * occurrence on the current CPU. 37 * 38 * Currently, the interrupt timings are stored in a circular array 39 * buffer every time there is an interrupt, as a tuple: the interrupt 40 * number and the associated timestamp when the event occurred <irq, 41 * timestamp>. 42 * 43 * For every interrupt occurring in a short period of time, we can 44 * measure the elapsed time between the occurrences for the same 45 * interrupt and we end up with a suite of intervals. The experience 46 * showed the interrupts are often coming following a periodic 47 * pattern. 48 * 49 * The objective of the algorithm is to find out this periodic pattern 50 * in a fastest way and use its period to predict the next irq event. 51 * 52 * When the next interrupt event is requested, we are in the situation 53 * where the interrupts are disabled and the circular buffer 54 * containing the timings is filled with the events which happened 55 * after the previous next-interrupt-event request. 56 * 57 * At this point, we read the circular buffer and we fill the irq 58 * related statistics structure. After this step, the circular array 59 * containing the timings is empty because all the values are 60 * dispatched in their corresponding buffers. 61 * 62 * Now for each interrupt, we can predict the next event by using the 63 * suffix array, log interval and exponential moving average 64 * 65 * 1. Suffix array 66 * 67 * Suffix array is an array of all the suffixes of a string. It is 68 * widely used as a data structure for compression, text search, ... 69 * For instance for the word 'banana', the suffixes will be: 'banana' 70 * 'anana' 'nana' 'ana' 'na' 'a' 71 * 72 * Usually, the suffix array is sorted but for our purpose it is 73 * not necessary and won't provide any improvement in the context of 74 * the solved problem where we clearly define the boundaries of the 75 * search by a max period and min period. 76 * 77 * The suffix array will build a suite of intervals of different 78 * length and will look for the repetition of each suite. If the suite 79 * is repeating then we have the period because it is the length of 80 * the suite whatever its position in the buffer. 81 * 82 * 2. Log interval 83 * 84 * We saw the irq timings allow to compute the interval of the 85 * occurrences for a specific interrupt. We can reasonibly assume the 86 * longer is the interval, the higher is the error for the next event 87 * and we can consider storing those interval values into an array 88 * where each slot in the array correspond to an interval at the power 89 * of 2 of the index. For example, index 12 will contain values 90 * between 2^11 and 2^12. 91 * 92 * At the end we have an array of values where at each index defines a 93 * [2^index - 1, 2 ^ index] interval values allowing to store a large 94 * number of values inside a small array. 95 * 96 * For example, if we have the value 1123, then we store it at 97 * ilog2(1123) = 10 index value. 98 * 99 * Storing those value at the specific index is done by computing an 100 * exponential moving average for this specific slot. For instance, 101 * for values 1800, 1123, 1453, ... fall under the same slot (10) and 102 * the exponential moving average is computed every time a new value 103 * is stored at this slot. 104 * 105 * 3. Exponential Moving Average 106 * 107 * The EMA is largely used to track a signal for stocks or as a low 108 * pass filter. The magic of the formula, is it is very simple and the 109 * reactivity of the average can be tuned with the factors called 110 * alpha. 111 * 112 * The higher the alphas are, the faster the average respond to the 113 * signal change. In our case, if a slot in the array is a big 114 * interval, we can have numbers with a big difference between 115 * them. The impact of those differences in the average computation 116 * can be tuned by changing the alpha value. 117 * 118 * 119 * -- The algorithm -- 120 * 121 * We saw the different processing above, now let's see how they are 122 * used together. 123 * 124 * For each interrupt: 125 * For each interval: 126 * Compute the index = ilog2(interval) 127 * Compute a new_ema(buffer[index], interval) 128 * Store the index in a circular buffer 129 * 130 * Compute the suffix array of the indexes 131 * 132 * For each suffix: 133 * If the suffix is reverse-found 3 times 134 * Return suffix 135 * 136 * Return Not found 137 * 138 * However we can not have endless suffix array to be build, it won't 139 * make sense and it will add an extra overhead, so we can restrict 140 * this to a maximum suffix length of 5 and a minimum suffix length of 141 * 2. The experience showed 5 is the majority of the maximum pattern 142 * period found for different devices. 143 * 144 * The result is a pattern finding less than 1us for an interrupt. 145 * 146 * Example based on real values: 147 * 148 * Example 1 : MMC write/read interrupt interval: 149 * 150 * 223947, 1240, 1384, 1386, 1386, 151 * 217416, 1236, 1384, 1386, 1387, 152 * 214719, 1241, 1386, 1387, 1384, 153 * 213696, 1234, 1384, 1386, 1388, 154 * 219904, 1240, 1385, 1389, 1385, 155 * 212240, 1240, 1386, 1386, 1386, 156 * 214415, 1236, 1384, 1386, 1387, 157 * 214276, 1234, 1384, 1388, ? 158 * 159 * For each element, apply ilog2(value) 160 * 161 * 15, 8, 8, 8, 8, 162 * 15, 8, 8, 8, 8, 163 * 15, 8, 8, 8, 8, 164 * 15, 8, 8, 8, 8, 165 * 15, 8, 8, 8, 8, 166 * 15, 8, 8, 8, 8, 167 * 15, 8, 8, 8, 8, 168 * 15, 8, 8, 8, ? 169 * 170 * Max period of 5, we take the last (max_period * 3) 15 elements as 171 * we can be confident if the pattern repeats itself three times it is 172 * a repeating pattern. 173 * 174 * 8, 175 * 15, 8, 8, 8, 8, 176 * 15, 8, 8, 8, 8, 177 * 15, 8, 8, 8, ? 178 * 179 * Suffixes are: 180 * 181 * 1) 8, 15, 8, 8, 8 <- max period 182 * 2) 8, 15, 8, 8 183 * 3) 8, 15, 8 184 * 4) 8, 15 <- min period 185 * 186 * From there we search the repeating pattern for each suffix. 187 * 188 * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 189 * | | | | | | | | | | | | | | | 190 * 8, 15, 8, 8, 8 | | | | | | | | | | 191 * 8, 15, 8, 8, 8 | | | | | 192 * 8, 15, 8, 8, 8 193 * 194 * When moving the suffix, we found exactly 3 matches. 195 * 196 * The first suffix with period 5 is repeating. 197 * 198 * The next event is (3 * max_period) % suffix_period 199 * 200 * In this example, the result 0, so the next event is suffix[0] => 8 201 * 202 * However, 8 is the index in the array of exponential moving average 203 * which was calculated on the fly when storing the values, so the 204 * interval is ema[8] = 1366 205 * 206 * 207 * Example 2: 208 * 209 * 4, 3, 5, 100, 210 * 3, 3, 5, 117, 211 * 4, 4, 5, 112, 212 * 4, 3, 4, 110, 213 * 3, 5, 3, 117, 214 * 4, 4, 5, 112, 215 * 4, 3, 4, 110, 216 * 3, 4, 5, 112, 217 * 4, 3, 4, 110 218 * 219 * ilog2 220 * 221 * 0, 0, 0, 4, 222 * 0, 0, 0, 4, 223 * 0, 0, 0, 4, 224 * 0, 0, 0, 4, 225 * 0, 0, 0, 4, 226 * 0, 0, 0, 4, 227 * 0, 0, 0, 4, 228 * 0, 0, 0, 4, 229 * 0, 0, 0, 4 230 * 231 * Max period 5: 232 * 0, 0, 4, 233 * 0, 0, 0, 4, 234 * 0, 0, 0, 4, 235 * 0, 0, 0, 4 236 * 237 * Suffixes: 238 * 239 * 1) 0, 0, 4, 0, 0 240 * 2) 0, 0, 4, 0 241 * 3) 0, 0, 4 242 * 4) 0, 0 243 * 244 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 245 * | | | | | | X 246 * 0, 0, 4, 0, 0, | X 247 * 0, 0 248 * 249 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 250 * | | | | | | | | | | | | | | | 251 * 0, 0, 4, 0, | | | | | | | | | | | 252 * 0, 0, 4, 0, | | | | | | | 253 * 0, 0, 4, 0, | | | 254 * 0 0 4 255 * 256 * Pattern is found 3 times, the remaining is 1 which results from 257 * (max_period * 3) % suffix_period. This value is the index in the 258 * suffix arrays. The suffix array for a period 4 has the value 4 259 * at index 1. 260 */ 261 #define EMA_ALPHA_VAL 64 262 #define EMA_ALPHA_SHIFT 7 263 264 #define PREDICTION_PERIOD_MIN 3 265 #define PREDICTION_PERIOD_MAX 5 266 #define PREDICTION_FACTOR 4 267 #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ 268 #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ 269 270 /* 271 * Number of elements in the circular buffer: If it happens it was 272 * flushed before, then the number of elements could be smaller than 273 * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is 274 * used as we wrapped. The index begins from zero when we did not 275 * wrap. That could be done in a nicer way with the proper circular 276 * array structure type but with the cost of extra computation in the 277 * interrupt handler hot path. We choose efficiency. 278 */ 279 #define for_each_irqts(i, irqts) \ 280 for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ 281 0 : irqts->count & IRQ_TIMINGS_MASK, \ 282 irqts->count = min(IRQ_TIMINGS_SIZE, \ 283 irqts->count); \ 284 irqts->count > 0; irqts->count--, \ 285 i = (i + 1) & IRQ_TIMINGS_MASK) 286 287 struct irqt_stat { 288 u64 last_ts; 289 u64 ema_time[PREDICTION_BUFFER_SIZE]; 290 int timings[IRQ_TIMINGS_SIZE]; 291 int circ_timings[IRQ_TIMINGS_SIZE]; 292 int count; 293 }; 294 295 /* 296 * Exponential moving average computation 297 */ 298 static u64 irq_timings_ema_new(u64 value, u64 ema_old) 299 { 300 s64 diff; 301 302 if (unlikely(!ema_old)) 303 return value; 304 305 diff = (value - ema_old) * EMA_ALPHA_VAL; 306 /* 307 * We can use a s64 type variable to be added with the u64 308 * ema_old variable as this one will never have its topmost 309 * bit set, it will be always smaller than 2^63 nanosec 310 * interrupt interval (292 years). 311 */ 312 return ema_old + (diff >> EMA_ALPHA_SHIFT); 313 } 314 315 static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) 316 { 317 int period; 318 319 /* 320 * Move the beginning pointer to the end minus the max period x 3. 321 * We are at the point we can begin searching the pattern 322 */ 323 buffer = &buffer[len - (period_max * 3)]; 324 325 /* Adjust the length to the maximum allowed period x 3 */ 326 len = period_max * 3; 327 328 /* 329 * The buffer contains the suite of intervals, in a ilog2 330 * basis, we are looking for a repetition. We point the 331 * beginning of the search three times the length of the 332 * period beginning at the end of the buffer. We do that for 333 * each suffix. 334 */ 335 for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { 336 337 /* 338 * The first comparison always succeed because the 339 * suffix is deduced from the first n-period bytes of 340 * the buffer and we compare the initial suffix with 341 * itself, so we can skip the first iteration. 342 */ 343 int idx = period; 344 size_t size = period; 345 346 /* 347 * We look if the suite with period 'i' repeat 348 * itself. If it is truncated at the end, as it 349 * repeats we can use the period to find out the next 350 * element with the modulo. 351 */ 352 while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { 353 354 /* 355 * Move the index in a period basis 356 */ 357 idx += size; 358 359 /* 360 * If this condition is reached, all previous 361 * memcmp were successful, so the period is 362 * found. 363 */ 364 if (idx == len) 365 return buffer[len % period]; 366 367 /* 368 * If the remaining elements to compare are 369 * smaller than the period, readjust the size 370 * of the comparison for the last iteration. 371 */ 372 if (len - idx < period) 373 size = len - idx; 374 } 375 } 376 377 return -1; 378 } 379 380 static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) 381 { 382 int index, i, period_max, count, start, min = INT_MAX; 383 384 if ((now - irqs->last_ts) >= NSEC_PER_SEC) { 385 irqs->count = irqs->last_ts = 0; 386 return U64_MAX; 387 } 388 389 /* 390 * As we want to find three times the repetition, we need a 391 * number of intervals greater or equal to three times the 392 * maximum period, otherwise we truncate the max period. 393 */ 394 period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? 395 PREDICTION_PERIOD_MAX : irqs->count / 3; 396 397 /* 398 * If we don't have enough irq timings for this prediction, 399 * just bail out. 400 */ 401 if (period_max <= PREDICTION_PERIOD_MIN) 402 return U64_MAX; 403 404 /* 405 * 'count' will depends if the circular buffer wrapped or not 406 */ 407 count = irqs->count < IRQ_TIMINGS_SIZE ? 408 irqs->count : IRQ_TIMINGS_SIZE; 409 410 start = irqs->count < IRQ_TIMINGS_SIZE ? 411 0 : (irqs->count & IRQ_TIMINGS_MASK); 412 413 /* 414 * Copy the content of the circular buffer into another buffer 415 * in order to linearize the buffer instead of dealing with 416 * wrapping indexes and shifted array which will be prone to 417 * error and extremelly difficult to debug. 418 */ 419 for (i = 0; i < count; i++) { 420 int index = (start + i) & IRQ_TIMINGS_MASK; 421 422 irqs->timings[i] = irqs->circ_timings[index]; 423 min = min_t(int, irqs->timings[i], min); 424 } 425 426 index = irq_timings_next_event_index(irqs->timings, count, period_max); 427 if (index < 0) 428 return irqs->last_ts + irqs->ema_time[min]; 429 430 return irqs->last_ts + irqs->ema_time[index]; 431 } 432 433 static __always_inline int irq_timings_interval_index(u64 interval) 434 { 435 /* 436 * The PREDICTION_FACTOR increase the interval size for the 437 * array of exponential average. 438 */ 439 u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; 440 441 return likely(interval_us) ? ilog2(interval_us) : 0; 442 } 443 444 static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, 445 u64 interval) 446 { 447 int index; 448 449 /* 450 * Get the index in the ema table for this interrupt. 451 */ 452 index = irq_timings_interval_index(interval); 453 454 /* 455 * Store the index as an element of the pattern in another 456 * circular array. 457 */ 458 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; 459 460 irqs->ema_time[index] = irq_timings_ema_new(interval, 461 irqs->ema_time[index]); 462 463 irqs->count++; 464 } 465 466 static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) 467 { 468 u64 old_ts = irqs->last_ts; 469 u64 interval; 470 471 /* 472 * The timestamps are absolute time values, we need to compute 473 * the timing interval between two interrupts. 474 */ 475 irqs->last_ts = ts; 476 477 /* 478 * The interval type is u64 in order to deal with the same 479 * type in our computation, that prevent mindfuck issues with 480 * overflow, sign and division. 481 */ 482 interval = ts - old_ts; 483 484 /* 485 * The interrupt triggered more than one second apart, that 486 * ends the sequence as predictible for our purpose. In this 487 * case, assume we have the beginning of a sequence and the 488 * timestamp is the first value. As it is impossible to 489 * predict anything at this point, return. 490 * 491 * Note the first timestamp of the sequence will always fall 492 * in this test because the old_ts is zero. That is what we 493 * want as we need another timestamp to compute an interval. 494 */ 495 if (interval >= NSEC_PER_SEC) { 496 irqs->count = 0; 497 return; 498 } 499 500 __irq_timings_store(irq, irqs, interval); 501 } 502 503 /** 504 * irq_timings_next_event - Return when the next event is supposed to arrive 505 * 506 * During the last busy cycle, the number of interrupts is incremented 507 * and stored in the irq_timings structure. This information is 508 * necessary to: 509 * 510 * - know if the index in the table wrapped up: 511 * 512 * If more than the array size interrupts happened during the 513 * last busy/idle cycle, the index wrapped up and we have to 514 * begin with the next element in the array which is the last one 515 * in the sequence, otherwise it is a the index 0. 516 * 517 * - have an indication of the interrupts activity on this CPU 518 * (eg. irq/sec) 519 * 520 * The values are 'consumed' after inserting in the statistical model, 521 * thus the count is reinitialized. 522 * 523 * The array of values **must** be browsed in the time direction, the 524 * timestamp must increase between an element and the next one. 525 * 526 * Returns a nanosec time based estimation of the earliest interrupt, 527 * U64_MAX otherwise. 528 */ 529 u64 irq_timings_next_event(u64 now) 530 { 531 struct irq_timings *irqts = this_cpu_ptr(&irq_timings); 532 struct irqt_stat *irqs; 533 struct irqt_stat __percpu *s; 534 u64 ts, next_evt = U64_MAX; 535 int i, irq = 0; 536 537 /* 538 * This function must be called with the local irq disabled in 539 * order to prevent the timings circular buffer to be updated 540 * while we are reading it. 541 */ 542 lockdep_assert_irqs_disabled(); 543 544 if (!irqts->count) 545 return next_evt; 546 547 /* 548 * Number of elements in the circular buffer: If it happens it 549 * was flushed before, then the number of elements could be 550 * smaller than IRQ_TIMINGS_SIZE, so the count is used, 551 * otherwise the array size is used as we wrapped. The index 552 * begins from zero when we did not wrap. That could be done 553 * in a nicer way with the proper circular array structure 554 * type but with the cost of extra computation in the 555 * interrupt handler hot path. We choose efficiency. 556 * 557 * Inject measured irq/timestamp to the pattern prediction 558 * model while decrementing the counter because we consume the 559 * data from our circular buffer. 560 */ 561 for_each_irqts(i, irqts) { 562 irq = irq_timing_decode(irqts->values[i], &ts); 563 s = idr_find(&irqt_stats, irq); 564 if (s) 565 irq_timings_store(irq, this_cpu_ptr(s), ts); 566 } 567 568 /* 569 * Look in the list of interrupts' statistics, the earliest 570 * next event. 571 */ 572 idr_for_each_entry(&irqt_stats, s, i) { 573 574 irqs = this_cpu_ptr(s); 575 576 ts = __irq_timings_next_event(irqs, i, now); 577 if (ts <= now) 578 return now; 579 580 if (ts < next_evt) 581 next_evt = ts; 582 } 583 584 return next_evt; 585 } 586 587 void irq_timings_free(int irq) 588 { 589 struct irqt_stat __percpu *s; 590 591 s = idr_find(&irqt_stats, irq); 592 if (s) { 593 free_percpu(s); 594 idr_remove(&irqt_stats, irq); 595 } 596 } 597 598 int irq_timings_alloc(int irq) 599 { 600 struct irqt_stat __percpu *s; 601 int id; 602 603 /* 604 * Some platforms can have the same private interrupt per cpu, 605 * so this function may be be called several times with the 606 * same interrupt number. Just bail out in case the per cpu 607 * stat structure is already allocated. 608 */ 609 s = idr_find(&irqt_stats, irq); 610 if (s) 611 return 0; 612 613 s = alloc_percpu(*s); 614 if (!s) 615 return -ENOMEM; 616 617 idr_preload(GFP_KERNEL); 618 id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); 619 idr_preload_end(); 620 621 if (id < 0) { 622 free_percpu(s); 623 return id; 624 } 625 626 return 0; 627 } 628