1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Historical Service Time 4 * 5 * Keeps a time-weighted exponential moving average of the historical 6 * service time. Estimates future service time based on the historical 7 * service time and the number of outstanding requests. 8 * 9 * Marks paths stale if they have not finished within hst * 10 * num_paths. If a path is stale and unused, we will send a single 11 * request to probe in case the path has improved. This situation 12 * generally arises if the path is so much worse than others that it 13 * will never have the best estimated service time, or if the entire 14 * multipath device is unused. If a path is stale and in use, limit the 15 * number of requests it can receive with the assumption that the path 16 * has become degraded. 17 * 18 * To avoid repeatedly calculating exponents for time weighting, times 19 * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) 20 * ns, and the weighting is pre-calculated. 21 * 22 */ 23 24 #include "dm.h" 25 #include "dm-path-selector.h" 26 27 #include <linux/blkdev.h> 28 #include <linux/slab.h> 29 #include <linux/module.h> 30 #include <linux/sched/clock.h> 31 32 33 #define DM_MSG_PREFIX "multipath historical-service-time" 34 #define HST_MIN_IO 1 35 #define HST_VERSION "0.1.1" 36 37 #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ 38 #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) 39 #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) 40 #define HST_FIXED_95 972 41 #define HST_MAX_INFLIGHT HST_FIXED_1 42 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ 43 #define HST_WEIGHT_COUNT 64ULL 44 45 struct selector { 46 struct list_head valid_paths; 47 struct list_head failed_paths; 48 int valid_count; 49 spinlock_t lock; 50 51 unsigned int weights[HST_WEIGHT_COUNT]; 52 unsigned int threshold_multiplier; 53 }; 54 55 struct path_info { 56 struct list_head list; 57 struct dm_path *path; 58 unsigned int repeat_count; 59 60 spinlock_t lock; 61 62 u64 historical_service_time; /* Fixed point */ 63 64 u64 stale_after; 65 u64 last_finish; 66 67 u64 outstanding; 68 }; 69 70 /** 71 * fixed_power - compute: x^n, in O(log n) time 72 * 73 * @x: base of the power 74 * @frac_bits: fractional bits of @x 75 * @n: power to raise @x to. 76 * 77 * By exploiting the relation between the definition of the natural power 78 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 79 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 80 * (where: n_i \elem {0, 1}, the binary vector representing n), 81 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 82 * of course trivially computable in O(log_2 n), the length of our binary 83 * vector. 84 * 85 * (see: kernel/sched/loadavg.c) 86 */ 87 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) 88 { 89 unsigned long result = 1UL << frac_bits; 90 91 if (n) { 92 for (;;) { 93 if (n & 1) { 94 result *= x; 95 result += 1UL << (frac_bits - 1); 96 result >>= frac_bits; 97 } 98 n >>= 1; 99 if (!n) 100 break; 101 x *= x; 102 x += 1UL << (frac_bits - 1); 103 x >>= frac_bits; 104 } 105 } 106 107 return result; 108 } 109 110 /* 111 * Calculate the next value of an exponential moving average 112 * a_1 = a_0 * e + a * (1 - e) 113 * 114 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 115 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 116 * @weight: [0, HST_FIXED_1] 117 * 118 * Note: 119 * To account for multiple periods in the same calculation, 120 * a_n = a_0 * e^n + a * (1 - e^n), 121 * so call fixed_ema(last, next, pow(weight, N)) 122 */ 123 static u64 fixed_ema(u64 last, u64 next, u64 weight) 124 { 125 last *= weight; 126 last += next * (HST_FIXED_1 - weight); 127 last += 1ULL << (HST_FIXED_SHIFT - 1); 128 return last >> HST_FIXED_SHIFT; 129 } 130 131 static struct selector *alloc_selector(void) 132 { 133 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 134 135 if (s) { 136 INIT_LIST_HEAD(&s->valid_paths); 137 INIT_LIST_HEAD(&s->failed_paths); 138 spin_lock_init(&s->lock); 139 s->valid_count = 0; 140 } 141 142 return s; 143 } 144 145 /* 146 * Get the weight for a given time span. 147 */ 148 static u64 hst_weight(struct path_selector *ps, u64 delta) 149 { 150 struct selector *s = ps->context; 151 int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, 152 HST_WEIGHT_COUNT - 1); 153 154 return s->weights[bucket]; 155 } 156 157 /* 158 * Set up the weights array. 159 * 160 * weights[len-1] = 0 161 * weights[n] = base ^ (n + 1) 162 */ 163 static void hst_set_weights(struct path_selector *ps, unsigned int base) 164 { 165 struct selector *s = ps->context; 166 int i; 167 168 if (base >= HST_FIXED_1) 169 return; 170 171 for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) 172 s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); 173 s->weights[HST_WEIGHT_COUNT - 1] = 0; 174 } 175 176 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) 177 { 178 struct selector *s; 179 unsigned int base_weight = HST_FIXED_95; 180 unsigned int threshold_multiplier = 0; 181 char dummy; 182 183 /* 184 * Arguments: [<base_weight> [<threshold_multiplier>]] 185 * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A 186 * value of 0 will completely ignore any history. 187 * If not given, default (HST_FIXED_95) is used. 188 * <threshold_multiplier>: Minimum threshold multiplier for paths to 189 * be considered different. That is, a path is 190 * considered different iff (p1 > N * p2) where p1 191 * is the path with higher service time. A threshold 192 * of 1 or 0 has no effect. Defaults to 0. 193 */ 194 if (argc > 2) 195 return -EINVAL; 196 197 if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || 198 base_weight >= HST_FIXED_1)) { 199 return -EINVAL; 200 } 201 202 if (argc > 1 && (sscanf(argv[1], "%u%c", 203 &threshold_multiplier, &dummy) != 1)) { 204 return -EINVAL; 205 } 206 207 s = alloc_selector(); 208 if (!s) 209 return -ENOMEM; 210 211 ps->context = s; 212 213 hst_set_weights(ps, base_weight); 214 s->threshold_multiplier = threshold_multiplier; 215 return 0; 216 } 217 218 static void free_paths(struct list_head *paths) 219 { 220 struct path_info *pi, *next; 221 222 list_for_each_entry_safe(pi, next, paths, list) { 223 list_del(&pi->list); 224 kfree(pi); 225 } 226 } 227 228 static void hst_destroy(struct path_selector *ps) 229 { 230 struct selector *s = ps->context; 231 232 free_paths(&s->valid_paths); 233 free_paths(&s->failed_paths); 234 kfree(s); 235 ps->context = NULL; 236 } 237 238 static int hst_status(struct path_selector *ps, struct dm_path *path, 239 status_type_t type, char *result, unsigned int maxlen) 240 { 241 unsigned int sz = 0; 242 struct path_info *pi; 243 244 if (!path) { 245 struct selector *s = ps->context; 246 247 DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); 248 } else { 249 pi = path->pscontext; 250 251 switch (type) { 252 case STATUSTYPE_INFO: 253 DMEMIT("%llu %llu %llu ", pi->historical_service_time, 254 pi->outstanding, pi->stale_after); 255 break; 256 case STATUSTYPE_TABLE: 257 DMEMIT("0 "); 258 break; 259 case STATUSTYPE_IMA: 260 *result = '\0'; 261 break; 262 } 263 } 264 265 return sz; 266 } 267 268 static int hst_add_path(struct path_selector *ps, struct dm_path *path, 269 int argc, char **argv, char **error) 270 { 271 struct selector *s = ps->context; 272 struct path_info *pi; 273 unsigned int repeat_count = HST_MIN_IO; 274 char dummy; 275 unsigned long flags; 276 277 /* 278 * Arguments: [<repeat_count>] 279 * <repeat_count>: The number of I/Os before switching path. 280 * If not given, default (HST_MIN_IO) is used. 281 */ 282 if (argc > 1) { 283 *error = "historical-service-time ps: incorrect number of arguments"; 284 return -EINVAL; 285 } 286 287 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 288 *error = "historical-service-time ps: invalid repeat count"; 289 return -EINVAL; 290 } 291 292 /* allocate the path */ 293 pi = kmalloc(sizeof(*pi), GFP_KERNEL); 294 if (!pi) { 295 *error = "historical-service-time ps: Error allocating path context"; 296 return -ENOMEM; 297 } 298 299 pi->path = path; 300 pi->repeat_count = repeat_count; 301 302 pi->historical_service_time = HST_FIXED_1; 303 304 spin_lock_init(&pi->lock); 305 pi->outstanding = 0; 306 307 pi->stale_after = 0; 308 pi->last_finish = 0; 309 310 path->pscontext = pi; 311 312 spin_lock_irqsave(&s->lock, flags); 313 list_add_tail(&pi->list, &s->valid_paths); 314 s->valid_count++; 315 spin_unlock_irqrestore(&s->lock, flags); 316 317 return 0; 318 } 319 320 static void hst_fail_path(struct path_selector *ps, struct dm_path *path) 321 { 322 struct selector *s = ps->context; 323 struct path_info *pi = path->pscontext; 324 unsigned long flags; 325 326 spin_lock_irqsave(&s->lock, flags); 327 list_move(&pi->list, &s->failed_paths); 328 s->valid_count--; 329 spin_unlock_irqrestore(&s->lock, flags); 330 } 331 332 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) 333 { 334 struct selector *s = ps->context; 335 struct path_info *pi = path->pscontext; 336 unsigned long flags; 337 338 spin_lock_irqsave(&s->lock, flags); 339 list_move_tail(&pi->list, &s->valid_paths); 340 s->valid_count++; 341 spin_unlock_irqrestore(&s->lock, flags); 342 343 return 0; 344 } 345 346 static void hst_fill_compare(struct path_info *pi, u64 *hst, 347 u64 *out, u64 *stale) 348 { 349 unsigned long flags; 350 351 spin_lock_irqsave(&pi->lock, flags); 352 *hst = pi->historical_service_time; 353 *out = pi->outstanding; 354 *stale = pi->stale_after; 355 spin_unlock_irqrestore(&pi->lock, flags); 356 } 357 358 /* 359 * Compare the estimated service time of 2 paths, pi1 and pi2, 360 * for the incoming I/O. 361 * 362 * Returns: 363 * < 0 : pi1 is better 364 * 0 : no difference between pi1 and pi2 365 * > 0 : pi2 is better 366 * 367 */ 368 static long long hst_compare(struct path_info *pi1, struct path_info *pi2, 369 u64 time_now, struct path_selector *ps) 370 { 371 struct selector *s = ps->context; 372 u64 hst1, hst2; 373 long long out1, out2, stale1, stale2; 374 int pi2_better, over_threshold; 375 376 hst_fill_compare(pi1, &hst1, &out1, &stale1); 377 hst_fill_compare(pi2, &hst2, &out2, &stale2); 378 379 /* Check here if estimated latency for two paths are too similar. 380 * If this is the case, we skip extra calculation and just compare 381 * outstanding requests. In this case, any unloaded paths will 382 * be preferred. 383 */ 384 if (hst1 > hst2) 385 over_threshold = hst1 > (s->threshold_multiplier * hst2); 386 else 387 over_threshold = hst2 > (s->threshold_multiplier * hst1); 388 389 if (!over_threshold) 390 return out1 - out2; 391 392 /* 393 * If an unloaded path is stale, choose it. If both paths are unloaded, 394 * choose path that is the most stale. 395 * (If one path is loaded, choose the other) 396 */ 397 if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || 398 (!out1 && !out2)) 399 return (!out2 * stale1) - (!out1 * stale2); 400 401 /* Compare estimated service time. If outstanding is the same, we 402 * don't need to multiply 403 */ 404 if (out1 == out2) { 405 pi2_better = hst1 > hst2; 406 } else { 407 /* Potential overflow with out >= 1024 */ 408 if (unlikely(out1 >= HST_MAX_INFLIGHT || 409 out2 >= HST_MAX_INFLIGHT)) { 410 /* If over 1023 in-flights, we may overflow if hst 411 * is at max. (With this shift we still overflow at 412 * 1048576 in-flights, which is high enough). 413 */ 414 hst1 >>= HST_FIXED_SHIFT; 415 hst2 >>= HST_FIXED_SHIFT; 416 } 417 pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; 418 } 419 420 /* In the case that the 'winner' is stale, limit to equal usage. */ 421 if (pi2_better) { 422 if (stale2 < time_now) 423 return out1 - out2; 424 return 1; 425 } 426 if (stale1 < time_now) 427 return out1 - out2; 428 return -1; 429 } 430 431 static struct dm_path *hst_select_path(struct path_selector *ps, 432 size_t nr_bytes) 433 { 434 struct selector *s = ps->context; 435 struct path_info *pi = NULL, *best = NULL; 436 u64 time_now = sched_clock(); 437 struct dm_path *ret = NULL; 438 unsigned long flags; 439 440 spin_lock_irqsave(&s->lock, flags); 441 if (list_empty(&s->valid_paths)) 442 goto out; 443 444 list_for_each_entry(pi, &s->valid_paths, list) { 445 if (!best || (hst_compare(pi, best, time_now, ps) < 0)) 446 best = pi; 447 } 448 449 if (!best) 450 goto out; 451 452 /* Move last used path to end (least preferred in case of ties) */ 453 list_move_tail(&best->list, &s->valid_paths); 454 455 ret = best->path; 456 457 out: 458 spin_unlock_irqrestore(&s->lock, flags); 459 return ret; 460 } 461 462 static int hst_start_io(struct path_selector *ps, struct dm_path *path, 463 size_t nr_bytes) 464 { 465 struct path_info *pi = path->pscontext; 466 unsigned long flags; 467 468 spin_lock_irqsave(&pi->lock, flags); 469 pi->outstanding++; 470 spin_unlock_irqrestore(&pi->lock, flags); 471 472 return 0; 473 } 474 475 static u64 path_service_time(struct path_info *pi, u64 start_time) 476 { 477 u64 sched_now = ktime_get_ns(); 478 479 /* if a previous disk request has finished after this IO was 480 * sent to the hardware, pretend the submission happened 481 * serially. 482 */ 483 if (time_after64(pi->last_finish, start_time)) 484 start_time = pi->last_finish; 485 486 pi->last_finish = sched_now; 487 if (time_before64(sched_now, start_time)) 488 return 0; 489 490 return sched_now - start_time; 491 } 492 493 static int hst_end_io(struct path_selector *ps, struct dm_path *path, 494 size_t nr_bytes, u64 start_time) 495 { 496 struct path_info *pi = path->pscontext; 497 struct selector *s = ps->context; 498 unsigned long flags; 499 u64 st; 500 501 spin_lock_irqsave(&pi->lock, flags); 502 503 st = path_service_time(pi, start_time); 504 pi->outstanding--; 505 pi->historical_service_time = 506 fixed_ema(pi->historical_service_time, 507 min(st * HST_FIXED_1, HST_FIXED_MAX), 508 hst_weight(ps, st)); 509 510 /* 511 * On request end, mark path as fresh. If a path hasn't 512 * finished any requests within the fresh period, the estimated 513 * service time is considered too optimistic and we limit the 514 * maximum requests on that path. 515 */ 516 pi->stale_after = pi->last_finish + 517 (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); 518 519 spin_unlock_irqrestore(&pi->lock, flags); 520 521 return 0; 522 } 523 524 static struct path_selector_type hst_ps = { 525 .name = "historical-service-time", 526 .module = THIS_MODULE, 527 .table_args = 1, 528 .info_args = 3, 529 .create = hst_create, 530 .destroy = hst_destroy, 531 .status = hst_status, 532 .add_path = hst_add_path, 533 .fail_path = hst_fail_path, 534 .reinstate_path = hst_reinstate_path, 535 .select_path = hst_select_path, 536 .start_io = hst_start_io, 537 .end_io = hst_end_io, 538 }; 539 540 static int __init dm_hst_init(void) 541 { 542 int r = dm_register_path_selector(&hst_ps); 543 544 if (r < 0) 545 DMERR("register failed %d", r); 546 547 DMINFO("version " HST_VERSION " loaded"); 548 549 return r; 550 } 551 552 static void __exit dm_hst_exit(void) 553 { 554 int r = dm_unregister_path_selector(&hst_ps); 555 556 if (r < 0) 557 DMERR("unregister failed %d", r); 558 } 559 560 module_init(dm_hst_init); 561 module_exit(dm_hst_exit); 562 563 MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); 564 MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); 565 MODULE_LICENSE("GPL"); 566