1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Historical Service Time 4 * 5 * Keeps a time-weighted exponential moving average of the historical 6 * service time. Estimates future service time based on the historical 7 * service time and the number of outstanding requests. 8 * 9 * Marks paths stale if they have not finished within hst * 10 * num_paths. If a path is stale and unused, we will send a single 11 * request to probe in case the path has improved. This situation 12 * generally arises if the path is so much worse than others that it 13 * will never have the best estimated service time, or if the entire 14 * multipath device is unused. If a path is stale and in use, limit the 15 * number of requests it can receive with the assumption that the path 16 * has become degraded. 17 * 18 * To avoid repeatedly calculating exponents for time weighting, times 19 * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) 20 * ns, and the weighting is pre-calculated. 21 * 22 */ 23 24 #include "dm.h" 25 #include "dm-path-selector.h" 26 27 #include <linux/blkdev.h> 28 #include <linux/slab.h> 29 #include <linux/module.h> 30 31 32 #define DM_MSG_PREFIX "multipath historical-service-time" 33 #define HST_MIN_IO 1 34 #define HST_VERSION "0.1.1" 35 36 #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ 37 #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) 38 #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) 39 #define HST_FIXED_95 972 40 #define HST_MAX_INFLIGHT HST_FIXED_1 41 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ 42 #define HST_WEIGHT_COUNT 64ULL 43 44 struct selector { 45 struct list_head valid_paths; 46 struct list_head failed_paths; 47 int valid_count; 48 spinlock_t lock; 49 50 unsigned int weights[HST_WEIGHT_COUNT]; 51 unsigned int threshold_multiplier; 52 }; 53 54 struct path_info { 55 struct list_head list; 56 struct dm_path *path; 57 unsigned int repeat_count; 58 59 spinlock_t lock; 60 61 u64 historical_service_time; /* Fixed point */ 62 63 u64 stale_after; 64 u64 last_finish; 65 66 u64 outstanding; 67 }; 68 69 /** 70 * fixed_power - compute: x^n, in O(log n) time 71 * 72 * @x: base of the power 73 * @frac_bits: fractional bits of @x 74 * @n: power to raise @x to. 75 * 76 * By exploiting the relation between the definition of the natural power 77 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 78 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 79 * (where: n_i \elem {0, 1}, the binary vector representing n), 80 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 81 * of course trivially computable in O(log_2 n), the length of our binary 82 * vector. 83 * 84 * (see: kernel/sched/loadavg.c) 85 */ 86 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) 87 { 88 unsigned long result = 1UL << frac_bits; 89 90 if (n) { 91 for (;;) { 92 if (n & 1) { 93 result *= x; 94 result += 1UL << (frac_bits - 1); 95 result >>= frac_bits; 96 } 97 n >>= 1; 98 if (!n) 99 break; 100 x *= x; 101 x += 1UL << (frac_bits - 1); 102 x >>= frac_bits; 103 } 104 } 105 106 return result; 107 } 108 109 /* 110 * Calculate the next value of an exponential moving average 111 * a_1 = a_0 * e + a * (1 - e) 112 * 113 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 114 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 115 * @weight: [0, HST_FIXED_1] 116 * 117 * Note: 118 * To account for multiple periods in the same calculation, 119 * a_n = a_0 * e^n + a * (1 - e^n), 120 * so call fixed_ema(last, next, pow(weight, N)) 121 */ 122 static u64 fixed_ema(u64 last, u64 next, u64 weight) 123 { 124 last *= weight; 125 last += next * (HST_FIXED_1 - weight); 126 last += 1ULL << (HST_FIXED_SHIFT - 1); 127 return last >> HST_FIXED_SHIFT; 128 } 129 130 static struct selector *alloc_selector(void) 131 { 132 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 133 134 if (s) { 135 INIT_LIST_HEAD(&s->valid_paths); 136 INIT_LIST_HEAD(&s->failed_paths); 137 spin_lock_init(&s->lock); 138 s->valid_count = 0; 139 } 140 141 return s; 142 } 143 144 /* 145 * Get the weight for a given time span. 146 */ 147 static u64 hst_weight(struct path_selector *ps, u64 delta) 148 { 149 struct selector *s = ps->context; 150 int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, 151 HST_WEIGHT_COUNT - 1); 152 153 return s->weights[bucket]; 154 } 155 156 /* 157 * Set up the weights array. 158 * 159 * weights[len-1] = 0 160 * weights[n] = base ^ (n + 1) 161 */ 162 static void hst_set_weights(struct path_selector *ps, unsigned int base) 163 { 164 struct selector *s = ps->context; 165 int i; 166 167 if (base >= HST_FIXED_1) 168 return; 169 170 for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) 171 s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); 172 s->weights[HST_WEIGHT_COUNT - 1] = 0; 173 } 174 175 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) 176 { 177 struct selector *s; 178 unsigned int base_weight = HST_FIXED_95; 179 unsigned int threshold_multiplier = 0; 180 char dummy; 181 182 /* 183 * Arguments: [<base_weight> [<threshold_multiplier>]] 184 * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A 185 * value of 0 will completely ignore any history. 186 * If not given, default (HST_FIXED_95) is used. 187 * <threshold_multiplier>: Minimum threshold multiplier for paths to 188 * be considered different. That is, a path is 189 * considered different iff (p1 > N * p2) where p1 190 * is the path with higher service time. A threshold 191 * of 1 or 0 has no effect. Defaults to 0. 192 */ 193 if (argc > 2) 194 return -EINVAL; 195 196 if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || 197 base_weight >= HST_FIXED_1)) { 198 return -EINVAL; 199 } 200 201 if (argc > 1 && (sscanf(argv[1], "%u%c", 202 &threshold_multiplier, &dummy) != 1)) { 203 return -EINVAL; 204 } 205 206 s = alloc_selector(); 207 if (!s) 208 return -ENOMEM; 209 210 ps->context = s; 211 212 hst_set_weights(ps, base_weight); 213 s->threshold_multiplier = threshold_multiplier; 214 return 0; 215 } 216 217 static void free_paths(struct list_head *paths) 218 { 219 struct path_info *pi, *next; 220 221 list_for_each_entry_safe(pi, next, paths, list) { 222 list_del(&pi->list); 223 kfree(pi); 224 } 225 } 226 227 static void hst_destroy(struct path_selector *ps) 228 { 229 struct selector *s = ps->context; 230 231 free_paths(&s->valid_paths); 232 free_paths(&s->failed_paths); 233 kfree(s); 234 ps->context = NULL; 235 } 236 237 static int hst_status(struct path_selector *ps, struct dm_path *path, 238 status_type_t type, char *result, unsigned int maxlen) 239 { 240 unsigned int sz = 0; 241 struct path_info *pi; 242 243 if (!path) { 244 struct selector *s = ps->context; 245 246 DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); 247 } else { 248 pi = path->pscontext; 249 250 switch (type) { 251 case STATUSTYPE_INFO: 252 DMEMIT("%llu %llu %llu ", pi->historical_service_time, 253 pi->outstanding, pi->stale_after); 254 break; 255 case STATUSTYPE_TABLE: 256 DMEMIT("0 "); 257 break; 258 case STATUSTYPE_IMA: 259 *result = '\0'; 260 break; 261 } 262 } 263 264 return sz; 265 } 266 267 static int hst_add_path(struct path_selector *ps, struct dm_path *path, 268 int argc, char **argv, char **error) 269 { 270 struct selector *s = ps->context; 271 struct path_info *pi; 272 unsigned int repeat_count = HST_MIN_IO; 273 char dummy; 274 unsigned long flags; 275 276 /* 277 * Arguments: [<repeat_count>] 278 * <repeat_count>: The number of I/Os before switching path. 279 * If not given, default (HST_MIN_IO) is used. 280 */ 281 if (argc > 1) { 282 *error = "historical-service-time ps: incorrect number of arguments"; 283 return -EINVAL; 284 } 285 286 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 287 *error = "historical-service-time ps: invalid repeat count"; 288 return -EINVAL; 289 } 290 291 /* allocate the path */ 292 pi = kmalloc(sizeof(*pi), GFP_KERNEL); 293 if (!pi) { 294 *error = "historical-service-time ps: Error allocating path context"; 295 return -ENOMEM; 296 } 297 298 pi->path = path; 299 pi->repeat_count = repeat_count; 300 301 pi->historical_service_time = HST_FIXED_1; 302 303 spin_lock_init(&pi->lock); 304 pi->outstanding = 0; 305 306 pi->stale_after = 0; 307 pi->last_finish = 0; 308 309 path->pscontext = pi; 310 311 spin_lock_irqsave(&s->lock, flags); 312 list_add_tail(&pi->list, &s->valid_paths); 313 s->valid_count++; 314 spin_unlock_irqrestore(&s->lock, flags); 315 316 return 0; 317 } 318 319 static void hst_fail_path(struct path_selector *ps, struct dm_path *path) 320 { 321 struct selector *s = ps->context; 322 struct path_info *pi = path->pscontext; 323 unsigned long flags; 324 325 spin_lock_irqsave(&s->lock, flags); 326 list_move(&pi->list, &s->failed_paths); 327 s->valid_count--; 328 spin_unlock_irqrestore(&s->lock, flags); 329 } 330 331 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) 332 { 333 struct selector *s = ps->context; 334 struct path_info *pi = path->pscontext; 335 unsigned long flags; 336 337 spin_lock_irqsave(&s->lock, flags); 338 list_move_tail(&pi->list, &s->valid_paths); 339 s->valid_count++; 340 spin_unlock_irqrestore(&s->lock, flags); 341 342 return 0; 343 } 344 345 static void hst_fill_compare(struct path_info *pi, u64 *hst, 346 u64 *out, u64 *stale) 347 { 348 unsigned long flags; 349 350 spin_lock_irqsave(&pi->lock, flags); 351 *hst = pi->historical_service_time; 352 *out = pi->outstanding; 353 *stale = pi->stale_after; 354 spin_unlock_irqrestore(&pi->lock, flags); 355 } 356 357 /* 358 * Compare the estimated service time of 2 paths, pi1 and pi2, 359 * for the incoming I/O. 360 * 361 * Returns: 362 * < 0 : pi1 is better 363 * 0 : no difference between pi1 and pi2 364 * > 0 : pi2 is better 365 * 366 */ 367 static long long hst_compare(struct path_info *pi1, struct path_info *pi2, 368 u64 time_now, struct path_selector *ps) 369 { 370 struct selector *s = ps->context; 371 u64 hst1, hst2; 372 long long out1, out2, stale1, stale2; 373 int pi2_better, over_threshold; 374 375 hst_fill_compare(pi1, &hst1, &out1, &stale1); 376 hst_fill_compare(pi2, &hst2, &out2, &stale2); 377 378 /* Check here if estimated latency for two paths are too similar. 379 * If this is the case, we skip extra calculation and just compare 380 * outstanding requests. In this case, any unloaded paths will 381 * be preferred. 382 */ 383 if (hst1 > hst2) 384 over_threshold = hst1 > (s->threshold_multiplier * hst2); 385 else 386 over_threshold = hst2 > (s->threshold_multiplier * hst1); 387 388 if (!over_threshold) 389 return out1 - out2; 390 391 /* 392 * If an unloaded path is stale, choose it. If both paths are unloaded, 393 * choose path that is the most stale. 394 * (If one path is loaded, choose the other) 395 */ 396 if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || 397 (!out1 && !out2)) 398 return (!out2 * stale1) - (!out1 * stale2); 399 400 /* Compare estimated service time. If outstanding is the same, we 401 * don't need to multiply 402 */ 403 if (out1 == out2) { 404 pi2_better = hst1 > hst2; 405 } else { 406 /* Potential overflow with out >= 1024 */ 407 if (unlikely(out1 >= HST_MAX_INFLIGHT || 408 out2 >= HST_MAX_INFLIGHT)) { 409 /* If over 1023 in-flights, we may overflow if hst 410 * is at max. (With this shift we still overflow at 411 * 1048576 in-flights, which is high enough). 412 */ 413 hst1 >>= HST_FIXED_SHIFT; 414 hst2 >>= HST_FIXED_SHIFT; 415 } 416 pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; 417 } 418 419 /* In the case that the 'winner' is stale, limit to equal usage. */ 420 if (pi2_better) { 421 if (stale2 < time_now) 422 return out1 - out2; 423 return 1; 424 } 425 if (stale1 < time_now) 426 return out1 - out2; 427 return -1; 428 } 429 430 static struct dm_path *hst_select_path(struct path_selector *ps, 431 size_t nr_bytes) 432 { 433 struct selector *s = ps->context; 434 struct path_info *pi = NULL, *best = NULL; 435 u64 time_now = sched_clock(); 436 struct dm_path *ret = NULL; 437 unsigned long flags; 438 439 spin_lock_irqsave(&s->lock, flags); 440 if (list_empty(&s->valid_paths)) 441 goto out; 442 443 list_for_each_entry(pi, &s->valid_paths, list) { 444 if (!best || (hst_compare(pi, best, time_now, ps) < 0)) 445 best = pi; 446 } 447 448 if (!best) 449 goto out; 450 451 /* Move last used path to end (least preferred in case of ties) */ 452 list_move_tail(&best->list, &s->valid_paths); 453 454 ret = best->path; 455 456 out: 457 spin_unlock_irqrestore(&s->lock, flags); 458 return ret; 459 } 460 461 static int hst_start_io(struct path_selector *ps, struct dm_path *path, 462 size_t nr_bytes) 463 { 464 struct path_info *pi = path->pscontext; 465 unsigned long flags; 466 467 spin_lock_irqsave(&pi->lock, flags); 468 pi->outstanding++; 469 spin_unlock_irqrestore(&pi->lock, flags); 470 471 return 0; 472 } 473 474 static u64 path_service_time(struct path_info *pi, u64 start_time) 475 { 476 u64 sched_now = ktime_get_ns(); 477 478 /* if a previous disk request has finished after this IO was 479 * sent to the hardware, pretend the submission happened 480 * serially. 481 */ 482 if (time_after64(pi->last_finish, start_time)) 483 start_time = pi->last_finish; 484 485 pi->last_finish = sched_now; 486 if (time_before64(sched_now, start_time)) 487 return 0; 488 489 return sched_now - start_time; 490 } 491 492 static int hst_end_io(struct path_selector *ps, struct dm_path *path, 493 size_t nr_bytes, u64 start_time) 494 { 495 struct path_info *pi = path->pscontext; 496 struct selector *s = ps->context; 497 unsigned long flags; 498 u64 st; 499 500 spin_lock_irqsave(&pi->lock, flags); 501 502 st = path_service_time(pi, start_time); 503 pi->outstanding--; 504 pi->historical_service_time = 505 fixed_ema(pi->historical_service_time, 506 min(st * HST_FIXED_1, HST_FIXED_MAX), 507 hst_weight(ps, st)); 508 509 /* 510 * On request end, mark path as fresh. If a path hasn't 511 * finished any requests within the fresh period, the estimated 512 * service time is considered too optimistic and we limit the 513 * maximum requests on that path. 514 */ 515 pi->stale_after = pi->last_finish + 516 (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); 517 518 spin_unlock_irqrestore(&pi->lock, flags); 519 520 return 0; 521 } 522 523 static struct path_selector_type hst_ps = { 524 .name = "historical-service-time", 525 .module = THIS_MODULE, 526 .table_args = 1, 527 .info_args = 3, 528 .create = hst_create, 529 .destroy = hst_destroy, 530 .status = hst_status, 531 .add_path = hst_add_path, 532 .fail_path = hst_fail_path, 533 .reinstate_path = hst_reinstate_path, 534 .select_path = hst_select_path, 535 .start_io = hst_start_io, 536 .end_io = hst_end_io, 537 }; 538 539 static int __init dm_hst_init(void) 540 { 541 int r = dm_register_path_selector(&hst_ps); 542 543 if (r < 0) 544 DMERR("register failed %d", r); 545 546 DMINFO("version " HST_VERSION " loaded"); 547 548 return r; 549 } 550 551 static void __exit dm_hst_exit(void) 552 { 553 int r = dm_unregister_path_selector(&hst_ps); 554 555 if (r < 0) 556 DMERR("unregister failed %d", r); 557 } 558 559 module_init(dm_hst_init); 560 module_exit(dm_hst_exit); 561 562 MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); 563 MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); 564 MODULE_LICENSE("GPL"); 565