1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Historical Service Time
4  *
5  *  Keeps a time-weighted exponential moving average of the historical
6  *  service time. Estimates future service time based on the historical
7  *  service time and the number of outstanding requests.
8  *
9  *  Marks paths stale if they have not finished within hst *
10  *  num_paths. If a path is stale and unused, we will send a single
11  *  request to probe in case the path has improved. This situation
12  *  generally arises if the path is so much worse than others that it
13  *  will never have the best estimated service time, or if the entire
14  *  multipath device is unused. If a path is stale and in use, limit the
15  *  number of requests it can receive with the assumption that the path
16  *  has become degraded.
17  *
18  *  To avoid repeatedly calculating exponents for time weighting, times
19  *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
20  *  ns, and the weighting is pre-calculated.
21  *
22  */
23 
24 #include "dm.h"
25 #include "dm-path-selector.h"
26 
27 #include <linux/blkdev.h>
28 #include <linux/slab.h>
29 #include <linux/module.h>
30 
31 
32 #define DM_MSG_PREFIX	"multipath historical-service-time"
33 #define HST_MIN_IO 1
34 #define HST_VERSION "0.1.1"
35 
36 #define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
37 #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
38 #define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
39 #define HST_FIXED_95 972
40 #define HST_MAX_INFLIGHT HST_FIXED_1
41 #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
42 #define HST_WEIGHT_COUNT 64ULL
43 
44 struct selector {
45 	struct list_head valid_paths;
46 	struct list_head failed_paths;
47 	int valid_count;
48 	spinlock_t lock;
49 
50 	unsigned int weights[HST_WEIGHT_COUNT];
51 	unsigned int threshold_multiplier;
52 };
53 
54 struct path_info {
55 	struct list_head list;
56 	struct dm_path *path;
57 	unsigned int repeat_count;
58 
59 	spinlock_t lock;
60 
61 	u64 historical_service_time; /* Fixed point */
62 
63 	u64 stale_after;
64 	u64 last_finish;
65 
66 	u64 outstanding;
67 };
68 
69 /**
70  * fixed_power - compute: x^n, in O(log n) time
71  *
72  * @x:         base of the power
73  * @frac_bits: fractional bits of @x
74  * @n:         power to raise @x to.
75  *
76  * By exploiting the relation between the definition of the natural power
77  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
78  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
79  * (where: n_i \elem {0, 1}, the binary vector representing n),
80  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
81  * of course trivially computable in O(log_2 n), the length of our binary
82  * vector.
83  *
84  * (see: kernel/sched/loadavg.c)
85  */
86 static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
87 {
88 	unsigned long result = 1UL << frac_bits;
89 
90 	if (n) {
91 		for (;;) {
92 			if (n & 1) {
93 				result *= x;
94 				result += 1UL << (frac_bits - 1);
95 				result >>= frac_bits;
96 			}
97 			n >>= 1;
98 			if (!n)
99 				break;
100 			x *= x;
101 			x += 1UL << (frac_bits - 1);
102 			x >>= frac_bits;
103 		}
104 	}
105 
106 	return result;
107 }
108 
109 /*
110  * Calculate the next value of an exponential moving average
111  * a_1 = a_0 * e + a * (1 - e)
112  *
113  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
114  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115  * @weight: [0, HST_FIXED_1]
116  *
117  * Note:
118  *   To account for multiple periods in the same calculation,
119  *   a_n = a_0 * e^n + a * (1 - e^n),
120  *   so call fixed_ema(last, next, pow(weight, N))
121  */
122 static u64 fixed_ema(u64 last, u64 next, u64 weight)
123 {
124 	last *= weight;
125 	last += next * (HST_FIXED_1 - weight);
126 	last += 1ULL << (HST_FIXED_SHIFT - 1);
127 	return last >> HST_FIXED_SHIFT;
128 }
129 
130 static struct selector *alloc_selector(void)
131 {
132 	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
133 
134 	if (s) {
135 		INIT_LIST_HEAD(&s->valid_paths);
136 		INIT_LIST_HEAD(&s->failed_paths);
137 		spin_lock_init(&s->lock);
138 		s->valid_count = 0;
139 	}
140 
141 	return s;
142 }
143 
144 /*
145  * Get the weight for a given time span.
146  */
147 static u64 hst_weight(struct path_selector *ps, u64 delta)
148 {
149 	struct selector *s = ps->context;
150 	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
151 			   HST_WEIGHT_COUNT - 1);
152 
153 	return s->weights[bucket];
154 }
155 
156 /*
157  * Set up the weights array.
158  *
159  * weights[len-1] = 0
160  * weights[n] = base ^ (n + 1)
161  */
162 static void hst_set_weights(struct path_selector *ps, unsigned int base)
163 {
164 	struct selector *s = ps->context;
165 	int i;
166 
167 	if (base >= HST_FIXED_1)
168 		return;
169 
170 	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
171 		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
172 	s->weights[HST_WEIGHT_COUNT - 1] = 0;
173 }
174 
175 static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
176 {
177 	struct selector *s;
178 	unsigned int base_weight = HST_FIXED_95;
179 	unsigned int threshold_multiplier = 0;
180 	char dummy;
181 
182 	/*
183 	 * Arguments: [<base_weight> [<threshold_multiplier>]]
184 	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
185 	 *                  value of 0 will completely ignore any history.
186 	 *                  If not given, default (HST_FIXED_95) is used.
187 	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
188 	 *                  be considered different. That is, a path is
189 	 *                  considered different iff (p1 > N * p2) where p1
190 	 *                  is the path with higher service time. A threshold
191 	 *                  of 1 or 0 has no effect. Defaults to 0.
192 	 */
193 	if (argc > 2)
194 		return -EINVAL;
195 
196 	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
197 	     base_weight >= HST_FIXED_1)) {
198 		return -EINVAL;
199 	}
200 
201 	if (argc > 1 && (sscanf(argv[1], "%u%c",
202 				&threshold_multiplier, &dummy) != 1)) {
203 		return -EINVAL;
204 	}
205 
206 	s = alloc_selector();
207 	if (!s)
208 		return -ENOMEM;
209 
210 	ps->context = s;
211 
212 	hst_set_weights(ps, base_weight);
213 	s->threshold_multiplier = threshold_multiplier;
214 	return 0;
215 }
216 
217 static void free_paths(struct list_head *paths)
218 {
219 	struct path_info *pi, *next;
220 
221 	list_for_each_entry_safe(pi, next, paths, list) {
222 		list_del(&pi->list);
223 		kfree(pi);
224 	}
225 }
226 
227 static void hst_destroy(struct path_selector *ps)
228 {
229 	struct selector *s = ps->context;
230 
231 	free_paths(&s->valid_paths);
232 	free_paths(&s->failed_paths);
233 	kfree(s);
234 	ps->context = NULL;
235 }
236 
237 static int hst_status(struct path_selector *ps, struct dm_path *path,
238 		     status_type_t type, char *result, unsigned int maxlen)
239 {
240 	unsigned int sz = 0;
241 	struct path_info *pi;
242 
243 	if (!path) {
244 		struct selector *s = ps->context;
245 
246 		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
247 	} else {
248 		pi = path->pscontext;
249 
250 		switch (type) {
251 		case STATUSTYPE_INFO:
252 			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
253 			       pi->outstanding, pi->stale_after);
254 			break;
255 		case STATUSTYPE_TABLE:
256 			DMEMIT("0 ");
257 			break;
258 		case STATUSTYPE_IMA:
259 			*result = '\0';
260 			break;
261 		}
262 	}
263 
264 	return sz;
265 }
266 
267 static int hst_add_path(struct path_selector *ps, struct dm_path *path,
268 		       int argc, char **argv, char **error)
269 {
270 	struct selector *s = ps->context;
271 	struct path_info *pi;
272 	unsigned int repeat_count = HST_MIN_IO;
273 	char dummy;
274 	unsigned long flags;
275 
276 	/*
277 	 * Arguments: [<repeat_count>]
278 	 *   <repeat_count>: The number of I/Os before switching path.
279 	 *                   If not given, default (HST_MIN_IO) is used.
280 	 */
281 	if (argc > 1) {
282 		*error = "historical-service-time ps: incorrect number of arguments";
283 		return -EINVAL;
284 	}
285 
286 	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
287 		*error = "historical-service-time ps: invalid repeat count";
288 		return -EINVAL;
289 	}
290 
291 	/* allocate the path */
292 	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
293 	if (!pi) {
294 		*error = "historical-service-time ps: Error allocating path context";
295 		return -ENOMEM;
296 	}
297 
298 	pi->path = path;
299 	pi->repeat_count = repeat_count;
300 
301 	pi->historical_service_time = HST_FIXED_1;
302 
303 	spin_lock_init(&pi->lock);
304 	pi->outstanding = 0;
305 
306 	pi->stale_after = 0;
307 	pi->last_finish = 0;
308 
309 	path->pscontext = pi;
310 
311 	spin_lock_irqsave(&s->lock, flags);
312 	list_add_tail(&pi->list, &s->valid_paths);
313 	s->valid_count++;
314 	spin_unlock_irqrestore(&s->lock, flags);
315 
316 	return 0;
317 }
318 
319 static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
320 {
321 	struct selector *s = ps->context;
322 	struct path_info *pi = path->pscontext;
323 	unsigned long flags;
324 
325 	spin_lock_irqsave(&s->lock, flags);
326 	list_move(&pi->list, &s->failed_paths);
327 	s->valid_count--;
328 	spin_unlock_irqrestore(&s->lock, flags);
329 }
330 
331 static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
332 {
333 	struct selector *s = ps->context;
334 	struct path_info *pi = path->pscontext;
335 	unsigned long flags;
336 
337 	spin_lock_irqsave(&s->lock, flags);
338 	list_move_tail(&pi->list, &s->valid_paths);
339 	s->valid_count++;
340 	spin_unlock_irqrestore(&s->lock, flags);
341 
342 	return 0;
343 }
344 
345 static void hst_fill_compare(struct path_info *pi, u64 *hst,
346 			     u64 *out, u64 *stale)
347 {
348 	unsigned long flags;
349 
350 	spin_lock_irqsave(&pi->lock, flags);
351 	*hst = pi->historical_service_time;
352 	*out = pi->outstanding;
353 	*stale = pi->stale_after;
354 	spin_unlock_irqrestore(&pi->lock, flags);
355 }
356 
357 /*
358  * Compare the estimated service time of 2 paths, pi1 and pi2,
359  * for the incoming I/O.
360  *
361  * Returns:
362  * < 0 : pi1 is better
363  * 0   : no difference between pi1 and pi2
364  * > 0 : pi2 is better
365  *
366  */
367 static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
368 			     u64 time_now, struct path_selector *ps)
369 {
370 	struct selector *s = ps->context;
371 	u64 hst1, hst2;
372 	long long out1, out2, stale1, stale2;
373 	int pi2_better, over_threshold;
374 
375 	hst_fill_compare(pi1, &hst1, &out1, &stale1);
376 	hst_fill_compare(pi2, &hst2, &out2, &stale2);
377 
378 	/* Check here if estimated latency for two paths are too similar.
379 	 * If this is the case, we skip extra calculation and just compare
380 	 * outstanding requests. In this case, any unloaded paths will
381 	 * be preferred.
382 	 */
383 	if (hst1 > hst2)
384 		over_threshold = hst1 > (s->threshold_multiplier * hst2);
385 	else
386 		over_threshold = hst2 > (s->threshold_multiplier * hst1);
387 
388 	if (!over_threshold)
389 		return out1 - out2;
390 
391 	/*
392 	 * If an unloaded path is stale, choose it. If both paths are unloaded,
393 	 * choose path that is the most stale.
394 	 * (If one path is loaded, choose the other)
395 	 */
396 	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
397 	    (!out1 && !out2))
398 		return (!out2 * stale1) - (!out1 * stale2);
399 
400 	/* Compare estimated service time. If outstanding is the same, we
401 	 * don't need to multiply
402 	 */
403 	if (out1 == out2) {
404 		pi2_better = hst1 > hst2;
405 	} else {
406 		/* Potential overflow with out >= 1024 */
407 		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
408 			     out2 >= HST_MAX_INFLIGHT)) {
409 			/* If over 1023 in-flights, we may overflow if hst
410 			 * is at max. (With this shift we still overflow at
411 			 * 1048576 in-flights, which is high enough).
412 			 */
413 			hst1 >>= HST_FIXED_SHIFT;
414 			hst2 >>= HST_FIXED_SHIFT;
415 		}
416 		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
417 	}
418 
419 	/* In the case that the 'winner' is stale, limit to equal usage. */
420 	if (pi2_better) {
421 		if (stale2 < time_now)
422 			return out1 - out2;
423 		return 1;
424 	}
425 	if (stale1 < time_now)
426 		return out1 - out2;
427 	return -1;
428 }
429 
430 static struct dm_path *hst_select_path(struct path_selector *ps,
431 				       size_t nr_bytes)
432 {
433 	struct selector *s = ps->context;
434 	struct path_info *pi = NULL, *best = NULL;
435 	u64 time_now = sched_clock();
436 	struct dm_path *ret = NULL;
437 	unsigned long flags;
438 
439 	spin_lock_irqsave(&s->lock, flags);
440 	if (list_empty(&s->valid_paths))
441 		goto out;
442 
443 	list_for_each_entry(pi, &s->valid_paths, list) {
444 		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
445 			best = pi;
446 	}
447 
448 	if (!best)
449 		goto out;
450 
451 	/* Move last used path to end (least preferred in case of ties) */
452 	list_move_tail(&best->list, &s->valid_paths);
453 
454 	ret = best->path;
455 
456 out:
457 	spin_unlock_irqrestore(&s->lock, flags);
458 	return ret;
459 }
460 
461 static int hst_start_io(struct path_selector *ps, struct dm_path *path,
462 			size_t nr_bytes)
463 {
464 	struct path_info *pi = path->pscontext;
465 	unsigned long flags;
466 
467 	spin_lock_irqsave(&pi->lock, flags);
468 	pi->outstanding++;
469 	spin_unlock_irqrestore(&pi->lock, flags);
470 
471 	return 0;
472 }
473 
474 static u64 path_service_time(struct path_info *pi, u64 start_time)
475 {
476 	u64 sched_now = ktime_get_ns();
477 
478 	/* if a previous disk request has finished after this IO was
479 	 * sent to the hardware, pretend the submission happened
480 	 * serially.
481 	 */
482 	if (time_after64(pi->last_finish, start_time))
483 		start_time = pi->last_finish;
484 
485 	pi->last_finish = sched_now;
486 	if (time_before64(sched_now, start_time))
487 		return 0;
488 
489 	return sched_now - start_time;
490 }
491 
492 static int hst_end_io(struct path_selector *ps, struct dm_path *path,
493 		      size_t nr_bytes, u64 start_time)
494 {
495 	struct path_info *pi = path->pscontext;
496 	struct selector *s = ps->context;
497 	unsigned long flags;
498 	u64 st;
499 
500 	spin_lock_irqsave(&pi->lock, flags);
501 
502 	st = path_service_time(pi, start_time);
503 	pi->outstanding--;
504 	pi->historical_service_time =
505 		fixed_ema(pi->historical_service_time,
506 			  min(st * HST_FIXED_1, HST_FIXED_MAX),
507 			  hst_weight(ps, st));
508 
509 	/*
510 	 * On request end, mark path as fresh. If a path hasn't
511 	 * finished any requests within the fresh period, the estimated
512 	 * service time is considered too optimistic and we limit the
513 	 * maximum requests on that path.
514 	 */
515 	pi->stale_after = pi->last_finish +
516 		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
517 
518 	spin_unlock_irqrestore(&pi->lock, flags);
519 
520 	return 0;
521 }
522 
523 static struct path_selector_type hst_ps = {
524 	.name		= "historical-service-time",
525 	.module		= THIS_MODULE,
526 	.table_args	= 1,
527 	.info_args	= 3,
528 	.create		= hst_create,
529 	.destroy	= hst_destroy,
530 	.status		= hst_status,
531 	.add_path	= hst_add_path,
532 	.fail_path	= hst_fail_path,
533 	.reinstate_path	= hst_reinstate_path,
534 	.select_path	= hst_select_path,
535 	.start_io	= hst_start_io,
536 	.end_io		= hst_end_io,
537 };
538 
539 static int __init dm_hst_init(void)
540 {
541 	int r = dm_register_path_selector(&hst_ps);
542 
543 	if (r < 0)
544 		DMERR("register failed %d", r);
545 
546 	DMINFO("version " HST_VERSION " loaded");
547 
548 	return r;
549 }
550 
551 static void __exit dm_hst_exit(void)
552 {
553 	int r = dm_unregister_path_selector(&hst_ps);
554 
555 	if (r < 0)
556 		DMERR("unregister failed %d", r);
557 }
558 
559 module_init(dm_hst_init);
560 module_exit(dm_hst_exit);
561 
562 MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
563 MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
564 MODULE_LICENSE("GPL");
565