xref: /openbmc/linux/block/kyber-iosched.c (revision c000c4f1)
1 /*
2  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3  * scalable techniques.
4  *
5  * Copyright (C) 2017 Facebook
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public
9  * License v2 as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
18  */
19 
20 #include <linux/kernel.h>
21 #include <linux/blkdev.h>
22 #include <linux/blk-mq.h>
23 #include <linux/elevator.h>
24 #include <linux/module.h>
25 #include <linux/sbitmap.h>
26 
27 #include "blk.h"
28 #include "blk-mq.h"
29 #include "blk-mq-debugfs.h"
30 #include "blk-mq-sched.h"
31 #include "blk-mq-tag.h"
32 #include "blk-stat.h"
33 
34 /* Scheduling domains. */
35 enum {
36 	KYBER_READ,
37 	KYBER_SYNC_WRITE,
38 	KYBER_OTHER, /* Async writes, discard, etc. */
39 	KYBER_NUM_DOMAINS,
40 };
41 
42 enum {
43 	KYBER_MIN_DEPTH = 256,
44 
45 	/*
46 	 * In order to prevent starvation of synchronous requests by a flood of
47 	 * asynchronous requests, we reserve 25% of requests for synchronous
48 	 * operations.
49 	 */
50 	KYBER_ASYNC_PERCENT = 75,
51 };
52 
53 /*
54  * Initial device-wide depths for each scheduling domain.
55  *
56  * Even for fast devices with lots of tags like NVMe, you can saturate
57  * the device with only a fraction of the maximum possible queue depth.
58  * So, we cap these to a reasonable value.
59  */
60 static const unsigned int kyber_depth[] = {
61 	[KYBER_READ] = 256,
62 	[KYBER_SYNC_WRITE] = 128,
63 	[KYBER_OTHER] = 64,
64 };
65 
66 /*
67  * Scheduling domain batch sizes. We favor reads.
68  */
69 static const unsigned int kyber_batch_size[] = {
70 	[KYBER_READ] = 16,
71 	[KYBER_SYNC_WRITE] = 8,
72 	[KYBER_OTHER] = 8,
73 };
74 
75 /*
76  * There is a same mapping between ctx & hctx and kcq & khd,
77  * we use request->mq_ctx->index_hw to index the kcq in khd.
78  */
79 struct kyber_ctx_queue {
80 	/*
81 	 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
82 	 * Also protect the rqs on rq_list when merge.
83 	 */
84 	spinlock_t lock;
85 	struct list_head rq_list[KYBER_NUM_DOMAINS];
86 } ____cacheline_aligned_in_smp;
87 
88 struct kyber_queue_data {
89 	struct request_queue *q;
90 
91 	struct blk_stat_callback *cb;
92 
93 	/*
94 	 * The device is divided into multiple scheduling domains based on the
95 	 * request type. Each domain has a fixed number of in-flight requests of
96 	 * that type device-wide, limited by these tokens.
97 	 */
98 	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
99 
100 	/*
101 	 * Async request percentage, converted to per-word depth for
102 	 * sbitmap_get_shallow().
103 	 */
104 	unsigned int async_depth;
105 
106 	/* Target latencies in nanoseconds. */
107 	u64 read_lat_nsec, write_lat_nsec;
108 };
109 
110 struct kyber_hctx_data {
111 	spinlock_t lock;
112 	struct list_head rqs[KYBER_NUM_DOMAINS];
113 	unsigned int cur_domain;
114 	unsigned int batching;
115 	struct kyber_ctx_queue *kcqs;
116 	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
117 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
118 	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
119 	atomic_t wait_index[KYBER_NUM_DOMAINS];
120 };
121 
122 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
123 			     void *key);
124 
125 static unsigned int kyber_sched_domain(unsigned int op)
126 {
127 	if ((op & REQ_OP_MASK) == REQ_OP_READ)
128 		return KYBER_READ;
129 	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
130 		return KYBER_SYNC_WRITE;
131 	else
132 		return KYBER_OTHER;
133 }
134 
135 enum {
136 	NONE = 0,
137 	GOOD = 1,
138 	GREAT = 2,
139 	BAD = -1,
140 	AWFUL = -2,
141 };
142 
143 #define IS_GOOD(status) ((status) > 0)
144 #define IS_BAD(status) ((status) < 0)
145 
146 static int kyber_lat_status(struct blk_stat_callback *cb,
147 			    unsigned int sched_domain, u64 target)
148 {
149 	u64 latency;
150 
151 	if (!cb->stat[sched_domain].nr_samples)
152 		return NONE;
153 
154 	latency = cb->stat[sched_domain].mean;
155 	if (latency >= 2 * target)
156 		return AWFUL;
157 	else if (latency > target)
158 		return BAD;
159 	else if (latency <= target / 2)
160 		return GREAT;
161 	else /* (latency <= target) */
162 		return GOOD;
163 }
164 
165 /*
166  * Adjust the read or synchronous write depth given the status of reads and
167  * writes. The goal is that the latencies of the two domains are fair (i.e., if
168  * one is good, then the other is good).
169  */
170 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
171 				  unsigned int sched_domain, int this_status,
172 				  int other_status)
173 {
174 	unsigned int orig_depth, depth;
175 
176 	/*
177 	 * If this domain had no samples, or reads and writes are both good or
178 	 * both bad, don't adjust the depth.
179 	 */
180 	if (this_status == NONE ||
181 	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
182 	    (IS_BAD(this_status) && IS_BAD(other_status)))
183 		return;
184 
185 	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
186 
187 	if (other_status == NONE) {
188 		depth++;
189 	} else {
190 		switch (this_status) {
191 		case GOOD:
192 			if (other_status == AWFUL)
193 				depth -= max(depth / 4, 1U);
194 			else
195 				depth -= max(depth / 8, 1U);
196 			break;
197 		case GREAT:
198 			if (other_status == AWFUL)
199 				depth /= 2;
200 			else
201 				depth -= max(depth / 4, 1U);
202 			break;
203 		case BAD:
204 			depth++;
205 			break;
206 		case AWFUL:
207 			if (other_status == GREAT)
208 				depth += 2;
209 			else
210 				depth++;
211 			break;
212 		}
213 	}
214 
215 	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
216 	if (depth != orig_depth)
217 		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
218 }
219 
220 /*
221  * Adjust the depth of other requests given the status of reads and synchronous
222  * writes. As long as either domain is doing fine, we don't throttle, but if
223  * both domains are doing badly, we throttle heavily.
224  */
225 static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
226 				     int read_status, int write_status,
227 				     bool have_samples)
228 {
229 	unsigned int orig_depth, depth;
230 	int status;
231 
232 	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
233 
234 	if (read_status == NONE && write_status == NONE) {
235 		depth += 2;
236 	} else if (have_samples) {
237 		if (read_status == NONE)
238 			status = write_status;
239 		else if (write_status == NONE)
240 			status = read_status;
241 		else
242 			status = max(read_status, write_status);
243 		switch (status) {
244 		case GREAT:
245 			depth += 2;
246 			break;
247 		case GOOD:
248 			depth++;
249 			break;
250 		case BAD:
251 			depth -= max(depth / 4, 1U);
252 			break;
253 		case AWFUL:
254 			depth /= 2;
255 			break;
256 		}
257 	}
258 
259 	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
260 	if (depth != orig_depth)
261 		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
262 }
263 
264 /*
265  * Apply heuristics for limiting queue depths based on gathered latency
266  * statistics.
267  */
268 static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
269 {
270 	struct kyber_queue_data *kqd = cb->data;
271 	int read_status, write_status;
272 
273 	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
274 	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
275 
276 	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
277 	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
278 	kyber_adjust_other_depth(kqd, read_status, write_status,
279 				 cb->stat[KYBER_OTHER].nr_samples != 0);
280 
281 	/*
282 	 * Continue monitoring latencies if we aren't hitting the targets or
283 	 * we're still throttling other requests.
284 	 */
285 	if (!blk_stat_is_active(kqd->cb) &&
286 	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
287 	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
288 		blk_stat_activate_msecs(kqd->cb, 100);
289 }
290 
291 static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
292 {
293 	/*
294 	 * All of the hardware queues have the same depth, so we can just grab
295 	 * the shift of the first one.
296 	 */
297 	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
298 }
299 
300 static int kyber_bucket_fn(const struct request *rq)
301 {
302 	return kyber_sched_domain(rq->cmd_flags);
303 }
304 
305 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
306 {
307 	struct kyber_queue_data *kqd;
308 	unsigned int max_tokens;
309 	unsigned int shift;
310 	int ret = -ENOMEM;
311 	int i;
312 
313 	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
314 	if (!kqd)
315 		goto err;
316 	kqd->q = q;
317 
318 	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
319 					  KYBER_NUM_DOMAINS, kqd);
320 	if (!kqd->cb)
321 		goto err_kqd;
322 
323 	/*
324 	 * The maximum number of tokens for any scheduling domain is at least
325 	 * the queue depth of a single hardware queue. If the hardware doesn't
326 	 * have many tags, still provide a reasonable number.
327 	 */
328 	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
329 			   KYBER_MIN_DEPTH);
330 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
331 		WARN_ON(!kyber_depth[i]);
332 		WARN_ON(!kyber_batch_size[i]);
333 		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
334 					      max_tokens, -1, false, GFP_KERNEL,
335 					      q->node);
336 		if (ret) {
337 			while (--i >= 0)
338 				sbitmap_queue_free(&kqd->domain_tokens[i]);
339 			goto err_cb;
340 		}
341 		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
342 	}
343 
344 	shift = kyber_sched_tags_shift(kqd);
345 	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
346 
347 	kqd->read_lat_nsec = 2000000ULL;
348 	kqd->write_lat_nsec = 10000000ULL;
349 
350 	return kqd;
351 
352 err_cb:
353 	blk_stat_free_callback(kqd->cb);
354 err_kqd:
355 	kfree(kqd);
356 err:
357 	return ERR_PTR(ret);
358 }
359 
360 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
361 {
362 	struct kyber_queue_data *kqd;
363 	struct elevator_queue *eq;
364 
365 	eq = elevator_alloc(q, e);
366 	if (!eq)
367 		return -ENOMEM;
368 
369 	kqd = kyber_queue_data_alloc(q);
370 	if (IS_ERR(kqd)) {
371 		kobject_put(&eq->kobj);
372 		return PTR_ERR(kqd);
373 	}
374 
375 	eq->elevator_data = kqd;
376 	q->elevator = eq;
377 
378 	blk_stat_add_callback(q, kqd->cb);
379 
380 	return 0;
381 }
382 
383 static void kyber_exit_sched(struct elevator_queue *e)
384 {
385 	struct kyber_queue_data *kqd = e->elevator_data;
386 	struct request_queue *q = kqd->q;
387 	int i;
388 
389 	blk_stat_remove_callback(q, kqd->cb);
390 
391 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
392 		sbitmap_queue_free(&kqd->domain_tokens[i]);
393 	blk_stat_free_callback(kqd->cb);
394 	kfree(kqd);
395 }
396 
397 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
398 {
399 	unsigned int i;
400 
401 	spin_lock_init(&kcq->lock);
402 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
403 		INIT_LIST_HEAD(&kcq->rq_list[i]);
404 }
405 
406 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
407 {
408 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
409 	struct kyber_hctx_data *khd;
410 	int i;
411 
412 	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
413 	if (!khd)
414 		return -ENOMEM;
415 
416 	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
417 				       sizeof(struct kyber_ctx_queue),
418 				       GFP_KERNEL, hctx->numa_node);
419 	if (!khd->kcqs)
420 		goto err_khd;
421 
422 	for (i = 0; i < hctx->nr_ctx; i++)
423 		kyber_ctx_queue_init(&khd->kcqs[i]);
424 
425 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
426 		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
427 				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
428 			while (--i >= 0)
429 				sbitmap_free(&khd->kcq_map[i]);
430 			goto err_kcqs;
431 		}
432 	}
433 
434 	spin_lock_init(&khd->lock);
435 
436 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
437 		INIT_LIST_HEAD(&khd->rqs[i]);
438 		init_waitqueue_func_entry(&khd->domain_wait[i],
439 					  kyber_domain_wake);
440 		khd->domain_wait[i].private = hctx;
441 		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
442 		atomic_set(&khd->wait_index[i], 0);
443 	}
444 
445 	khd->cur_domain = 0;
446 	khd->batching = 0;
447 
448 	hctx->sched_data = khd;
449 	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
450 					kqd->async_depth);
451 
452 	return 0;
453 
454 err_kcqs:
455 	kfree(khd->kcqs);
456 err_khd:
457 	kfree(khd);
458 	return -ENOMEM;
459 }
460 
461 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
462 {
463 	struct kyber_hctx_data *khd = hctx->sched_data;
464 	int i;
465 
466 	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
467 		sbitmap_free(&khd->kcq_map[i]);
468 	kfree(khd->kcqs);
469 	kfree(hctx->sched_data);
470 }
471 
472 static int rq_get_domain_token(struct request *rq)
473 {
474 	return (long)rq->elv.priv[0];
475 }
476 
477 static void rq_set_domain_token(struct request *rq, int token)
478 {
479 	rq->elv.priv[0] = (void *)(long)token;
480 }
481 
482 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
483 				  struct request *rq)
484 {
485 	unsigned int sched_domain;
486 	int nr;
487 
488 	nr = rq_get_domain_token(rq);
489 	if (nr != -1) {
490 		sched_domain = kyber_sched_domain(rq->cmd_flags);
491 		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
492 				    rq->mq_ctx->cpu);
493 	}
494 }
495 
496 static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
497 {
498 	/*
499 	 * We use the scheduler tags as per-hardware queue queueing tokens.
500 	 * Async requests can be limited at this stage.
501 	 */
502 	if (!op_is_sync(op)) {
503 		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
504 
505 		data->shallow_depth = kqd->async_depth;
506 	}
507 }
508 
509 static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
510 {
511 	struct kyber_hctx_data *khd = hctx->sched_data;
512 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
513 	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
514 	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
515 	struct list_head *rq_list = &kcq->rq_list[sched_domain];
516 	bool merged;
517 
518 	spin_lock(&kcq->lock);
519 	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
520 	spin_unlock(&kcq->lock);
521 	blk_mq_put_ctx(ctx);
522 
523 	return merged;
524 }
525 
526 static void kyber_prepare_request(struct request *rq, struct bio *bio)
527 {
528 	rq_set_domain_token(rq, -1);
529 }
530 
531 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
532 				  struct list_head *rq_list, bool at_head)
533 {
534 	struct kyber_hctx_data *khd = hctx->sched_data;
535 	struct request *rq, *next;
536 
537 	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
538 		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
539 		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
540 		struct list_head *head = &kcq->rq_list[sched_domain];
541 
542 		spin_lock(&kcq->lock);
543 		if (at_head)
544 			list_move(&rq->queuelist, head);
545 		else
546 			list_move_tail(&rq->queuelist, head);
547 		sbitmap_set_bit(&khd->kcq_map[sched_domain],
548 				rq->mq_ctx->index_hw);
549 		blk_mq_sched_request_inserted(rq);
550 		spin_unlock(&kcq->lock);
551 	}
552 }
553 
554 static void kyber_finish_request(struct request *rq)
555 {
556 	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
557 
558 	rq_clear_domain_token(kqd, rq);
559 }
560 
561 static void kyber_completed_request(struct request *rq)
562 {
563 	struct request_queue *q = rq->q;
564 	struct kyber_queue_data *kqd = q->elevator->elevator_data;
565 	unsigned int sched_domain;
566 	u64 now, latency, target;
567 
568 	/*
569 	 * Check if this request met our latency goal. If not, quickly gather
570 	 * some statistics and start throttling.
571 	 */
572 	sched_domain = kyber_sched_domain(rq->cmd_flags);
573 	switch (sched_domain) {
574 	case KYBER_READ:
575 		target = kqd->read_lat_nsec;
576 		break;
577 	case KYBER_SYNC_WRITE:
578 		target = kqd->write_lat_nsec;
579 		break;
580 	default:
581 		return;
582 	}
583 
584 	/* If we are already monitoring latencies, don't check again. */
585 	if (blk_stat_is_active(kqd->cb))
586 		return;
587 
588 	now = ktime_get_ns();
589 	if (now < rq->io_start_time_ns)
590 		return;
591 
592 	latency = now - rq->io_start_time_ns;
593 
594 	if (latency > target)
595 		blk_stat_activate_msecs(kqd->cb, 10);
596 }
597 
598 struct flush_kcq_data {
599 	struct kyber_hctx_data *khd;
600 	unsigned int sched_domain;
601 	struct list_head *list;
602 };
603 
604 static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
605 {
606 	struct flush_kcq_data *flush_data = data;
607 	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
608 
609 	spin_lock(&kcq->lock);
610 	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
611 			      flush_data->list);
612 	sbitmap_clear_bit(sb, bitnr);
613 	spin_unlock(&kcq->lock);
614 
615 	return true;
616 }
617 
618 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
619 				  unsigned int sched_domain,
620 				  struct list_head *list)
621 {
622 	struct flush_kcq_data data = {
623 		.khd = khd,
624 		.sched_domain = sched_domain,
625 		.list = list,
626 	};
627 
628 	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
629 			     flush_busy_kcq, &data);
630 }
631 
632 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
633 			     void *key)
634 {
635 	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
636 
637 	list_del_init(&wait->entry);
638 	blk_mq_run_hw_queue(hctx, true);
639 	return 1;
640 }
641 
642 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
643 				  struct kyber_hctx_data *khd,
644 				  struct blk_mq_hw_ctx *hctx)
645 {
646 	unsigned int sched_domain = khd->cur_domain;
647 	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
648 	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
649 	struct sbq_wait_state *ws;
650 	int nr;
651 
652 	nr = __sbitmap_queue_get(domain_tokens);
653 
654 	/*
655 	 * If we failed to get a domain token, make sure the hardware queue is
656 	 * run when one becomes available. Note that this is serialized on
657 	 * khd->lock, but we still need to be careful about the waker.
658 	 */
659 	if (nr < 0 && list_empty_careful(&wait->entry)) {
660 		ws = sbq_wait_ptr(domain_tokens,
661 				  &khd->wait_index[sched_domain]);
662 		khd->domain_ws[sched_domain] = ws;
663 		add_wait_queue(&ws->wait, wait);
664 
665 		/*
666 		 * Try again in case a token was freed before we got on the wait
667 		 * queue.
668 		 */
669 		nr = __sbitmap_queue_get(domain_tokens);
670 	}
671 
672 	/*
673 	 * If we got a token while we were on the wait queue, remove ourselves
674 	 * from the wait queue to ensure that all wake ups make forward
675 	 * progress. It's possible that the waker already deleted the entry
676 	 * between the !list_empty_careful() check and us grabbing the lock, but
677 	 * list_del_init() is okay with that.
678 	 */
679 	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
680 		ws = khd->domain_ws[sched_domain];
681 		spin_lock_irq(&ws->wait.lock);
682 		list_del_init(&wait->entry);
683 		spin_unlock_irq(&ws->wait.lock);
684 	}
685 
686 	return nr;
687 }
688 
689 static struct request *
690 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
691 			  struct kyber_hctx_data *khd,
692 			  struct blk_mq_hw_ctx *hctx)
693 {
694 	struct list_head *rqs;
695 	struct request *rq;
696 	int nr;
697 
698 	rqs = &khd->rqs[khd->cur_domain];
699 
700 	/*
701 	 * If we already have a flushed request, then we just need to get a
702 	 * token for it. Otherwise, if there are pending requests in the kcqs,
703 	 * flush the kcqs, but only if we can get a token. If not, we should
704 	 * leave the requests in the kcqs so that they can be merged. Note that
705 	 * khd->lock serializes the flushes, so if we observed any bit set in
706 	 * the kcq_map, we will always get a request.
707 	 */
708 	rq = list_first_entry_or_null(rqs, struct request, queuelist);
709 	if (rq) {
710 		nr = kyber_get_domain_token(kqd, khd, hctx);
711 		if (nr >= 0) {
712 			khd->batching++;
713 			rq_set_domain_token(rq, nr);
714 			list_del_init(&rq->queuelist);
715 			return rq;
716 		}
717 	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
718 		nr = kyber_get_domain_token(kqd, khd, hctx);
719 		if (nr >= 0) {
720 			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
721 			rq = list_first_entry(rqs, struct request, queuelist);
722 			khd->batching++;
723 			rq_set_domain_token(rq, nr);
724 			list_del_init(&rq->queuelist);
725 			return rq;
726 		}
727 	}
728 
729 	/* There were either no pending requests or no tokens. */
730 	return NULL;
731 }
732 
733 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
734 {
735 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
736 	struct kyber_hctx_data *khd = hctx->sched_data;
737 	struct request *rq;
738 	int i;
739 
740 	spin_lock(&khd->lock);
741 
742 	/*
743 	 * First, if we are still entitled to batch, try to dispatch a request
744 	 * from the batch.
745 	 */
746 	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
747 		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
748 		if (rq)
749 			goto out;
750 	}
751 
752 	/*
753 	 * Either,
754 	 * 1. We were no longer entitled to a batch.
755 	 * 2. The domain we were batching didn't have any requests.
756 	 * 3. The domain we were batching was out of tokens.
757 	 *
758 	 * Start another batch. Note that this wraps back around to the original
759 	 * domain if no other domains have requests or tokens.
760 	 */
761 	khd->batching = 0;
762 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
763 		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
764 			khd->cur_domain = 0;
765 		else
766 			khd->cur_domain++;
767 
768 		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
769 		if (rq)
770 			goto out;
771 	}
772 
773 	rq = NULL;
774 out:
775 	spin_unlock(&khd->lock);
776 	return rq;
777 }
778 
779 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
780 {
781 	struct kyber_hctx_data *khd = hctx->sched_data;
782 	int i;
783 
784 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
785 		if (!list_empty_careful(&khd->rqs[i]) ||
786 		    sbitmap_any_bit_set(&khd->kcq_map[i]))
787 			return true;
788 	}
789 
790 	return false;
791 }
792 
793 #define KYBER_LAT_SHOW_STORE(op)					\
794 static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
795 				     char *page)			\
796 {									\
797 	struct kyber_queue_data *kqd = e->elevator_data;		\
798 									\
799 	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
800 }									\
801 									\
802 static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,		\
803 				      const char *page, size_t count)	\
804 {									\
805 	struct kyber_queue_data *kqd = e->elevator_data;		\
806 	unsigned long long nsec;					\
807 	int ret;							\
808 									\
809 	ret = kstrtoull(page, 10, &nsec);				\
810 	if (ret)							\
811 		return ret;						\
812 									\
813 	kqd->op##_lat_nsec = nsec;					\
814 									\
815 	return count;							\
816 }
817 KYBER_LAT_SHOW_STORE(read);
818 KYBER_LAT_SHOW_STORE(write);
819 #undef KYBER_LAT_SHOW_STORE
820 
821 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
822 static struct elv_fs_entry kyber_sched_attrs[] = {
823 	KYBER_LAT_ATTR(read),
824 	KYBER_LAT_ATTR(write),
825 	__ATTR_NULL
826 };
827 #undef KYBER_LAT_ATTR
828 
829 #ifdef CONFIG_BLK_DEBUG_FS
830 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
831 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
832 {									\
833 	struct request_queue *q = data;					\
834 	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
835 									\
836 	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
837 	return 0;							\
838 }									\
839 									\
840 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
841 	__acquires(&khd->lock)						\
842 {									\
843 	struct blk_mq_hw_ctx *hctx = m->private;			\
844 	struct kyber_hctx_data *khd = hctx->sched_data;			\
845 									\
846 	spin_lock(&khd->lock);						\
847 	return seq_list_start(&khd->rqs[domain], *pos);			\
848 }									\
849 									\
850 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
851 				     loff_t *pos)			\
852 {									\
853 	struct blk_mq_hw_ctx *hctx = m->private;			\
854 	struct kyber_hctx_data *khd = hctx->sched_data;			\
855 									\
856 	return seq_list_next(v, &khd->rqs[domain], pos);		\
857 }									\
858 									\
859 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
860 	__releases(&khd->lock)						\
861 {									\
862 	struct blk_mq_hw_ctx *hctx = m->private;			\
863 	struct kyber_hctx_data *khd = hctx->sched_data;			\
864 									\
865 	spin_unlock(&khd->lock);					\
866 }									\
867 									\
868 static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
869 	.start	= kyber_##name##_rqs_start,				\
870 	.next	= kyber_##name##_rqs_next,				\
871 	.stop	= kyber_##name##_rqs_stop,				\
872 	.show	= blk_mq_debugfs_rq_show,				\
873 };									\
874 									\
875 static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
876 {									\
877 	struct blk_mq_hw_ctx *hctx = data;				\
878 	struct kyber_hctx_data *khd = hctx->sched_data;			\
879 	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
880 									\
881 	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
882 	return 0;							\
883 }
884 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
885 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
886 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
887 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
888 
889 static int kyber_async_depth_show(void *data, struct seq_file *m)
890 {
891 	struct request_queue *q = data;
892 	struct kyber_queue_data *kqd = q->elevator->elevator_data;
893 
894 	seq_printf(m, "%u\n", kqd->async_depth);
895 	return 0;
896 }
897 
898 static int kyber_cur_domain_show(void *data, struct seq_file *m)
899 {
900 	struct blk_mq_hw_ctx *hctx = data;
901 	struct kyber_hctx_data *khd = hctx->sched_data;
902 
903 	switch (khd->cur_domain) {
904 	case KYBER_READ:
905 		seq_puts(m, "READ\n");
906 		break;
907 	case KYBER_SYNC_WRITE:
908 		seq_puts(m, "SYNC_WRITE\n");
909 		break;
910 	case KYBER_OTHER:
911 		seq_puts(m, "OTHER\n");
912 		break;
913 	default:
914 		seq_printf(m, "%u\n", khd->cur_domain);
915 		break;
916 	}
917 	return 0;
918 }
919 
920 static int kyber_batching_show(void *data, struct seq_file *m)
921 {
922 	struct blk_mq_hw_ctx *hctx = data;
923 	struct kyber_hctx_data *khd = hctx->sched_data;
924 
925 	seq_printf(m, "%u\n", khd->batching);
926 	return 0;
927 }
928 
929 #define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
930 	{#name "_tokens", 0400, kyber_##name##_tokens_show}
931 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
932 	KYBER_QUEUE_DOMAIN_ATTRS(read),
933 	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
934 	KYBER_QUEUE_DOMAIN_ATTRS(other),
935 	{"async_depth", 0400, kyber_async_depth_show},
936 	{},
937 };
938 #undef KYBER_QUEUE_DOMAIN_ATTRS
939 
940 #define KYBER_HCTX_DOMAIN_ATTRS(name)					\
941 	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
942 	{#name "_waiting", 0400, kyber_##name##_waiting_show}
943 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
944 	KYBER_HCTX_DOMAIN_ATTRS(read),
945 	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
946 	KYBER_HCTX_DOMAIN_ATTRS(other),
947 	{"cur_domain", 0400, kyber_cur_domain_show},
948 	{"batching", 0400, kyber_batching_show},
949 	{},
950 };
951 #undef KYBER_HCTX_DOMAIN_ATTRS
952 #endif
953 
954 static struct elevator_type kyber_sched = {
955 	.ops.mq = {
956 		.init_sched = kyber_init_sched,
957 		.exit_sched = kyber_exit_sched,
958 		.init_hctx = kyber_init_hctx,
959 		.exit_hctx = kyber_exit_hctx,
960 		.limit_depth = kyber_limit_depth,
961 		.bio_merge = kyber_bio_merge,
962 		.prepare_request = kyber_prepare_request,
963 		.insert_requests = kyber_insert_requests,
964 		.finish_request = kyber_finish_request,
965 		.requeue_request = kyber_finish_request,
966 		.completed_request = kyber_completed_request,
967 		.dispatch_request = kyber_dispatch_request,
968 		.has_work = kyber_has_work,
969 	},
970 	.uses_mq = true,
971 #ifdef CONFIG_BLK_DEBUG_FS
972 	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
973 	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
974 #endif
975 	.elevator_attrs = kyber_sched_attrs,
976 	.elevator_name = "kyber",
977 	.elevator_owner = THIS_MODULE,
978 };
979 
980 static int __init kyber_init(void)
981 {
982 	return elv_register(&kyber_sched);
983 }
984 
985 static void __exit kyber_exit(void)
986 {
987 	elv_unregister(&kyber_sched);
988 }
989 
990 module_init(kyber_init);
991 module_exit(kyber_exit);
992 
993 MODULE_AUTHOR("Omar Sandoval");
994 MODULE_LICENSE("GPL");
995 MODULE_DESCRIPTION("Kyber I/O scheduler");
996