xref: /openbmc/linux/block/elevator.c (revision 22d55f02)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Block device elevator/IO-scheduler.
4  *
5  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6  *
7  * 30042000 Jens Axboe <axboe@kernel.dk> :
8  *
9  * Split the elevator a bit so that it is possible to choose a different
10  * one or even write a new "plug in". There are three pieces:
11  * - elevator_fn, inserts a new request in the queue list
12  * - elevator_merge_fn, decides whether a new buffer can be merged with
13  *   an existing request
14  * - elevator_dequeue_fn, called when a request is taken off the active list
15  *
16  * 20082000 Dave Jones <davej@suse.de> :
17  * Removed tests for max-bomb-segments, which was breaking elvtune
18  *  when run without -bN
19  *
20  * Jens:
21  * - Rework again to work with bio instead of buffer_heads
22  * - loose bi_dev comparisons, partition handling is right now
23  * - completely modularize elevator setup and teardown
24  *
25  */
26 #include <linux/kernel.h>
27 #include <linux/fs.h>
28 #include <linux/blkdev.h>
29 #include <linux/elevator.h>
30 #include <linux/bio.h>
31 #include <linux/module.h>
32 #include <linux/slab.h>
33 #include <linux/init.h>
34 #include <linux/compiler.h>
35 #include <linux/blktrace_api.h>
36 #include <linux/hash.h>
37 #include <linux/uaccess.h>
38 #include <linux/pm_runtime.h>
39 #include <linux/blk-cgroup.h>
40 
41 #include <trace/events/block.h>
42 
43 #include "blk.h"
44 #include "blk-mq-sched.h"
45 #include "blk-pm.h"
46 #include "blk-wbt.h"
47 
/*
 * elv_list holds every registered elevator_type, linked through its 'list'
 * member.  elv_list_lock is taken around lookups, insertions and removals
 * on that list.
 */
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
50 
/*
 * Merge hash stuff.
 *
 * A request is hashed under blk_rq_pos(rq) + blk_rq_sectors(rq).
 * elv_merge() probes the hash with a bio's starting sector when it looks
 * for a back-merge candidate.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
55 
56 /*
57  * Query io scheduler to see if the current process issuing bio may be
58  * merged with rq.
59  */
60 static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
61 {
62 	struct request_queue *q = rq->q;
63 	struct elevator_queue *e = q->elevator;
64 
65 	if (e->type->ops.allow_merge)
66 		return e->type->ops.allow_merge(q, rq, bio);
67 
68 	return 1;
69 }
70 
/*
 * A bio may be merged into rq only if both the generic block-layer check
 * (blk_rq_merge_ok) and the I/O scheduler agree.  The scheduler is only
 * consulted when the generic check passes.
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	return blk_rq_merge_ok(rq, bio) &&
	       elv_iosched_allow_bio_merge(rq, bio);
}
EXPORT_SYMBOL(elv_bio_merge_ok);
85 
86 static bool elevator_match(const struct elevator_type *e, const char *name)
87 {
88 	if (!strcmp(e->elevator_name, name))
89 		return true;
90 	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
91 		return true;
92 
93 	return false;
94 }
95 
96 /*
97  * Return scheduler with name 'name'
98  */
99 static struct elevator_type *elevator_find(const char *name)
100 {
101 	struct elevator_type *e;
102 
103 	list_for_each_entry(e, &elv_list, list) {
104 		if (elevator_match(e, name))
105 			return e;
106 	}
107 
108 	return NULL;
109 }
110 
/*
 * Drop the module reference on the elevator's owner.  This pairs with the
 * try_module_get() done in elevator_get().
 */
static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}
115 
116 static struct elevator_type *elevator_get(struct request_queue *q,
117 					  const char *name, bool try_loading)
118 {
119 	struct elevator_type *e;
120 
121 	spin_lock(&elv_list_lock);
122 
123 	e = elevator_find(name);
124 	if (!e && try_loading) {
125 		spin_unlock(&elv_list_lock);
126 		request_module("%s-iosched", name);
127 		spin_lock(&elv_list_lock);
128 		e = elevator_find(name);
129 	}
130 
131 	if (e && !try_module_get(e->elevator_owner))
132 		e = NULL;
133 
134 	spin_unlock(&elv_list_lock);
135 	return e;
136 }
137 
/* Scheduler name passed with "elevator=" on the kernel command line. */
static char chosen_elevator[ELV_NAME_MAX];

/*
 * __setup() handler for "elevator=": copy the requested name into
 * chosen_elevator and return 1.
 */
static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 *
	 * At most sizeof(chosen_elevator) - 1 bytes are copied, so the last
	 * byte of the static buffer is never written here.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);
151 
152 static struct kobj_type elv_ktype;
153 
154 struct elevator_queue *elevator_alloc(struct request_queue *q,
155 				  struct elevator_type *e)
156 {
157 	struct elevator_queue *eq;
158 
159 	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
160 	if (unlikely(!eq))
161 		return NULL;
162 
163 	eq->type = e;
164 	kobject_init(&eq->kobj, &elv_ktype);
165 	mutex_init(&eq->sysfs_lock);
166 	hash_init(eq->hash);
167 
168 	return eq;
169 }
170 EXPORT_SYMBOL(elevator_alloc);
171 
172 static void elevator_release(struct kobject *kobj)
173 {
174 	struct elevator_queue *e;
175 
176 	e = container_of(kobj, struct elevator_queue, kobj);
177 	elevator_put(e->type);
178 	kfree(e);
179 }
180 
/*
 * Shut down elevator 'e' of queue 'q'.
 *
 * Under e->sysfs_lock, blk_mq_exit_sched() is called if the scheduler type
 * provides an exit_sched hook.  The elevator's kobject reference is then
 * dropped; elevator_release() is that kobject's release callback.
 */
void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->type->ops.exit_sched)
		blk_mq_exit_sched(q, e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}
190 
/*
 * Remove rq from the merge hash and clear RQF_HASHED.  No check is made
 * that rq is currently hashed; callers that are unsure use
 * elv_rqhash_del().
 */
static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}
196 
/*
 * Remove rq from the merge hash if it is currently on it; otherwise do
 * nothing.  'q' is not used.
 */
void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (!ELV_ON_HASH(rq))
		return;

	__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);
203 
204 void elv_rqhash_add(struct request_queue *q, struct request *rq)
205 {
206 	struct elevator_queue *e = q->elevator;
207 
208 	BUG_ON(ELV_ON_HASH(rq));
209 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
210 	rq->rq_flags |= RQF_HASHED;
211 }
212 EXPORT_SYMBOL_GPL(elv_rqhash_add);
213 
/*
 * Re-hash rq under its current rq_hash_key() by removing it and adding it
 * again.  The removal is unconditional, so rq is expected to be hashed
 * already.
 */
void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}
219 
220 struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
221 {
222 	struct elevator_queue *e = q->elevator;
223 	struct hlist_node *next;
224 	struct request *rq;
225 
226 	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
227 		BUG_ON(!ELV_ON_HASH(rq));
228 
229 		if (unlikely(!rq_mergeable(rq))) {
230 			__elv_rqhash_del(rq);
231 			continue;
232 		}
233 
234 		if (rq_hash_key(rq) == offset)
235 			return rq;
236 	}
237 
238 	return NULL;
239 }
240 
241 /*
242  * RB-tree support functions for inserting/lookup/removal of requests
243  * in a sorted RB tree.
244  */
245 void elv_rb_add(struct rb_root *root, struct request *rq)
246 {
247 	struct rb_node **p = &root->rb_node;
248 	struct rb_node *parent = NULL;
249 	struct request *__rq;
250 
251 	while (*p) {
252 		parent = *p;
253 		__rq = rb_entry(parent, struct request, rb_node);
254 
255 		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
256 			p = &(*p)->rb_left;
257 		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
258 			p = &(*p)->rb_right;
259 	}
260 
261 	rb_link_node(&rq->rb_node, parent, p);
262 	rb_insert_color(&rq->rb_node, root);
263 }
264 EXPORT_SYMBOL(elv_rb_add);
265 
/*
 * Remove rq from the tree at 'root' and mark its node as empty again.
 * It is a BUG to call this for a request whose rb_node is already empty.
 */
void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);
273 
274 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
275 {
276 	struct rb_node *n = root->rb_node;
277 	struct request *rq;
278 
279 	while (n) {
280 		rq = rb_entry(n, struct request, rb_node);
281 
282 		if (sector < blk_rq_pos(rq))
283 			n = n->rb_left;
284 		else if (sector > blk_rq_pos(rq))
285 			n = n->rb_right;
286 		else
287 			return rq;
288 	}
289 
290 	return NULL;
291 }
292 EXPORT_SYMBOL(elv_rb_find);
293 
294 enum elv_merge elv_merge(struct request_queue *q, struct request **req,
295 		struct bio *bio)
296 {
297 	struct elevator_queue *e = q->elevator;
298 	struct request *__rq;
299 
300 	/*
301 	 * Levels of merges:
302 	 * 	nomerges:  No merges at all attempted
303 	 * 	noxmerges: Only simple one-hit cache try
304 	 * 	merges:	   All merge tries attempted
305 	 */
306 	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
307 		return ELEVATOR_NO_MERGE;
308 
309 	/*
310 	 * First try one-hit cache.
311 	 */
312 	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
313 		enum elv_merge ret = blk_try_merge(q->last_merge, bio);
314 
315 		if (ret != ELEVATOR_NO_MERGE) {
316 			*req = q->last_merge;
317 			return ret;
318 		}
319 	}
320 
321 	if (blk_queue_noxmerges(q))
322 		return ELEVATOR_NO_MERGE;
323 
324 	/*
325 	 * See if our hash lookup can find a potential backmerge.
326 	 */
327 	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
328 	if (__rq && elv_bio_merge_ok(__rq, bio)) {
329 		*req = __rq;
330 		return ELEVATOR_BACK_MERGE;
331 	}
332 
333 	if (e->type->ops.request_merge)
334 		return e->type->ops.request_merge(q, req, bio);
335 
336 	return ELEVATOR_NO_MERGE;
337 }
338 
339 /*
340  * Attempt to do an insertion back merge. Only check for the case where
341  * we can append 'rq' to an existing request, so we can throw 'rq' away
342  * afterwards.
343  *
344  * Returns true if we merged, false otherwise
345  */
346 bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
347 {
348 	struct request *__rq;
349 	bool ret;
350 
351 	if (blk_queue_nomerges(q))
352 		return false;
353 
354 	/*
355 	 * First try one-hit cache.
356 	 */
357 	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
358 		return true;
359 
360 	if (blk_queue_noxmerges(q))
361 		return false;
362 
363 	ret = false;
364 	/*
365 	 * See if our hash lookup can find a potential backmerge.
366 	 */
367 	while (1) {
368 		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
369 		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
370 			break;
371 
372 		/* The merged request could be merged with others, try again */
373 		ret = true;
374 		rq = __rq;
375 	}
376 
377 	return ret;
378 }
379 
380 void elv_merged_request(struct request_queue *q, struct request *rq,
381 		enum elv_merge type)
382 {
383 	struct elevator_queue *e = q->elevator;
384 
385 	if (e->type->ops.request_merged)
386 		e->type->ops.request_merged(q, rq, type);
387 
388 	if (type == ELEVATOR_BACK_MERGE)
389 		elv_rqhash_reposition(q, rq);
390 
391 	q->last_merge = rq;
392 }
393 
394 void elv_merge_requests(struct request_queue *q, struct request *rq,
395 			     struct request *next)
396 {
397 	struct elevator_queue *e = q->elevator;
398 
399 	if (e->type->ops.requests_merged)
400 		e->type->ops.requests_merged(q, rq, next);
401 
402 	elv_rqhash_reposition(q, rq);
403 	q->last_merge = rq;
404 }
405 
406 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
407 {
408 	struct elevator_queue *e = q->elevator;
409 
410 	if (e->type->ops.next_request)
411 		return e->type->ops.next_request(q, rq);
412 
413 	return NULL;
414 }
415 
416 struct request *elv_former_request(struct request_queue *q, struct request *rq)
417 {
418 	struct elevator_queue *e = q->elevator;
419 
420 	if (e->type->ops.former_request)
421 		return e->type->ops.former_request(q, rq);
422 
423 	return NULL;
424 }
425 
426 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
427 
428 static ssize_t
429 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
430 {
431 	struct elv_fs_entry *entry = to_elv(attr);
432 	struct elevator_queue *e;
433 	ssize_t error;
434 
435 	if (!entry->show)
436 		return -EIO;
437 
438 	e = container_of(kobj, struct elevator_queue, kobj);
439 	mutex_lock(&e->sysfs_lock);
440 	error = e->type ? entry->show(e, page) : -ENOENT;
441 	mutex_unlock(&e->sysfs_lock);
442 	return error;
443 }
444 
445 static ssize_t
446 elv_attr_store(struct kobject *kobj, struct attribute *attr,
447 	       const char *page, size_t length)
448 {
449 	struct elv_fs_entry *entry = to_elv(attr);
450 	struct elevator_queue *e;
451 	ssize_t error;
452 
453 	if (!entry->store)
454 		return -EIO;
455 
456 	e = container_of(kobj, struct elevator_queue, kobj);
457 	mutex_lock(&e->sysfs_lock);
458 	error = e->type ? entry->store(e, page, length) : -ENOENT;
459 	mutex_unlock(&e->sysfs_lock);
460 	return error;
461 }
462 
/* sysfs show/store dispatch for the attributes of an elevator kobject. */
static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};
467 
/*
 * kobject type used for every elevator_queue (see elevator_alloc()).
 * elevator_release() is its release callback and frees the queue.
 */
static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};
472 
473 int elv_register_queue(struct request_queue *q)
474 {
475 	struct elevator_queue *e = q->elevator;
476 	int error;
477 
478 	lockdep_assert_held(&q->sysfs_lock);
479 
480 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
481 	if (!error) {
482 		struct elv_fs_entry *attr = e->type->elevator_attrs;
483 		if (attr) {
484 			while (attr->attr.name) {
485 				if (sysfs_create_file(&e->kobj, &attr->attr))
486 					break;
487 				attr++;
488 			}
489 		}
490 		kobject_uevent(&e->kobj, KOBJ_ADD);
491 		e->registered = 1;
492 	}
493 	return error;
494 }
495 
496 void elv_unregister_queue(struct request_queue *q)
497 {
498 	lockdep_assert_held(&q->sysfs_lock);
499 
500 	if (q) {
501 		struct elevator_queue *e = q->elevator;
502 
503 		kobject_uevent(&e->kobj, KOBJ_REMOVE);
504 		kobject_del(&e->kobj);
505 		e->registered = 0;
506 		/* Re-enable throttling in case elevator disabled it */
507 		wbt_enable_default(q);
508 	}
509 }
510 
511 int elv_register(struct elevator_type *e)
512 {
513 	/* create icq_cache if requested */
514 	if (e->icq_size) {
515 		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
516 		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
517 			return -EINVAL;
518 
519 		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
520 			 "%s_io_cq", e->elevator_name);
521 		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
522 						 e->icq_align, 0, NULL);
523 		if (!e->icq_cache)
524 			return -ENOMEM;
525 	}
526 
527 	/* register, don't allow duplicate names */
528 	spin_lock(&elv_list_lock);
529 	if (elevator_find(e->elevator_name)) {
530 		spin_unlock(&elv_list_lock);
531 		kmem_cache_destroy(e->icq_cache);
532 		return -EBUSY;
533 	}
534 	list_add_tail(&e->list, &elv_list);
535 	spin_unlock(&elv_list_lock);
536 
537 	printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);
538 
539 	return 0;
540 }
541 EXPORT_SYMBOL_GPL(elv_register);
542 
543 void elv_unregister(struct elevator_type *e)
544 {
545 	/* unregister */
546 	spin_lock(&elv_list_lock);
547 	list_del_init(&e->list);
548 	spin_unlock(&elv_list_lock);
549 
550 	/*
551 	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
552 	 * sure all RCU operations are complete before proceeding.
553 	 */
554 	if (e->icq_cache) {
555 		rcu_barrier();
556 		kmem_cache_destroy(e->icq_cache);
557 		e->icq_cache = NULL;
558 	}
559 }
560 EXPORT_SYMBOL_GPL(elv_unregister);
561 
562 int elevator_switch_mq(struct request_queue *q,
563 			      struct elevator_type *new_e)
564 {
565 	int ret;
566 
567 	lockdep_assert_held(&q->sysfs_lock);
568 
569 	if (q->elevator) {
570 		if (q->elevator->registered)
571 			elv_unregister_queue(q);
572 		ioc_clear_queue(q);
573 		elevator_exit(q, q->elevator);
574 	}
575 
576 	ret = blk_mq_init_sched(q, new_e);
577 	if (ret)
578 		goto out;
579 
580 	if (new_e) {
581 		ret = elv_register_queue(q);
582 		if (ret) {
583 			elevator_exit(q, q->elevator);
584 			goto out;
585 		}
586 	}
587 
588 	if (new_e)
589 		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
590 	else
591 		blk_add_trace_msg(q, "elv switch: none");
592 
593 out:
594 	return ret;
595 }
596 
597 /*
598  * For blk-mq devices, we default to using mq-deadline, if available, for single
599  * queue devices.  If deadline isn't available OR we have multiple queues,
600  * default to "none".
601  */
602 int elevator_init_mq(struct request_queue *q)
603 {
604 	struct elevator_type *e;
605 	int err = 0;
606 
607 	if (q->nr_hw_queues != 1)
608 		return 0;
609 
610 	/*
611 	 * q->sysfs_lock must be held to provide mutual exclusion between
612 	 * elevator_switch() and here.
613 	 */
614 	mutex_lock(&q->sysfs_lock);
615 	if (unlikely(q->elevator))
616 		goto out_unlock;
617 
618 	e = elevator_get(q, "mq-deadline", false);
619 	if (!e)
620 		goto out_unlock;
621 
622 	err = blk_mq_init_sched(q, e);
623 	if (err)
624 		elevator_put(e);
625 out_unlock:
626 	mutex_unlock(&q->sysfs_lock);
627 	return err;
628 }
629 
630 
/*
 * Switch the queue to the new_e io scheduler (NULL selects "none").
 *
 * The queue is frozen and quiesced around the actual switch, which is
 * done by elevator_switch_mq().  Note that elevator_switch_mq() exits the
 * old scheduler before it initialises the new one, so there is no fallback
 * to the old scheduler if the new one fails to initialise.
 *
 * The caller must hold q->sysfs_lock.  Returns the result of
 * elevator_switch_mq().
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int err;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	err = elevator_switch_mq(q, new_e);

	/* Undo in the reverse order of the freeze/quiesce above. */
	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return err;
}
653 
654 /*
655  * Switch this queue to the given IO scheduler.
656  */
657 static int __elevator_change(struct request_queue *q, const char *name)
658 {
659 	char elevator_name[ELV_NAME_MAX];
660 	struct elevator_type *e;
661 
662 	/* Make sure queue is not in the middle of being removed */
663 	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
664 		return -ENOENT;
665 
666 	/*
667 	 * Special case for mq, turn off scheduling
668 	 */
669 	if (!strncmp(name, "none", 4)) {
670 		if (!q->elevator)
671 			return 0;
672 		return elevator_switch(q, NULL);
673 	}
674 
675 	strlcpy(elevator_name, name, sizeof(elevator_name));
676 	e = elevator_get(q, strstrip(elevator_name), true);
677 	if (!e)
678 		return -EINVAL;
679 
680 	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
681 		elevator_put(e);
682 		return 0;
683 	}
684 
685 	return elevator_switch(q, e);
686 }
687 
688 static inline bool elv_support_iosched(struct request_queue *q)
689 {
690 	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
691 		return false;
692 	return true;
693 }
694 
/*
 * Store handler for selecting the queue's scheduler by name.
 *
 * Writes are accepted and ignored (returning 'count') for queues that are
 * not blk-mq or do not support schedulers.  Otherwise 'count' is returned
 * on success and the error from __elevator_change() on failure.
 */
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int err;

	if (!queue_is_mq(q) || !elv_support_iosched(q))
		return count;

	err = __elevator_change(q, name);
	if (err)
		return err;

	return count;
}
709 
710 ssize_t elv_iosched_show(struct request_queue *q, char *name)
711 {
712 	struct elevator_queue *e = q->elevator;
713 	struct elevator_type *elv = NULL;
714 	struct elevator_type *__e;
715 	int len = 0;
716 
717 	if (!queue_is_mq(q))
718 		return sprintf(name, "none\n");
719 
720 	if (!q->elevator)
721 		len += sprintf(name+len, "[none] ");
722 	else
723 		elv = e->type;
724 
725 	spin_lock(&elv_list_lock);
726 	list_for_each_entry(__e, &elv_list, list) {
727 		if (elv && elevator_match(elv, __e->elevator_name)) {
728 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
729 			continue;
730 		}
731 		if (elv_support_iosched(q))
732 			len += sprintf(name+len, "%s ", __e->elevator_name);
733 	}
734 	spin_unlock(&elv_list_lock);
735 
736 	if (q->elevator)
737 		len += sprintf(name+len, "none");
738 
739 	len += sprintf(len+name, "\n");
740 	return len;
741 }
742 
743 struct request *elv_rb_former_request(struct request_queue *q,
744 				      struct request *rq)
745 {
746 	struct rb_node *rbprev = rb_prev(&rq->rb_node);
747 
748 	if (rbprev)
749 		return rb_entry_rq(rbprev);
750 
751 	return NULL;
752 }
753 EXPORT_SYMBOL(elv_rb_former_request);
754 
755 struct request *elv_rb_latter_request(struct request_queue *q,
756 				      struct request *rq)
757 {
758 	struct rb_node *rbnext = rb_next(&rq->rb_node);
759 
760 	if (rbnext)
761 		return rb_entry_rq(rbnext);
762 
763 	return NULL;
764 }
765 EXPORT_SYMBOL(elv_rb_latter_request);
766