xref: /openbmc/linux/block/elevator.c (revision 9a8f3203)
1 /*
2  *  Block device elevator/IO-scheduler.
3  *
4  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5  *
6  * 30042000 Jens Axboe <axboe@kernel.dk> :
7  *
8  * Split the elevator a bit so that it is possible to choose a different
9  * one or even write a new "plug in". There are three pieces:
10  * - elevator_fn, inserts a new request in the queue list
11  * - elevator_merge_fn, decides whether a new buffer can be merged with
12  *   an existing request
13  * - elevator_dequeue_fn, called when a request is taken off the active list
14  *
15  * 20082000 Dave Jones <davej@suse.de> :
16  * Removed tests for max-bomb-segments, which was breaking elvtune
17  *  when run without -bN
18  *
19  * Jens:
20  * - Rework again to work with bio instead of buffer_heads
21  * - loose bi_dev comparisons, partition handling is right now
22  * - completely modularize elevator setup and teardown
23  *
24  */
25 #include <linux/kernel.h>
26 #include <linux/fs.h>
27 #include <linux/blkdev.h>
28 #include <linux/elevator.h>
29 #include <linux/bio.h>
30 #include <linux/module.h>
31 #include <linux/slab.h>
32 #include <linux/init.h>
33 #include <linux/compiler.h>
34 #include <linux/blktrace_api.h>
35 #include <linux/hash.h>
36 #include <linux/uaccess.h>
37 #include <linux/pm_runtime.h>
38 #include <linux/blk-cgroup.h>
39 
40 #include <trace/events/block.h>
41 
42 #include "blk.h"
43 #include "blk-mq-sched.h"
44 #include "blk-pm.h"
45 #include "blk-wbt.h"
46 
47 static DEFINE_SPINLOCK(elv_list_lock);
48 static LIST_HEAD(elv_list);
49 
50 /*
51  * Merge hash stuff.
52  */
53 #define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
54 
55 /*
56  * Query io scheduler to see if the current process issuing bio may be
57  * merged with rq.
58  */
59 static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
60 {
61 	struct request_queue *q = rq->q;
62 	struct elevator_queue *e = q->elevator;
63 
64 	if (e->type->ops.allow_merge)
65 		return e->type->ops.allow_merge(q, rq, bio);
66 
67 	return 1;
68 }
69 
70 /*
71  * can we safely merge with this request?
72  */
73 bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
74 {
75 	if (!blk_rq_merge_ok(rq, bio))
76 		return false;
77 
78 	if (!elv_iosched_allow_bio_merge(rq, bio))
79 		return false;
80 
81 	return true;
82 }
83 EXPORT_SYMBOL(elv_bio_merge_ok);
84 
85 static bool elevator_match(const struct elevator_type *e, const char *name)
86 {
87 	if (!strcmp(e->elevator_name, name))
88 		return true;
89 	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
90 		return true;
91 
92 	return false;
93 }
94 
95 /*
96  * Return scheduler with name 'name'
97  */
98 static struct elevator_type *elevator_find(const char *name)
99 {
100 	struct elevator_type *e;
101 
102 	list_for_each_entry(e, &elv_list, list) {
103 		if (elevator_match(e, name))
104 			return e;
105 	}
106 
107 	return NULL;
108 }
109 
110 static void elevator_put(struct elevator_type *e)
111 {
112 	module_put(e->elevator_owner);
113 }
114 
115 static struct elevator_type *elevator_get(struct request_queue *q,
116 					  const char *name, bool try_loading)
117 {
118 	struct elevator_type *e;
119 
120 	spin_lock(&elv_list_lock);
121 
122 	e = elevator_find(name);
123 	if (!e && try_loading) {
124 		spin_unlock(&elv_list_lock);
125 		request_module("%s-iosched", name);
126 		spin_lock(&elv_list_lock);
127 		e = elevator_find(name);
128 	}
129 
130 	if (e && !try_module_get(e->elevator_owner))
131 		e = NULL;
132 
133 	spin_unlock(&elv_list_lock);
134 	return e;
135 }
136 
137 static char chosen_elevator[ELV_NAME_MAX];
138 
139 static int __init elevator_setup(char *str)
140 {
141 	/*
142 	 * Be backwards-compatible with previous kernels, so users
143 	 * won't get the wrong elevator.
144 	 */
145 	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
146 	return 1;
147 }
148 
149 __setup("elevator=", elevator_setup);
150 
151 static struct kobj_type elv_ktype;
152 
153 struct elevator_queue *elevator_alloc(struct request_queue *q,
154 				  struct elevator_type *e)
155 {
156 	struct elevator_queue *eq;
157 
158 	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
159 	if (unlikely(!eq))
160 		return NULL;
161 
162 	eq->type = e;
163 	kobject_init(&eq->kobj, &elv_ktype);
164 	mutex_init(&eq->sysfs_lock);
165 	hash_init(eq->hash);
166 
167 	return eq;
168 }
169 EXPORT_SYMBOL(elevator_alloc);
170 
171 static void elevator_release(struct kobject *kobj)
172 {
173 	struct elevator_queue *e;
174 
175 	e = container_of(kobj, struct elevator_queue, kobj);
176 	elevator_put(e->type);
177 	kfree(e);
178 }
179 
180 void elevator_exit(struct request_queue *q, struct elevator_queue *e)
181 {
182 	mutex_lock(&e->sysfs_lock);
183 	if (e->type->ops.exit_sched)
184 		blk_mq_exit_sched(q, e);
185 	mutex_unlock(&e->sysfs_lock);
186 
187 	kobject_put(&e->kobj);
188 }
189 
190 static inline void __elv_rqhash_del(struct request *rq)
191 {
192 	hash_del(&rq->hash);
193 	rq->rq_flags &= ~RQF_HASHED;
194 }
195 
196 void elv_rqhash_del(struct request_queue *q, struct request *rq)
197 {
198 	if (ELV_ON_HASH(rq))
199 		__elv_rqhash_del(rq);
200 }
201 EXPORT_SYMBOL_GPL(elv_rqhash_del);
202 
203 void elv_rqhash_add(struct request_queue *q, struct request *rq)
204 {
205 	struct elevator_queue *e = q->elevator;
206 
207 	BUG_ON(ELV_ON_HASH(rq));
208 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
209 	rq->rq_flags |= RQF_HASHED;
210 }
211 EXPORT_SYMBOL_GPL(elv_rqhash_add);
212 
213 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
214 {
215 	__elv_rqhash_del(rq);
216 	elv_rqhash_add(q, rq);
217 }
218 
219 struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
220 {
221 	struct elevator_queue *e = q->elevator;
222 	struct hlist_node *next;
223 	struct request *rq;
224 
225 	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
226 		BUG_ON(!ELV_ON_HASH(rq));
227 
228 		if (unlikely(!rq_mergeable(rq))) {
229 			__elv_rqhash_del(rq);
230 			continue;
231 		}
232 
233 		if (rq_hash_key(rq) == offset)
234 			return rq;
235 	}
236 
237 	return NULL;
238 }
239 
240 /*
241  * RB-tree support functions for inserting/lookup/removal of requests
242  * in a sorted RB tree.
243  */
244 void elv_rb_add(struct rb_root *root, struct request *rq)
245 {
246 	struct rb_node **p = &root->rb_node;
247 	struct rb_node *parent = NULL;
248 	struct request *__rq;
249 
250 	while (*p) {
251 		parent = *p;
252 		__rq = rb_entry(parent, struct request, rb_node);
253 
254 		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
255 			p = &(*p)->rb_left;
256 		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
257 			p = &(*p)->rb_right;
258 	}
259 
260 	rb_link_node(&rq->rb_node, parent, p);
261 	rb_insert_color(&rq->rb_node, root);
262 }
263 EXPORT_SYMBOL(elv_rb_add);
264 
265 void elv_rb_del(struct rb_root *root, struct request *rq)
266 {
267 	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
268 	rb_erase(&rq->rb_node, root);
269 	RB_CLEAR_NODE(&rq->rb_node);
270 }
271 EXPORT_SYMBOL(elv_rb_del);
272 
273 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
274 {
275 	struct rb_node *n = root->rb_node;
276 	struct request *rq;
277 
278 	while (n) {
279 		rq = rb_entry(n, struct request, rb_node);
280 
281 		if (sector < blk_rq_pos(rq))
282 			n = n->rb_left;
283 		else if (sector > blk_rq_pos(rq))
284 			n = n->rb_right;
285 		else
286 			return rq;
287 	}
288 
289 	return NULL;
290 }
291 EXPORT_SYMBOL(elv_rb_find);
292 
293 enum elv_merge elv_merge(struct request_queue *q, struct request **req,
294 		struct bio *bio)
295 {
296 	struct elevator_queue *e = q->elevator;
297 	struct request *__rq;
298 
299 	/*
300 	 * Levels of merges:
301 	 * 	nomerges:  No merges at all attempted
302 	 * 	noxmerges: Only simple one-hit cache try
303 	 * 	merges:	   All merge tries attempted
304 	 */
305 	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
306 		return ELEVATOR_NO_MERGE;
307 
308 	/*
309 	 * First try one-hit cache.
310 	 */
311 	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
312 		enum elv_merge ret = blk_try_merge(q->last_merge, bio);
313 
314 		if (ret != ELEVATOR_NO_MERGE) {
315 			*req = q->last_merge;
316 			return ret;
317 		}
318 	}
319 
320 	if (blk_queue_noxmerges(q))
321 		return ELEVATOR_NO_MERGE;
322 
323 	/*
324 	 * See if our hash lookup can find a potential backmerge.
325 	 */
326 	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
327 	if (__rq && elv_bio_merge_ok(__rq, bio)) {
328 		*req = __rq;
329 		return ELEVATOR_BACK_MERGE;
330 	}
331 
332 	if (e->type->ops.request_merge)
333 		return e->type->ops.request_merge(q, req, bio);
334 
335 	return ELEVATOR_NO_MERGE;
336 }
337 
338 /*
339  * Attempt to do an insertion back merge. Only check for the case where
340  * we can append 'rq' to an existing request, so we can throw 'rq' away
341  * afterwards.
342  *
343  * Returns true if we merged, false otherwise
344  */
345 bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
346 {
347 	struct request *__rq;
348 	bool ret;
349 
350 	if (blk_queue_nomerges(q))
351 		return false;
352 
353 	/*
354 	 * First try one-hit cache.
355 	 */
356 	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
357 		return true;
358 
359 	if (blk_queue_noxmerges(q))
360 		return false;
361 
362 	ret = false;
363 	/*
364 	 * See if our hash lookup can find a potential backmerge.
365 	 */
366 	while (1) {
367 		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
368 		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
369 			break;
370 
371 		/* The merged request could be merged with others, try again */
372 		ret = true;
373 		rq = __rq;
374 	}
375 
376 	return ret;
377 }
378 
379 void elv_merged_request(struct request_queue *q, struct request *rq,
380 		enum elv_merge type)
381 {
382 	struct elevator_queue *e = q->elevator;
383 
384 	if (e->type->ops.request_merged)
385 		e->type->ops.request_merged(q, rq, type);
386 
387 	if (type == ELEVATOR_BACK_MERGE)
388 		elv_rqhash_reposition(q, rq);
389 
390 	q->last_merge = rq;
391 }
392 
393 void elv_merge_requests(struct request_queue *q, struct request *rq,
394 			     struct request *next)
395 {
396 	struct elevator_queue *e = q->elevator;
397 
398 	if (e->type->ops.requests_merged)
399 		e->type->ops.requests_merged(q, rq, next);
400 
401 	elv_rqhash_reposition(q, rq);
402 	q->last_merge = rq;
403 }
404 
405 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
406 {
407 	struct elevator_queue *e = q->elevator;
408 
409 	if (e->type->ops.next_request)
410 		return e->type->ops.next_request(q, rq);
411 
412 	return NULL;
413 }
414 
415 struct request *elv_former_request(struct request_queue *q, struct request *rq)
416 {
417 	struct elevator_queue *e = q->elevator;
418 
419 	if (e->type->ops.former_request)
420 		return e->type->ops.former_request(q, rq);
421 
422 	return NULL;
423 }
424 
425 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
426 
427 static ssize_t
428 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
429 {
430 	struct elv_fs_entry *entry = to_elv(attr);
431 	struct elevator_queue *e;
432 	ssize_t error;
433 
434 	if (!entry->show)
435 		return -EIO;
436 
437 	e = container_of(kobj, struct elevator_queue, kobj);
438 	mutex_lock(&e->sysfs_lock);
439 	error = e->type ? entry->show(e, page) : -ENOENT;
440 	mutex_unlock(&e->sysfs_lock);
441 	return error;
442 }
443 
444 static ssize_t
445 elv_attr_store(struct kobject *kobj, struct attribute *attr,
446 	       const char *page, size_t length)
447 {
448 	struct elv_fs_entry *entry = to_elv(attr);
449 	struct elevator_queue *e;
450 	ssize_t error;
451 
452 	if (!entry->store)
453 		return -EIO;
454 
455 	e = container_of(kobj, struct elevator_queue, kobj);
456 	mutex_lock(&e->sysfs_lock);
457 	error = e->type ? entry->store(e, page, length) : -ENOENT;
458 	mutex_unlock(&e->sysfs_lock);
459 	return error;
460 }
461 
462 static const struct sysfs_ops elv_sysfs_ops = {
463 	.show	= elv_attr_show,
464 	.store	= elv_attr_store,
465 };
466 
467 static struct kobj_type elv_ktype = {
468 	.sysfs_ops	= &elv_sysfs_ops,
469 	.release	= elevator_release,
470 };
471 
472 int elv_register_queue(struct request_queue *q)
473 {
474 	struct elevator_queue *e = q->elevator;
475 	int error;
476 
477 	lockdep_assert_held(&q->sysfs_lock);
478 
479 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
480 	if (!error) {
481 		struct elv_fs_entry *attr = e->type->elevator_attrs;
482 		if (attr) {
483 			while (attr->attr.name) {
484 				if (sysfs_create_file(&e->kobj, &attr->attr))
485 					break;
486 				attr++;
487 			}
488 		}
489 		kobject_uevent(&e->kobj, KOBJ_ADD);
490 		e->registered = 1;
491 	}
492 	return error;
493 }
494 
495 void elv_unregister_queue(struct request_queue *q)
496 {
497 	lockdep_assert_held(&q->sysfs_lock);
498 
499 	if (q) {
500 		struct elevator_queue *e = q->elevator;
501 
502 		kobject_uevent(&e->kobj, KOBJ_REMOVE);
503 		kobject_del(&e->kobj);
504 		e->registered = 0;
505 		/* Re-enable throttling in case elevator disabled it */
506 		wbt_enable_default(q);
507 	}
508 }
509 
510 int elv_register(struct elevator_type *e)
511 {
512 	char *def = "";
513 
514 	/* create icq_cache if requested */
515 	if (e->icq_size) {
516 		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
517 		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
518 			return -EINVAL;
519 
520 		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
521 			 "%s_io_cq", e->elevator_name);
522 		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
523 						 e->icq_align, 0, NULL);
524 		if (!e->icq_cache)
525 			return -ENOMEM;
526 	}
527 
528 	/* register, don't allow duplicate names */
529 	spin_lock(&elv_list_lock);
530 	if (elevator_find(e->elevator_name)) {
531 		spin_unlock(&elv_list_lock);
532 		kmem_cache_destroy(e->icq_cache);
533 		return -EBUSY;
534 	}
535 	list_add_tail(&e->list, &elv_list);
536 	spin_unlock(&elv_list_lock);
537 
538 	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
539 								def);
540 	return 0;
541 }
542 EXPORT_SYMBOL_GPL(elv_register);
543 
544 void elv_unregister(struct elevator_type *e)
545 {
546 	/* unregister */
547 	spin_lock(&elv_list_lock);
548 	list_del_init(&e->list);
549 	spin_unlock(&elv_list_lock);
550 
551 	/*
552 	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
553 	 * sure all RCU operations are complete before proceeding.
554 	 */
555 	if (e->icq_cache) {
556 		rcu_barrier();
557 		kmem_cache_destroy(e->icq_cache);
558 		e->icq_cache = NULL;
559 	}
560 }
561 EXPORT_SYMBOL_GPL(elv_unregister);
562 
563 int elevator_switch_mq(struct request_queue *q,
564 			      struct elevator_type *new_e)
565 {
566 	int ret;
567 
568 	lockdep_assert_held(&q->sysfs_lock);
569 
570 	if (q->elevator) {
571 		if (q->elevator->registered)
572 			elv_unregister_queue(q);
573 		ioc_clear_queue(q);
574 		elevator_exit(q, q->elevator);
575 	}
576 
577 	ret = blk_mq_init_sched(q, new_e);
578 	if (ret)
579 		goto out;
580 
581 	if (new_e) {
582 		ret = elv_register_queue(q);
583 		if (ret) {
584 			elevator_exit(q, q->elevator);
585 			goto out;
586 		}
587 	}
588 
589 	if (new_e)
590 		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
591 	else
592 		blk_add_trace_msg(q, "elv switch: none");
593 
594 out:
595 	return ret;
596 }
597 
598 /*
599  * For blk-mq devices, we default to using mq-deadline, if available, for single
600  * queue devices.  If deadline isn't available OR we have multiple queues,
601  * default to "none".
602  */
603 int elevator_init_mq(struct request_queue *q)
604 {
605 	struct elevator_type *e;
606 	int err = 0;
607 
608 	if (q->nr_hw_queues != 1)
609 		return 0;
610 
611 	/*
612 	 * q->sysfs_lock must be held to provide mutual exclusion between
613 	 * elevator_switch() and here.
614 	 */
615 	mutex_lock(&q->sysfs_lock);
616 	if (unlikely(q->elevator))
617 		goto out_unlock;
618 
619 	e = elevator_get(q, "mq-deadline", false);
620 	if (!e)
621 		goto out_unlock;
622 
623 	err = blk_mq_init_sched(q, e);
624 	if (err)
625 		elevator_put(e);
626 out_unlock:
627 	mutex_unlock(&q->sysfs_lock);
628 	return err;
629 }
630 
631 
632 /*
633  * switch to new_e io scheduler. be careful not to introduce deadlocks -
634  * we don't free the old io scheduler, before we have allocated what we
635  * need for the new one. this way we have a chance of going back to the old
636  * one, if the new one fails init for some reason.
637  */
638 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
639 {
640 	int err;
641 
642 	lockdep_assert_held(&q->sysfs_lock);
643 
644 	blk_mq_freeze_queue(q);
645 	blk_mq_quiesce_queue(q);
646 
647 	err = elevator_switch_mq(q, new_e);
648 
649 	blk_mq_unquiesce_queue(q);
650 	blk_mq_unfreeze_queue(q);
651 
652 	return err;
653 }
654 
655 /*
656  * Switch this queue to the given IO scheduler.
657  */
658 static int __elevator_change(struct request_queue *q, const char *name)
659 {
660 	char elevator_name[ELV_NAME_MAX];
661 	struct elevator_type *e;
662 
663 	/* Make sure queue is not in the middle of being removed */
664 	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
665 		return -ENOENT;
666 
667 	/*
668 	 * Special case for mq, turn off scheduling
669 	 */
670 	if (!strncmp(name, "none", 4)) {
671 		if (!q->elevator)
672 			return 0;
673 		return elevator_switch(q, NULL);
674 	}
675 
676 	strlcpy(elevator_name, name, sizeof(elevator_name));
677 	e = elevator_get(q, strstrip(elevator_name), true);
678 	if (!e)
679 		return -EINVAL;
680 
681 	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
682 		elevator_put(e);
683 		return 0;
684 	}
685 
686 	return elevator_switch(q, e);
687 }
688 
689 static inline bool elv_support_iosched(struct request_queue *q)
690 {
691 	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
692 		return false;
693 	return true;
694 }
695 
696 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
697 			  size_t count)
698 {
699 	int ret;
700 
701 	if (!queue_is_mq(q) || !elv_support_iosched(q))
702 		return count;
703 
704 	ret = __elevator_change(q, name);
705 	if (!ret)
706 		return count;
707 
708 	return ret;
709 }
710 
711 ssize_t elv_iosched_show(struct request_queue *q, char *name)
712 {
713 	struct elevator_queue *e = q->elevator;
714 	struct elevator_type *elv = NULL;
715 	struct elevator_type *__e;
716 	int len = 0;
717 
718 	if (!queue_is_mq(q))
719 		return sprintf(name, "none\n");
720 
721 	if (!q->elevator)
722 		len += sprintf(name+len, "[none] ");
723 	else
724 		elv = e->type;
725 
726 	spin_lock(&elv_list_lock);
727 	list_for_each_entry(__e, &elv_list, list) {
728 		if (elv && elevator_match(elv, __e->elevator_name)) {
729 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
730 			continue;
731 		}
732 		if (elv_support_iosched(q))
733 			len += sprintf(name+len, "%s ", __e->elevator_name);
734 	}
735 	spin_unlock(&elv_list_lock);
736 
737 	if (q->elevator)
738 		len += sprintf(name+len, "none");
739 
740 	len += sprintf(len+name, "\n");
741 	return len;
742 }
743 
744 struct request *elv_rb_former_request(struct request_queue *q,
745 				      struct request *rq)
746 {
747 	struct rb_node *rbprev = rb_prev(&rq->rb_node);
748 
749 	if (rbprev)
750 		return rb_entry_rq(rbprev);
751 
752 	return NULL;
753 }
754 EXPORT_SYMBOL(elv_rb_former_request);
755 
756 struct request *elv_rb_latter_request(struct request_queue *q,
757 				      struct request *rq)
758 {
759 	struct rb_node *rbnext = rb_next(&rq->rb_node);
760 
761 	if (rbnext)
762 		return rb_entry_rq(rbnext);
763 
764 	return NULL;
765 }
766 EXPORT_SYMBOL(elv_rb_latter_request);
767