Lines matching +full:entry +full:-latency in block/kyber-iosched.c (the Kyber I/O scheduler)

1 // SPDX-License-Identifier: GPL-2.0
3 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
18 #include "blk-mq.h"
19 #include "blk-mq-debugfs.h"
20 #include "blk-mq-sched.h"
54 * Maximum device-wide depth for each scheduling domain.
68 * Default latency targets for each scheduling domain.
89 * to the target latency:
91 * <= 1/4 * target latency
92 * <= 1/2 * target latency
93 * <= 3/4 * target latency
94 * <= target latency
95 * <= 1 1/4 * target latency
96 * <= 1 1/2 * target latency
97 * <= 1 3/4 * target latency
98 * > 1 3/4 * target latency
102 * The width of the latency histogram buckets is
103 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
107 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
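
As a worked example of the bucketing above (a standalone userspace sketch, not kernel code): with KYBER_LATENCY_SHIFT = 2 there are eight buckets, each target/4 wide, and the mapping below mirrors the divisor/bucket arithmetic visible in add_latency_sample() further down this listing. The 2 ms target and the sample latencies are illustrative values only.

    #include <stdio.h>
    #include <stdint.h>

    #define KYBER_LATENCY_SHIFT   2
    #define KYBER_LATENCY_BUCKETS (2 << KYBER_LATENCY_SHIFT)   /* 8 buckets */

    /* Map a completion latency onto a histogram bucket, target/4 per bucket. */
    static unsigned int latency_bucket(uint64_t target_ns, uint64_t latency_ns)
    {
            uint64_t divisor = target_ns >> KYBER_LATENCY_SHIFT; /* bucket width */
            uint64_t bucket;

            if (divisor == 0)
                    divisor = 1;
            if (latency_ns == 0)
                    return 0;
            bucket = (latency_ns - 1) / divisor;
            return bucket < KYBER_LATENCY_BUCKETS - 1 ? bucket
                                                      : KYBER_LATENCY_BUCKETS - 1;
    }

    int main(void)
    {
            /* 2 ms target: 1.2 ms -> bucket 2 ("<= 3/4 * target"),
             * 5 ms -> bucket 7 ("> 1 3/4 * target"). */
            printf("%u %u\n", latency_bucket(2000000, 1200000),
                              latency_bucket(2000000, 5000000));
            return 0;
    }
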
116 * We measure both the total latency and the I/O latency (i.e., latency after
130 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
139 * we use request->mq_ctx->index_hw to index the kcq in khd.
155 * Each scheduling domain has a limited number of in-flight requests
156 * device-wide, limited by these tokens.
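
The token pool behind this comment is one sbitmap_queue per scheduling domain. A condensed sketch of its life cycle, stitched together from the calls that appear in this listing; __sbitmap_queue_get() is the allocation side in the mainline file but is not among the matched lines, so treat that part as an assumption. This is an outline, not compilable on its own.

    /* setup (kyber_queue_data_alloc): one pool per domain, sized by kyber_depth[] */
    sbitmap_queue_init_node(&kqd->domain_tokens[i], kyber_depth[i], -1, false,
                            GFP_KERNEL, q->node);

    /* dispatch (kyber_get_domain_token): take a token, or get -1 and go wait */
    nr = __sbitmap_queue_get(&kqd->domain_tokens[sched_domain]);

    /* completion (rq_clear_domain_token): return the token, waking any waiter */
    sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, rq->mq_ctx->cpu);

    /* timer (kyber_resize_domain): grow or shrink the pool from the latency stats */
    sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
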
161 * Async request percentage, converted to per-word depth for
214 unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; in flush_latency_buckets()
215 atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; in flush_latency_buckets()
223 * Calculate the histogram bucket with the given percentile rank, or -1 if there
230 unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; in calculate_percentile()
237 return -1; in calculate_percentile()
243 if (!kqd->latency_timeout[sched_domain]) in calculate_percentile()
244 kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); in calculate_percentile()
246 time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { in calculate_percentile()
247 return -1; in calculate_percentile()
249 kqd->latency_timeout[sched_domain] = 0; in calculate_percentile()
252 for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { in calculate_percentile()
255 percentile_samples -= buckets[bucket]; in calculate_percentile()
257 memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); in calculate_percentile()
259 trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], in calculate_percentile()
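
A self-contained sketch of the percentile walk whose fragments appear above: sum the bucket counts, take ceil(samples * percentile / 100) as the kernel's DIV_ROUND_UP does, and return the first bucket by which that many samples have accumulated. The bucket counts in the closing comment are made up for illustration.

    /* Returns the bucket index holding the Nth-percentile sample, or -1 if the
     * histogram is empty. Mirrors the loop in calculate_percentile() above,
     * minus the sample-count/timeout gating and the stats reset. */
    static int percentile_bucket(const unsigned int *buckets,
                                 unsigned int nr_buckets, unsigned int percentile)
    {
            unsigned int bucket, samples = 0, needed;

            for (bucket = 0; bucket < nr_buckets; bucket++)
                    samples += buckets[bucket];
            if (!samples)
                    return -1;

            needed = (samples * percentile + 99) / 100;     /* DIV_ROUND_UP */
            for (bucket = 0; bucket < nr_buckets - 1; bucket++) {
                    if (buckets[bucket] >= needed)
                            break;
                    needed -= buckets[bucket];
            }
            return bucket;
    }

    /* e.g. counts {10, 20, 40, 20, 5, 3, 1, 1}: the 90th percentile lands in
     * bucket 3 ("<= target"), the 99th in bucket 6 ("<= 1 3/4 * target"). */
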
270 if (depth != kqd->domain_tokens[sched_domain].sb.depth) { in kyber_resize_domain()
271 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); in kyber_resize_domain()
272 trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], in kyber_resize_domain()
284 /* Sum all of the per-cpu latency histograms. */ in kyber_timer_fn()
288 cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); in kyber_timer_fn()
298 * Check if any domains have a high I/O latency, which might indicate in kyber_timer_fn()
324 * necessarily have enough samples to calculate the latency in kyber_timer_fn()
328 * reset it to -1. in kyber_timer_fn()
332 p99 = kqd->domain_p99[sched_domain]; in kyber_timer_fn()
333 kqd->domain_p99[sched_domain] = -1; in kyber_timer_fn()
335 kqd->domain_p99[sched_domain] = p99; in kyber_timer_fn()
341 * If this domain has bad latency, throttle less. Otherwise, in kyber_timer_fn()
344 * The new depth is scaled linearly with the p99 latency vs the in kyber_timer_fn()
345 * latency target. E.g., if the p99 is 3/4 of the target, then in kyber_timer_fn()
350 orig_depth = kqd->domain_tokens[sched_domain].sb.depth; in kyber_timer_fn()
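
The comment above maps the p99 bucket index straight onto a new depth. In the mainline file the depth is then computed as (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT; that line is not among the matched lines, so treat the exact formula here as an assumption. A worked sketch of the arithmetic:

    #define KYBER_LATENCY_SHIFT 2

    /* p99 is the bucket index from the percentile calculation: bucket 2 means
     * the p99 was <= 3/4 of the target, bucket 7 means it was > 1 3/4 of it. */
    static unsigned int scale_depth(unsigned int orig_depth, unsigned int p99)
    {
            return (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
    }

    /* scale_depth(64, 2) == 48  (p99 at 3/4 of target -> 3/4 of the depth)
     * scale_depth(64, 7) == 128 (p99 well past target -> double the depth) */
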
360 int ret = -ENOMEM; in kyber_queue_data_alloc()
363 kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); in kyber_queue_data_alloc()
367 kqd->q = q; in kyber_queue_data_alloc()
368 kqd->dev = disk_devt(q->disk); in kyber_queue_data_alloc()
370 kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, in kyber_queue_data_alloc()
372 if (!kqd->cpu_latency) in kyber_queue_data_alloc()
375 timer_setup(&kqd->timer, kyber_timer_fn, 0); in kyber_queue_data_alloc()
380 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], in kyber_queue_data_alloc()
381 kyber_depth[i], -1, false, in kyber_queue_data_alloc()
382 GFP_KERNEL, q->node); in kyber_queue_data_alloc()
384 while (--i >= 0) in kyber_queue_data_alloc()
385 sbitmap_queue_free(&kqd->domain_tokens[i]); in kyber_queue_data_alloc()
391 kqd->domain_p99[i] = -1; in kyber_queue_data_alloc()
392 kqd->latency_targets[i] = kyber_latency_targets[i]; in kyber_queue_data_alloc()
398 free_percpu(kqd->cpu_latency); in kyber_queue_data_alloc()
412 return -ENOMEM; in kyber_init_sched()
416 kobject_put(&eq->kobj); in kyber_init_sched()
424 eq->elevator_data = kqd; in kyber_init_sched()
425 q->elevator = eq; in kyber_init_sched()
432 struct kyber_queue_data *kqd = e->elevator_data; in kyber_exit_sched()
435 timer_shutdown_sync(&kqd->timer); in kyber_exit_sched()
436 blk_stat_disable_accounting(kqd->q); in kyber_exit_sched()
439 sbitmap_queue_free(&kqd->domain_tokens[i]); in kyber_exit_sched()
440 free_percpu(kqd->cpu_latency); in kyber_exit_sched()
448 spin_lock_init(&kcq->lock); in kyber_ctx_queue_init()
450 INIT_LIST_HEAD(&kcq->rq_list[i]); in kyber_ctx_queue_init()
455 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; in kyber_depth_updated()
456 struct blk_mq_tags *tags = hctx->sched_tags; in kyber_depth_updated()
457 unsigned int shift = tags->bitmap_tags.sb.shift; in kyber_depth_updated()
459 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; in kyber_depth_updated()
461 sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); in kyber_depth_updated()
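
Worked numbers for the async_depth line above. The 75 for KYBER_ASYNC_PERCENT is its value in the mainline source, quoted here as an assumption since the #define is not among the matched lines: with fully used 64-bit sbitmap words, shift is 6, so async requests may occupy at most 48 of the 64 tags in each word while sync requests can still take all of them.

    #define KYBER_ASYNC_PERCENT 75  /* mainline value, assumed here */

    static unsigned int kyber_async_depth(unsigned int shift)
    {
            return (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
    }

    /* kyber_async_depth(6) == 48: at most 48 async tags per 64-tag sbitmap word,
     * which is the value sbitmap_queue_min_shallow_depth() above registers as
     * the smallest shallow depth callers will pass. */
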
469 khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); in kyber_init_hctx()
471 return -ENOMEM; in kyber_init_hctx()
473 khd->kcqs = kmalloc_array_node(hctx->nr_ctx, in kyber_init_hctx()
475 GFP_KERNEL, hctx->numa_node); in kyber_init_hctx()
476 if (!khd->kcqs) in kyber_init_hctx()
479 for (i = 0; i < hctx->nr_ctx; i++) in kyber_init_hctx()
480 kyber_ctx_queue_init(&khd->kcqs[i]); in kyber_init_hctx()
483 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx, in kyber_init_hctx()
484 ilog2(8), GFP_KERNEL, hctx->numa_node, in kyber_init_hctx()
486 while (--i >= 0) in kyber_init_hctx()
487 sbitmap_free(&khd->kcq_map[i]); in kyber_init_hctx()
492 spin_lock_init(&khd->lock); in kyber_init_hctx()
495 INIT_LIST_HEAD(&khd->rqs[i]); in kyber_init_hctx()
496 khd->domain_wait[i].sbq = NULL; in kyber_init_hctx()
497 init_waitqueue_func_entry(&khd->domain_wait[i].wait, in kyber_init_hctx()
499 khd->domain_wait[i].wait.private = hctx; in kyber_init_hctx()
500 INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); in kyber_init_hctx()
501 atomic_set(&khd->wait_index[i], 0); in kyber_init_hctx()
504 khd->cur_domain = 0; in kyber_init_hctx()
505 khd->batching = 0; in kyber_init_hctx()
507 hctx->sched_data = khd; in kyber_init_hctx()
513 kfree(khd->kcqs); in kyber_init_hctx()
516 return -ENOMEM; in kyber_init_hctx()
521 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_exit_hctx()
525 sbitmap_free(&khd->kcq_map[i]); in kyber_exit_hctx()
526 kfree(khd->kcqs); in kyber_exit_hctx()
527 kfree(hctx->sched_data); in kyber_exit_hctx()
532 return (long)rq->elv.priv[0]; in rq_get_domain_token()
537 rq->elv.priv[0] = (void *)(long)token; in rq_set_domain_token()
547 if (nr != -1) { in rq_clear_domain_token()
548 sched_domain = kyber_sched_domain(rq->cmd_flags); in rq_clear_domain_token()
549 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, in rq_clear_domain_token()
550 rq->mq_ctx->cpu); in rq_clear_domain_token()
557 * We use the scheduler tags as per-hardware queue queueing tokens. in kyber_limit_depth()
561 struct kyber_queue_data *kqd = data->q->elevator->elevator_data; in kyber_limit_depth()
563 data->shallow_depth = kqd->async_depth; in kyber_limit_depth()
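
For context, the guard around this assignment is not among the matched lines; in mainline only requests that are not synchronous get the shallow depth, roughly as in this reconstruction (treat the exact signature as an assumption; it follows recent kernels, where the op flags are a blk_opf_t):

    static void kyber_limit_depth_sketch(blk_opf_t opf,
                                         struct blk_mq_alloc_data *data)
    {
            /* Sync requests keep the full scheduler tag depth; async ones are
             * capped at kqd->async_depth computed in kyber_depth_updated(). */
            if (!op_is_sync(opf)) {
                    struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

                    data->shallow_depth = kqd->async_depth;
            }
    }
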
571 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); in kyber_bio_merge()
572 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_bio_merge()
573 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; in kyber_bio_merge()
574 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); in kyber_bio_merge()
575 struct list_head *rq_list = &kcq->rq_list[sched_domain]; in kyber_bio_merge()
578 spin_lock(&kcq->lock); in kyber_bio_merge()
579 merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); in kyber_bio_merge()
580 spin_unlock(&kcq->lock); in kyber_bio_merge()
587 rq_set_domain_token(rq, -1); in kyber_prepare_request()
594 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_insert_requests()
598 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); in kyber_insert_requests()
599 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; in kyber_insert_requests()
600 struct list_head *head = &kcq->rq_list[sched_domain]; in kyber_insert_requests()
602 spin_lock(&kcq->lock); in kyber_insert_requests()
605 list_move(&rq->queuelist, head); in kyber_insert_requests()
607 list_move_tail(&rq->queuelist, head); in kyber_insert_requests()
608 sbitmap_set_bit(&khd->kcq_map[sched_domain], in kyber_insert_requests()
609 rq->mq_ctx->index_hw[hctx->type]); in kyber_insert_requests()
610 spin_unlock(&kcq->lock); in kyber_insert_requests()
616 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; in kyber_finish_request()
623 u64 target, u64 latency) in add_latency_sample() argument
628 if (latency > 0) { in add_latency_sample()
630 bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), in add_latency_sample()
631 KYBER_LATENCY_BUCKETS - 1); in add_latency_sample()
636 atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); in add_latency_sample()
641 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; in kyber_completed_request()
646 sched_domain = kyber_sched_domain(rq->cmd_flags); in kyber_completed_request()
650 cpu_latency = get_cpu_ptr(kqd->cpu_latency); in kyber_completed_request()
651 target = kqd->latency_targets[sched_domain]; in kyber_completed_request()
653 target, now - rq->start_time_ns); in kyber_completed_request()
655 now - rq->io_start_time_ns); in kyber_completed_request()
656 put_cpu_ptr(kqd->cpu_latency); in kyber_completed_request()
658 timer_reduce(&kqd->timer, jiffies + HZ / 10); in kyber_completed_request()
670 struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr]; in flush_busy_kcq()
672 spin_lock(&kcq->lock); in flush_busy_kcq()
673 list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain], in flush_busy_kcq()
674 flush_data->list); in flush_busy_kcq()
676 spin_unlock(&kcq->lock); in flush_busy_kcq()
691 sbitmap_for_each_set(&khd->kcq_map[sched_domain], in kyber_flush_busy_kcqs()
698 struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); in kyber_domain_wake()
710 unsigned int sched_domain = khd->cur_domain; in kyber_get_domain_token()
711 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; in kyber_get_domain_token()
712 struct sbq_wait *wait = &khd->domain_wait[sched_domain]; in kyber_get_domain_token()
721 * khd->lock, but we still need to be careful about the waker. in kyber_get_domain_token()
723 if (nr < 0 && list_empty_careful(&wait->wait.entry)) { in kyber_get_domain_token()
725 &khd->wait_index[sched_domain]); in kyber_get_domain_token()
726 khd->domain_ws[sched_domain] = ws; in kyber_get_domain_token()
739 * progress. It's possible that the waker already deleted the entry in kyber_get_domain_token()
743 if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { in kyber_get_domain_token()
744 ws = khd->domain_ws[sched_domain]; in kyber_get_domain_token()
745 spin_lock_irq(&ws->wait.lock); in kyber_get_domain_token()
747 spin_unlock_irq(&ws->wait.lock); in kyber_get_domain_token()
762 rqs = &khd->rqs[khd->cur_domain]; in kyber_dispatch_cur_domain()
769 * khd->lock serializes the flushes, so if we observed any bit set in in kyber_dispatch_cur_domain()
776 khd->batching++; in kyber_dispatch_cur_domain()
778 list_del_init(&rq->queuelist); in kyber_dispatch_cur_domain()
781 trace_kyber_throttled(kqd->dev, in kyber_dispatch_cur_domain()
782 kyber_domain_names[khd->cur_domain]); in kyber_dispatch_cur_domain()
784 } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { in kyber_dispatch_cur_domain()
787 kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs); in kyber_dispatch_cur_domain()
789 khd->batching++; in kyber_dispatch_cur_domain()
791 list_del_init(&rq->queuelist); in kyber_dispatch_cur_domain()
794 trace_kyber_throttled(kqd->dev, in kyber_dispatch_cur_domain()
795 kyber_domain_names[khd->cur_domain]); in kyber_dispatch_cur_domain()
805 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; in kyber_dispatch_request()
806 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_dispatch_request()
810 spin_lock(&khd->lock); in kyber_dispatch_request()
816 if (khd->batching < kyber_batch_size[khd->cur_domain]) { in kyber_dispatch_request()
831 khd->batching = 0; in kyber_dispatch_request()
833 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) in kyber_dispatch_request()
834 khd->cur_domain = 0; in kyber_dispatch_request()
836 khd->cur_domain++; in kyber_dispatch_request()
845 spin_unlock(&khd->lock); in kyber_dispatch_request()
851 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_has_work()
855 if (!list_empty_careful(&khd->rqs[i]) || in kyber_has_work()
856 sbitmap_any_bit_set(&khd->kcq_map[i])) in kyber_has_work()
867 struct kyber_queue_data *kqd = e->elevator_data; \
869 return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
875 struct kyber_queue_data *kqd = e->elevator_data; \
883 kqd->latency_targets[domain] = nsec; \
904 struct kyber_queue_data *kqd = q->elevator->elevator_data; \
906 sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
911 __acquires(&khd->lock) \
913 struct blk_mq_hw_ctx *hctx = m->private; \
914 struct kyber_hctx_data *khd = hctx->sched_data; \
916 spin_lock(&khd->lock); \
917 return seq_list_start(&khd->rqs[domain], *pos); \
923 struct blk_mq_hw_ctx *hctx = m->private; \
924 struct kyber_hctx_data *khd = hctx->sched_data; \
926 return seq_list_next(v, &khd->rqs[domain], pos); \
930 __releases(&khd->lock) \
932 struct blk_mq_hw_ctx *hctx = m->private; \
933 struct kyber_hctx_data *khd = hctx->sched_data; \
935 spin_unlock(&khd->lock); \
948 struct kyber_hctx_data *khd = hctx->sched_data; \
949 wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
951 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
963 struct kyber_queue_data *kqd = q->elevator->elevator_data; in kyber_async_depth_show()
965 seq_printf(m, "%u\n", kqd->async_depth); in kyber_async_depth_show()
972 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_cur_domain_show()
974 seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); in kyber_cur_domain_show()
981 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_batching_show()
983 seq_printf(m, "%u\n", khd->batching); in kyber_batching_show()