xref: /openbmc/linux/block/blk-mq.c (revision afc98d90)
1 #include <linux/kernel.h>
2 #include <linux/module.h>
3 #include <linux/backing-dev.h>
4 #include <linux/bio.h>
5 #include <linux/blkdev.h>
6 #include <linux/mm.h>
7 #include <linux/init.h>
8 #include <linux/slab.h>
9 #include <linux/workqueue.h>
10 #include <linux/smp.h>
11 #include <linux/llist.h>
12 #include <linux/list_sort.h>
13 #include <linux/cpu.h>
14 #include <linux/cache.h>
15 #include <linux/sched/sysctl.h>
16 #include <linux/delay.h>
17 
18 #include <trace/events/block.h>
19 
20 #include <linux/blk-mq.h>
21 #include "blk.h"
22 #include "blk-mq.h"
23 #include "blk-mq-tag.h"
24 
25 static DEFINE_MUTEX(all_q_mutex);
26 static LIST_HEAD(all_q_list);
27 
28 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
29 
30 static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
31 					   unsigned int cpu)
32 {
33 	return per_cpu_ptr(q->queue_ctx, cpu);
34 }
35 
36 /*
37  * This assumes per-cpu software queueing queues. They could be per-node
38  * as well, for instance. For now this is hardcoded as-is. Note that we don't
39  * care about preemption, since we know the ctx's are persistent. This does
40  * mean that we can't rely on ctx always matching the currently running CPU.
41  */
42 static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
43 {
44 	return __blk_mq_get_ctx(q, get_cpu());
45 }
46 
47 static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
48 {
49 	put_cpu();
50 }
51 
52 /*
53  * Check if any of the ctx's have pending work in this hardware queue
54  */
55 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
56 {
57 	unsigned int i;
58 
59 	for (i = 0; i < hctx->nr_ctx_map; i++)
60 		if (hctx->ctx_map[i])
61 			return true;
62 
63 	return false;
64 }
65 
66 /*
67  * Mark this ctx as having pending work in this hardware queue
68  */
69 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
70 				     struct blk_mq_ctx *ctx)
71 {
72 	if (!test_bit(ctx->index_hw, hctx->ctx_map))
73 		set_bit(ctx->index_hw, hctx->ctx_map);
74 }
75 
76 static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
77 				       bool reserved)
78 {
79 	struct request *rq;
80 	unsigned int tag;
81 
82 	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
83 	if (tag != BLK_MQ_TAG_FAIL) {
84 		rq = hctx->rqs[tag];
85 		rq->tag = tag;
86 
87 		return rq;
88 	}
89 
90 	return NULL;
91 }
92 
93 static int blk_mq_queue_enter(struct request_queue *q)
94 {
95 	int ret;
96 
97 	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
98 	smp_wmb();
99 	/* we have problems to freeze the queue if it's initializing */
100 	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
101 		return 0;
102 
103 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
104 
105 	spin_lock_irq(q->queue_lock);
106 	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
107 		!blk_queue_bypass(q) || blk_queue_dying(q),
108 		*q->queue_lock);
109 	/* inc usage with lock hold to avoid freeze_queue runs here */
110 	if (!ret && !blk_queue_dying(q))
111 		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
112 	else if (blk_queue_dying(q))
113 		ret = -ENODEV;
114 	spin_unlock_irq(q->queue_lock);
115 
116 	return ret;
117 }
118 
119 static void blk_mq_queue_exit(struct request_queue *q)
120 {
121 	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
122 }
123 
124 static void __blk_mq_drain_queue(struct request_queue *q)
125 {
126 	while (true) {
127 		s64 count;
128 
129 		spin_lock_irq(q->queue_lock);
130 		count = percpu_counter_sum(&q->mq_usage_counter);
131 		spin_unlock_irq(q->queue_lock);
132 
133 		if (count == 0)
134 			break;
135 		blk_mq_run_queues(q, false);
136 		msleep(10);
137 	}
138 }
139 
140 /*
141  * Guarantee no request is in use, so we can change any data structure of
142  * the queue afterward.
143  */
144 static void blk_mq_freeze_queue(struct request_queue *q)
145 {
146 	bool drain;
147 
148 	spin_lock_irq(q->queue_lock);
149 	drain = !q->bypass_depth++;
150 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
151 	spin_unlock_irq(q->queue_lock);
152 
153 	if (drain)
154 		__blk_mq_drain_queue(q);
155 }
156 
157 void blk_mq_drain_queue(struct request_queue *q)
158 {
159 	__blk_mq_drain_queue(q);
160 }
161 
162 static void blk_mq_unfreeze_queue(struct request_queue *q)
163 {
164 	bool wake = false;
165 
166 	spin_lock_irq(q->queue_lock);
167 	if (!--q->bypass_depth) {
168 		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
169 		wake = true;
170 	}
171 	WARN_ON_ONCE(q->bypass_depth < 0);
172 	spin_unlock_irq(q->queue_lock);
173 	if (wake)
174 		wake_up_all(&q->mq_freeze_wq);
175 }
176 
177 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
178 {
179 	return blk_mq_has_free_tags(hctx->tags);
180 }
181 EXPORT_SYMBOL(blk_mq_can_queue);
182 
183 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
184 			       struct request *rq, unsigned int rw_flags)
185 {
186 	if (blk_queue_io_stat(q))
187 		rw_flags |= REQ_IO_STAT;
188 
189 	rq->mq_ctx = ctx;
190 	rq->cmd_flags = rw_flags;
191 	rq->start_time = jiffies;
192 	set_start_time_ns(rq);
193 	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
194 }
195 
196 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
197 					      gfp_t gfp, bool reserved)
198 {
199 	return blk_mq_alloc_rq(hctx, gfp, reserved);
200 }
201 
202 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
203 						   int rw, gfp_t gfp,
204 						   bool reserved)
205 {
206 	struct request *rq;
207 
208 	do {
209 		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
210 		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
211 
212 		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
213 		if (rq) {
214 			blk_mq_rq_ctx_init(q, ctx, rq, rw);
215 			break;
216 		}
217 
218 		blk_mq_put_ctx(ctx);
219 		if (!(gfp & __GFP_WAIT))
220 			break;
221 
222 		__blk_mq_run_hw_queue(hctx);
223 		blk_mq_wait_for_tags(hctx->tags);
224 	} while (1);
225 
226 	return rq;
227 }
228 
229 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
230 {
231 	struct request *rq;
232 
233 	if (blk_mq_queue_enter(q))
234 		return NULL;
235 
236 	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
237 	if (rq)
238 		blk_mq_put_ctx(rq->mq_ctx);
239 	return rq;
240 }
241 
242 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
243 					      gfp_t gfp)
244 {
245 	struct request *rq;
246 
247 	if (blk_mq_queue_enter(q))
248 		return NULL;
249 
250 	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
251 	if (rq)
252 		blk_mq_put_ctx(rq->mq_ctx);
253 	return rq;
254 }
255 EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
256 
257 /*
258  * Re-init and set pdu, if we have it
259  */
260 void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
261 {
262 	blk_rq_init(hctx->queue, rq);
263 
264 	if (hctx->cmd_size)
265 		rq->special = blk_mq_rq_to_pdu(rq);
266 }
267 
268 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
269 				  struct blk_mq_ctx *ctx, struct request *rq)
270 {
271 	const int tag = rq->tag;
272 	struct request_queue *q = rq->q;
273 
274 	blk_mq_rq_init(hctx, rq);
275 	blk_mq_put_tag(hctx->tags, tag);
276 
277 	blk_mq_queue_exit(q);
278 }
279 
280 void blk_mq_free_request(struct request *rq)
281 {
282 	struct blk_mq_ctx *ctx = rq->mq_ctx;
283 	struct blk_mq_hw_ctx *hctx;
284 	struct request_queue *q = rq->q;
285 
286 	ctx->rq_completed[rq_is_sync(rq)]++;
287 
288 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
289 	__blk_mq_free_request(hctx, ctx, rq);
290 }
291 
292 static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
293 {
294 	if (error)
295 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
297 		error = -EIO;
298 
299 	if (unlikely(rq->cmd_flags & REQ_QUIET))
300 		set_bit(BIO_QUIET, &bio->bi_flags);
301 
302 	/* don't actually finish bio if it's part of flush sequence */
303 	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
304 		bio_endio(bio, error);
305 }
306 
307 void blk_mq_end_io(struct request *rq, int error)
308 {
309 	struct bio *bio = rq->bio;
310 	unsigned int bytes = 0;
311 
312 	trace_block_rq_complete(rq->q, rq);
313 
314 	while (bio) {
315 		struct bio *next = bio->bi_next;
316 
317 		bio->bi_next = NULL;
318 		bytes += bio->bi_iter.bi_size;
319 		blk_mq_bio_endio(rq, bio, error);
320 		bio = next;
321 	}
322 
323 	blk_account_io_completion(rq, bytes);
324 
325 	blk_account_io_done(rq);
326 
327 	if (rq->end_io)
328 		rq->end_io(rq, error);
329 	else
330 		blk_mq_free_request(rq);
331 }
332 EXPORT_SYMBOL(blk_mq_end_io);
333 
334 static void __blk_mq_complete_request_remote(void *data)
335 {
336 	struct request *rq = data;
337 
338 	rq->q->softirq_done_fn(rq);
339 }
340 
341 void __blk_mq_complete_request(struct request *rq)
342 {
343 	struct blk_mq_ctx *ctx = rq->mq_ctx;
344 	int cpu;
345 
346 	if (!ctx->ipi_redirect) {
347 		rq->q->softirq_done_fn(rq);
348 		return;
349 	}
350 
351 	cpu = get_cpu();
352 	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
353 		rq->csd.func = __blk_mq_complete_request_remote;
354 		rq->csd.info = rq;
355 		rq->csd.flags = 0;
356 		__smp_call_function_single(ctx->cpu, &rq->csd, 0);
357 	} else {
358 		rq->q->softirq_done_fn(rq);
359 	}
360 	put_cpu();
361 }
362 
363 /**
364  * blk_mq_complete_request - end I/O on a request
365  * @rq:		the request being processed
366  *
367  * Description:
368  *	Ends all I/O on a request. It does not handle partial completions.
369  *	The actual completion happens out-of-order, through a IPI handler.
370  **/
371 void blk_mq_complete_request(struct request *rq)
372 {
373 	if (unlikely(blk_should_fake_timeout(rq->q)))
374 		return;
375 	if (!blk_mark_rq_complete(rq))
376 		__blk_mq_complete_request(rq);
377 }
378 EXPORT_SYMBOL(blk_mq_complete_request);
379 
380 static void blk_mq_start_request(struct request *rq, bool last)
381 {
382 	struct request_queue *q = rq->q;
383 
384 	trace_block_rq_issue(q, rq);
385 
386 	/*
387 	 * Just mark start time and set the started bit. Due to memory
388 	 * ordering, we know we'll see the correct deadline as long as
389 	 * REQ_ATOMIC_STARTED is seen.
390 	 */
391 	rq->deadline = jiffies + q->rq_timeout;
392 	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
393 
394 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
395 		/*
396 		 * Make sure space for the drain appears.  We know we can do
397 		 * this because max_hw_segments has been adjusted to be one
398 		 * fewer than the device can handle.
399 		 */
400 		rq->nr_phys_segments++;
401 	}
402 
403 	/*
404 	 * Flag the last request in the series so that drivers know when IO
405 	 * should be kicked off, if they don't do it on a per-request basis.
406 	 *
407 	 * Note: the flag isn't the only condition drivers should do kick off.
408 	 * If drive is busy, the last request might not have the bit set.
409 	 */
410 	if (last)
411 		rq->cmd_flags |= REQ_END;
412 }
413 
414 static void blk_mq_requeue_request(struct request *rq)
415 {
416 	struct request_queue *q = rq->q;
417 
418 	trace_block_rq_requeue(q, rq);
419 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
420 
421 	rq->cmd_flags &= ~REQ_END;
422 
423 	if (q->dma_drain_size && blk_rq_bytes(rq))
424 		rq->nr_phys_segments--;
425 }
426 
427 struct blk_mq_timeout_data {
428 	struct blk_mq_hw_ctx *hctx;
429 	unsigned long *next;
430 	unsigned int *next_set;
431 };
432 
433 static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
434 {
435 	struct blk_mq_timeout_data *data = __data;
436 	struct blk_mq_hw_ctx *hctx = data->hctx;
437 	unsigned int tag;
438 
439 	 /* It may not be in flight yet (this is where
440 	 * the REQ_ATOMIC_STARTED flag comes in). The requests are
441 	 * statically allocated, so we know it's always safe to access the
442 	 * memory associated with a bit offset into ->rqs[].
443 	 */
444 	tag = 0;
445 	do {
446 		struct request *rq;
447 
448 		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
449 		if (tag >= hctx->queue_depth)
450 			break;
451 
452 		rq = hctx->rqs[tag++];
453 
454 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
455 			continue;
456 
457 		blk_rq_check_expired(rq, data->next, data->next_set);
458 	} while (1);
459 }
460 
461 static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
462 					unsigned long *next,
463 					unsigned int *next_set)
464 {
465 	struct blk_mq_timeout_data data = {
466 		.hctx		= hctx,
467 		.next		= next,
468 		.next_set	= next_set,
469 	};
470 
471 	/*
472 	 * Ask the tagging code to iterate busy requests, so we can
473 	 * check them for timeout.
474 	 */
475 	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
476 }
477 
478 static void blk_mq_rq_timer(unsigned long data)
479 {
480 	struct request_queue *q = (struct request_queue *) data;
481 	struct blk_mq_hw_ctx *hctx;
482 	unsigned long next = 0;
483 	int i, next_set = 0;
484 
485 	queue_for_each_hw_ctx(q, hctx, i)
486 		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
487 
488 	if (next_set)
489 		mod_timer(&q->timeout, round_jiffies_up(next));
490 }
491 
492 /*
493  * Reverse check our software queue for entries that we could potentially
494  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
495  * too much time checking for merges.
496  */
497 static bool blk_mq_attempt_merge(struct request_queue *q,
498 				 struct blk_mq_ctx *ctx, struct bio *bio)
499 {
500 	struct request *rq;
501 	int checked = 8;
502 
503 	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
504 		int el_ret;
505 
506 		if (!checked--)
507 			break;
508 
509 		if (!blk_rq_merge_ok(rq, bio))
510 			continue;
511 
512 		el_ret = blk_try_merge(rq, bio);
513 		if (el_ret == ELEVATOR_BACK_MERGE) {
514 			if (bio_attempt_back_merge(q, rq, bio)) {
515 				ctx->rq_merged++;
516 				return true;
517 			}
518 			break;
519 		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
520 			if (bio_attempt_front_merge(q, rq, bio)) {
521 				ctx->rq_merged++;
522 				return true;
523 			}
524 			break;
525 		}
526 	}
527 
528 	return false;
529 }
530 
531 void blk_mq_add_timer(struct request *rq)
532 {
533 	__blk_add_timer(rq, NULL);
534 }
535 
536 /*
537  * Run this hardware queue, pulling any software queues mapped to it in.
538  * Note that this function currently has various problems around ordering
539  * of IO. In particular, we'd like FIFO behaviour on handling existing
540  * items on the hctx->dispatch list. Ignore that for now.
541  */
542 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
543 {
544 	struct request_queue *q = hctx->queue;
545 	struct blk_mq_ctx *ctx;
546 	struct request *rq;
547 	LIST_HEAD(rq_list);
548 	int bit, queued;
549 
550 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
551 		return;
552 
553 	hctx->run++;
554 
555 	/*
556 	 * Touch any software queue that has pending entries.
557 	 */
558 	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
559 		clear_bit(bit, hctx->ctx_map);
560 		ctx = hctx->ctxs[bit];
561 		BUG_ON(bit != ctx->index_hw);
562 
563 		spin_lock(&ctx->lock);
564 		list_splice_tail_init(&ctx->rq_list, &rq_list);
565 		spin_unlock(&ctx->lock);
566 	}
567 
568 	/*
569 	 * If we have previous entries on our dispatch list, grab them
570 	 * and stuff them at the front for more fair dispatch.
571 	 */
572 	if (!list_empty_careful(&hctx->dispatch)) {
573 		spin_lock(&hctx->lock);
574 		if (!list_empty(&hctx->dispatch))
575 			list_splice_init(&hctx->dispatch, &rq_list);
576 		spin_unlock(&hctx->lock);
577 	}
578 
579 	/*
580 	 * Delete and return all entries from our dispatch list
581 	 */
582 	queued = 0;
583 
584 	/*
585 	 * Now process all the entries, sending them to the driver.
586 	 */
587 	while (!list_empty(&rq_list)) {
588 		int ret;
589 
590 		rq = list_first_entry(&rq_list, struct request, queuelist);
591 		list_del_init(&rq->queuelist);
592 
593 		blk_mq_start_request(rq, list_empty(&rq_list));
594 
595 		ret = q->mq_ops->queue_rq(hctx, rq);
596 		switch (ret) {
597 		case BLK_MQ_RQ_QUEUE_OK:
598 			queued++;
599 			continue;
600 		case BLK_MQ_RQ_QUEUE_BUSY:
601 			/*
602 			 * FIXME: we should have a mechanism to stop the queue
603 			 * like blk_stop_queue, otherwise we will waste cpu
604 			 * time
605 			 */
606 			list_add(&rq->queuelist, &rq_list);
607 			blk_mq_requeue_request(rq);
608 			break;
609 		default:
610 			pr_err("blk-mq: bad return on queue: %d\n", ret);
611 		case BLK_MQ_RQ_QUEUE_ERROR:
612 			rq->errors = -EIO;
613 			blk_mq_end_io(rq, rq->errors);
614 			break;
615 		}
616 
617 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
618 			break;
619 	}
620 
621 	if (!queued)
622 		hctx->dispatched[0]++;
623 	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
624 		hctx->dispatched[ilog2(queued) + 1]++;
625 
626 	/*
627 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
628 	 * that is where we will continue on next queue run.
629 	 */
630 	if (!list_empty(&rq_list)) {
631 		spin_lock(&hctx->lock);
632 		list_splice(&rq_list, &hctx->dispatch);
633 		spin_unlock(&hctx->lock);
634 	}
635 }
636 
637 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
638 {
639 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
640 		return;
641 
642 	if (!async)
643 		__blk_mq_run_hw_queue(hctx);
644 	else {
645 		struct request_queue *q = hctx->queue;
646 
647 		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
648 	}
649 }
650 
651 void blk_mq_run_queues(struct request_queue *q, bool async)
652 {
653 	struct blk_mq_hw_ctx *hctx;
654 	int i;
655 
656 	queue_for_each_hw_ctx(q, hctx, i) {
657 		if ((!blk_mq_hctx_has_pending(hctx) &&
658 		    list_empty_careful(&hctx->dispatch)) ||
659 		    test_bit(BLK_MQ_S_STOPPED, &hctx->flags))
660 			continue;
661 
662 		blk_mq_run_hw_queue(hctx, async);
663 	}
664 }
665 EXPORT_SYMBOL(blk_mq_run_queues);
666 
667 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
668 {
669 	cancel_delayed_work(&hctx->delayed_work);
670 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
671 }
672 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
673 
674 void blk_mq_stop_hw_queues(struct request_queue *q)
675 {
676 	struct blk_mq_hw_ctx *hctx;
677 	int i;
678 
679 	queue_for_each_hw_ctx(q, hctx, i)
680 		blk_mq_stop_hw_queue(hctx);
681 }
682 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
683 
684 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
685 {
686 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
687 	__blk_mq_run_hw_queue(hctx);
688 }
689 EXPORT_SYMBOL(blk_mq_start_hw_queue);
690 
691 void blk_mq_start_stopped_hw_queues(struct request_queue *q)
692 {
693 	struct blk_mq_hw_ctx *hctx;
694 	int i;
695 
696 	queue_for_each_hw_ctx(q, hctx, i) {
697 		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
698 			continue;
699 
700 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
701 		blk_mq_run_hw_queue(hctx, true);
702 	}
703 }
704 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
705 
706 static void blk_mq_work_fn(struct work_struct *work)
707 {
708 	struct blk_mq_hw_ctx *hctx;
709 
710 	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
711 	__blk_mq_run_hw_queue(hctx);
712 }
713 
714 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
715 				    struct request *rq, bool at_head)
716 {
717 	struct blk_mq_ctx *ctx = rq->mq_ctx;
718 
719 	trace_block_rq_insert(hctx->queue, rq);
720 
721 	if (at_head)
722 		list_add(&rq->queuelist, &ctx->rq_list);
723 	else
724 		list_add_tail(&rq->queuelist, &ctx->rq_list);
725 	blk_mq_hctx_mark_pending(hctx, ctx);
726 
727 	/*
728 	 * We do this early, to ensure we are on the right CPU.
729 	 */
730 	blk_mq_add_timer(rq);
731 }
732 
733 void blk_mq_insert_request(struct request_queue *q, struct request *rq,
734 			   bool at_head, bool run_queue)
735 {
736 	struct blk_mq_hw_ctx *hctx;
737 	struct blk_mq_ctx *ctx, *current_ctx;
738 
739 	ctx = rq->mq_ctx;
740 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
741 
742 	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
743 		blk_insert_flush(rq);
744 	} else {
745 		current_ctx = blk_mq_get_ctx(q);
746 
747 		if (!cpu_online(ctx->cpu)) {
748 			ctx = current_ctx;
749 			hctx = q->mq_ops->map_queue(q, ctx->cpu);
750 			rq->mq_ctx = ctx;
751 		}
752 		spin_lock(&ctx->lock);
753 		__blk_mq_insert_request(hctx, rq, at_head);
754 		spin_unlock(&ctx->lock);
755 
756 		blk_mq_put_ctx(current_ctx);
757 	}
758 
759 	if (run_queue)
760 		__blk_mq_run_hw_queue(hctx);
761 }
762 EXPORT_SYMBOL(blk_mq_insert_request);
763 
764 /*
765  * This is a special version of blk_mq_insert_request to bypass FLUSH request
766  * check. Should only be used internally.
767  */
768 void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
769 {
770 	struct request_queue *q = rq->q;
771 	struct blk_mq_hw_ctx *hctx;
772 	struct blk_mq_ctx *ctx, *current_ctx;
773 
774 	current_ctx = blk_mq_get_ctx(q);
775 
776 	ctx = rq->mq_ctx;
777 	if (!cpu_online(ctx->cpu)) {
778 		ctx = current_ctx;
779 		rq->mq_ctx = ctx;
780 	}
781 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
782 
783 	/* ctx->cpu might be offline */
784 	spin_lock(&ctx->lock);
785 	__blk_mq_insert_request(hctx, rq, false);
786 	spin_unlock(&ctx->lock);
787 
788 	blk_mq_put_ctx(current_ctx);
789 
790 	if (run_queue)
791 		blk_mq_run_hw_queue(hctx, async);
792 }
793 
794 static void blk_mq_insert_requests(struct request_queue *q,
795 				     struct blk_mq_ctx *ctx,
796 				     struct list_head *list,
797 				     int depth,
798 				     bool from_schedule)
799 
800 {
801 	struct blk_mq_hw_ctx *hctx;
802 	struct blk_mq_ctx *current_ctx;
803 
804 	trace_block_unplug(q, depth, !from_schedule);
805 
806 	current_ctx = blk_mq_get_ctx(q);
807 
808 	if (!cpu_online(ctx->cpu))
809 		ctx = current_ctx;
810 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
811 
812 	/*
813 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
814 	 * offline now
815 	 */
816 	spin_lock(&ctx->lock);
817 	while (!list_empty(list)) {
818 		struct request *rq;
819 
820 		rq = list_first_entry(list, struct request, queuelist);
821 		list_del_init(&rq->queuelist);
822 		rq->mq_ctx = ctx;
823 		__blk_mq_insert_request(hctx, rq, false);
824 	}
825 	spin_unlock(&ctx->lock);
826 
827 	blk_mq_put_ctx(current_ctx);
828 
829 	blk_mq_run_hw_queue(hctx, from_schedule);
830 }
831 
832 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
833 {
834 	struct request *rqa = container_of(a, struct request, queuelist);
835 	struct request *rqb = container_of(b, struct request, queuelist);
836 
837 	return !(rqa->mq_ctx < rqb->mq_ctx ||
838 		 (rqa->mq_ctx == rqb->mq_ctx &&
839 		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
840 }
841 
842 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
843 {
844 	struct blk_mq_ctx *this_ctx;
845 	struct request_queue *this_q;
846 	struct request *rq;
847 	LIST_HEAD(list);
848 	LIST_HEAD(ctx_list);
849 	unsigned int depth;
850 
851 	list_splice_init(&plug->mq_list, &list);
852 
853 	list_sort(NULL, &list, plug_ctx_cmp);
854 
855 	this_q = NULL;
856 	this_ctx = NULL;
857 	depth = 0;
858 
859 	while (!list_empty(&list)) {
860 		rq = list_entry_rq(list.next);
861 		list_del_init(&rq->queuelist);
862 		BUG_ON(!rq->q);
863 		if (rq->mq_ctx != this_ctx) {
864 			if (this_ctx) {
865 				blk_mq_insert_requests(this_q, this_ctx,
866 							&ctx_list, depth,
867 							from_schedule);
868 			}
869 
870 			this_ctx = rq->mq_ctx;
871 			this_q = rq->q;
872 			depth = 0;
873 		}
874 
875 		depth++;
876 		list_add_tail(&rq->queuelist, &ctx_list);
877 	}
878 
879 	/*
880 	 * If 'this_ctx' is set, we know we have entries to complete
881 	 * on 'ctx_list'. Do those.
882 	 */
883 	if (this_ctx) {
884 		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
885 				       from_schedule);
886 	}
887 }
888 
889 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
890 {
891 	init_request_from_bio(rq, bio);
892 	blk_account_io_start(rq, 1);
893 }
894 
895 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
896 {
897 	struct blk_mq_hw_ctx *hctx;
898 	struct blk_mq_ctx *ctx;
899 	const int is_sync = rw_is_sync(bio->bi_rw);
900 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
901 	int rw = bio_data_dir(bio);
902 	struct request *rq;
903 	unsigned int use_plug, request_count = 0;
904 
905 	/*
906 	 * If we have multiple hardware queues, just go directly to
907 	 * one of those for sync IO.
908 	 */
909 	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
910 
911 	blk_queue_bounce(q, &bio);
912 
913 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
914 		bio_endio(bio, -EIO);
915 		return;
916 	}
917 
918 	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
919 		return;
920 
921 	if (blk_mq_queue_enter(q)) {
922 		bio_endio(bio, -EIO);
923 		return;
924 	}
925 
926 	ctx = blk_mq_get_ctx(q);
927 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
928 
929 	trace_block_getrq(q, bio, rw);
930 	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
931 	if (likely(rq))
932 		blk_mq_rq_ctx_init(q, ctx, rq, rw);
933 	else {
934 		blk_mq_put_ctx(ctx);
935 		trace_block_sleeprq(q, bio, rw);
936 		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
937 							false);
938 		ctx = rq->mq_ctx;
939 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
940 	}
941 
942 	hctx->queued++;
943 
944 	if (unlikely(is_flush_fua)) {
945 		blk_mq_bio_to_request(rq, bio);
946 		blk_mq_put_ctx(ctx);
947 		blk_insert_flush(rq);
948 		goto run_queue;
949 	}
950 
951 	/*
952 	 * A task plug currently exists. Since this is completely lockless,
953 	 * utilize that to temporarily store requests until the task is
954 	 * either done or scheduled away.
955 	 */
956 	if (use_plug) {
957 		struct blk_plug *plug = current->plug;
958 
959 		if (plug) {
960 			blk_mq_bio_to_request(rq, bio);
961 			if (list_empty(&plug->mq_list))
962 				trace_block_plug(q);
963 			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
964 				blk_flush_plug_list(plug, false);
965 				trace_block_plug(q);
966 			}
967 			list_add_tail(&rq->queuelist, &plug->mq_list);
968 			blk_mq_put_ctx(ctx);
969 			return;
970 		}
971 	}
972 
973 	spin_lock(&ctx->lock);
974 
975 	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
976 	    blk_mq_attempt_merge(q, ctx, bio))
977 		__blk_mq_free_request(hctx, ctx, rq);
978 	else {
979 		blk_mq_bio_to_request(rq, bio);
980 		__blk_mq_insert_request(hctx, rq, false);
981 	}
982 
983 	spin_unlock(&ctx->lock);
984 	blk_mq_put_ctx(ctx);
985 
986 	/*
987 	 * For a SYNC request, send it to the hardware immediately. For an
988 	 * ASYNC request, just ensure that we run it later on. The latter
989 	 * allows for merging opportunities and more efficient dispatching.
990 	 */
991 run_queue:
992 	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
993 }
994 
995 /*
996  * Default mapping to a software queue, since we use one per CPU.
997  */
998 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
999 {
1000 	return q->queue_hw_ctx[q->mq_map[cpu]];
1001 }
1002 EXPORT_SYMBOL(blk_mq_map_queue);
1003 
1004 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
1005 						   unsigned int hctx_index)
1006 {
1007 	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
1008 				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
1009 }
1010 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
1011 
1012 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
1013 				 unsigned int hctx_index)
1014 {
1015 	kfree(hctx);
1016 }
1017 EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
1018 
1019 static void blk_mq_hctx_notify(void *data, unsigned long action,
1020 			       unsigned int cpu)
1021 {
1022 	struct blk_mq_hw_ctx *hctx = data;
1023 	struct blk_mq_ctx *ctx;
1024 	LIST_HEAD(tmp);
1025 
1026 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1027 		return;
1028 
1029 	/*
1030 	 * Move ctx entries to new CPU, if this one is going away.
1031 	 */
1032 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1033 
1034 	spin_lock(&ctx->lock);
1035 	if (!list_empty(&ctx->rq_list)) {
1036 		list_splice_init(&ctx->rq_list, &tmp);
1037 		clear_bit(ctx->index_hw, hctx->ctx_map);
1038 	}
1039 	spin_unlock(&ctx->lock);
1040 
1041 	if (list_empty(&tmp))
1042 		return;
1043 
1044 	ctx = blk_mq_get_ctx(hctx->queue);
1045 	spin_lock(&ctx->lock);
1046 
1047 	while (!list_empty(&tmp)) {
1048 		struct request *rq;
1049 
1050 		rq = list_first_entry(&tmp, struct request, queuelist);
1051 		rq->mq_ctx = ctx;
1052 		list_move_tail(&rq->queuelist, &ctx->rq_list);
1053 	}
1054 
1055 	blk_mq_hctx_mark_pending(hctx, ctx);
1056 
1057 	spin_unlock(&ctx->lock);
1058 	blk_mq_put_ctx(ctx);
1059 }
1060 
1061 static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
1062 				    void (*init)(void *, struct blk_mq_hw_ctx *,
1063 					struct request *, unsigned int),
1064 				    void *data)
1065 {
1066 	unsigned int i;
1067 
1068 	for (i = 0; i < hctx->queue_depth; i++) {
1069 		struct request *rq = hctx->rqs[i];
1070 
1071 		init(data, hctx, rq, i);
1072 	}
1073 }
1074 
1075 void blk_mq_init_commands(struct request_queue *q,
1076 			  void (*init)(void *, struct blk_mq_hw_ctx *,
1077 					struct request *, unsigned int),
1078 			  void *data)
1079 {
1080 	struct blk_mq_hw_ctx *hctx;
1081 	unsigned int i;
1082 
1083 	queue_for_each_hw_ctx(q, hctx, i)
1084 		blk_mq_init_hw_commands(hctx, init, data);
1085 }
1086 EXPORT_SYMBOL(blk_mq_init_commands);
1087 
1088 static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
1089 {
1090 	struct page *page;
1091 
1092 	while (!list_empty(&hctx->page_list)) {
1093 		page = list_first_entry(&hctx->page_list, struct page, lru);
1094 		list_del_init(&page->lru);
1095 		__free_pages(page, page->private);
1096 	}
1097 
1098 	kfree(hctx->rqs);
1099 
1100 	if (hctx->tags)
1101 		blk_mq_free_tags(hctx->tags);
1102 }
1103 
1104 static size_t order_to_size(unsigned int order)
1105 {
1106 	size_t ret = PAGE_SIZE;
1107 
1108 	while (order--)
1109 		ret *= 2;
1110 
1111 	return ret;
1112 }
1113 
1114 static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
1115 			      unsigned int reserved_tags, int node)
1116 {
1117 	unsigned int i, j, entries_per_page, max_order = 4;
1118 	size_t rq_size, left;
1119 
1120 	INIT_LIST_HEAD(&hctx->page_list);
1121 
1122 	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1123 					GFP_KERNEL, node);
1124 	if (!hctx->rqs)
1125 		return -ENOMEM;
1126 
1127 	/*
1128 	 * rq_size is the size of the request plus driver payload, rounded
1129 	 * to the cacheline size
1130 	 */
1131 	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1132 				cache_line_size());
1133 	left = rq_size * hctx->queue_depth;
1134 
1135 	for (i = 0; i < hctx->queue_depth;) {
1136 		int this_order = max_order;
1137 		struct page *page;
1138 		int to_do;
1139 		void *p;
1140 
1141 		while (left < order_to_size(this_order - 1) && this_order)
1142 			this_order--;
1143 
1144 		do {
1145 			page = alloc_pages_node(node, GFP_KERNEL, this_order);
1146 			if (page)
1147 				break;
1148 			if (!this_order--)
1149 				break;
1150 			if (order_to_size(this_order) < rq_size)
1151 				break;
1152 		} while (1);
1153 
1154 		if (!page)
1155 			break;
1156 
1157 		page->private = this_order;
1158 		list_add_tail(&page->lru, &hctx->page_list);
1159 
1160 		p = page_address(page);
1161 		entries_per_page = order_to_size(this_order) / rq_size;
1162 		to_do = min(entries_per_page, hctx->queue_depth - i);
1163 		left -= to_do * rq_size;
1164 		for (j = 0; j < to_do; j++) {
1165 			hctx->rqs[i] = p;
1166 			blk_mq_rq_init(hctx, hctx->rqs[i]);
1167 			p += rq_size;
1168 			i++;
1169 		}
1170 	}
1171 
1172 	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1173 		goto err_rq_map;
1174 	else if (i != hctx->queue_depth) {
1175 		hctx->queue_depth = i;
1176 		pr_warn("%s: queue depth set to %u because of low memory\n",
1177 					__func__, i);
1178 	}
1179 
1180 	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
1181 	if (!hctx->tags) {
1182 err_rq_map:
1183 		blk_mq_free_rq_map(hctx);
1184 		return -ENOMEM;
1185 	}
1186 
1187 	return 0;
1188 }
1189 
1190 static int blk_mq_init_hw_queues(struct request_queue *q,
1191 				 struct blk_mq_reg *reg, void *driver_data)
1192 {
1193 	struct blk_mq_hw_ctx *hctx;
1194 	unsigned int i, j;
1195 
1196 	/*
1197 	 * Initialize hardware queues
1198 	 */
1199 	queue_for_each_hw_ctx(q, hctx, i) {
1200 		unsigned int num_maps;
1201 		int node;
1202 
1203 		node = hctx->numa_node;
1204 		if (node == NUMA_NO_NODE)
1205 			node = hctx->numa_node = reg->numa_node;
1206 
1207 		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
1208 		spin_lock_init(&hctx->lock);
1209 		INIT_LIST_HEAD(&hctx->dispatch);
1210 		hctx->queue = q;
1211 		hctx->queue_num = i;
1212 		hctx->flags = reg->flags;
1213 		hctx->queue_depth = reg->queue_depth;
1214 		hctx->cmd_size = reg->cmd_size;
1215 
1216 		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1217 						blk_mq_hctx_notify, hctx);
1218 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1219 
1220 		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
1221 			break;
1222 
1223 		/*
1224 		 * Allocate space for all possible cpus to avoid allocation in
1225 		 * runtime
1226 		 */
1227 		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1228 						GFP_KERNEL, node);
1229 		if (!hctx->ctxs)
1230 			break;
1231 
1232 		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1233 		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1234 						GFP_KERNEL, node);
1235 		if (!hctx->ctx_map)
1236 			break;
1237 
1238 		hctx->nr_ctx_map = num_maps;
1239 		hctx->nr_ctx = 0;
1240 
1241 		if (reg->ops->init_hctx &&
1242 		    reg->ops->init_hctx(hctx, driver_data, i))
1243 			break;
1244 	}
1245 
1246 	if (i == q->nr_hw_queues)
1247 		return 0;
1248 
1249 	/*
1250 	 * Init failed
1251 	 */
1252 	queue_for_each_hw_ctx(q, hctx, j) {
1253 		if (i == j)
1254 			break;
1255 
1256 		if (reg->ops->exit_hctx)
1257 			reg->ops->exit_hctx(hctx, j);
1258 
1259 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1260 		blk_mq_free_rq_map(hctx);
1261 		kfree(hctx->ctxs);
1262 	}
1263 
1264 	return 1;
1265 }
1266 
1267 static void blk_mq_init_cpu_queues(struct request_queue *q,
1268 				   unsigned int nr_hw_queues)
1269 {
1270 	unsigned int i;
1271 
1272 	for_each_possible_cpu(i) {
1273 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1274 		struct blk_mq_hw_ctx *hctx;
1275 
1276 		memset(__ctx, 0, sizeof(*__ctx));
1277 		__ctx->cpu = i;
1278 		spin_lock_init(&__ctx->lock);
1279 		INIT_LIST_HEAD(&__ctx->rq_list);
1280 		__ctx->queue = q;
1281 
1282 		/* If the cpu isn't online, the cpu is mapped to first hctx */
1283 		hctx = q->mq_ops->map_queue(q, i);
1284 		hctx->nr_ctx++;
1285 
1286 		if (!cpu_online(i))
1287 			continue;
1288 
1289 		/*
1290 		 * Set local node, IFF we have more than one hw queue. If
1291 		 * not, we remain on the home node of the device
1292 		 */
1293 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1294 			hctx->numa_node = cpu_to_node(i);
1295 	}
1296 }
1297 
1298 static void blk_mq_map_swqueue(struct request_queue *q)
1299 {
1300 	unsigned int i;
1301 	struct blk_mq_hw_ctx *hctx;
1302 	struct blk_mq_ctx *ctx;
1303 
1304 	queue_for_each_hw_ctx(q, hctx, i) {
1305 		hctx->nr_ctx = 0;
1306 	}
1307 
1308 	/*
1309 	 * Map software to hardware queues
1310 	 */
1311 	queue_for_each_ctx(q, ctx, i) {
1312 		/* If the cpu isn't online, the cpu is mapped to first hctx */
1313 		hctx = q->mq_ops->map_queue(q, i);
1314 		ctx->index_hw = hctx->nr_ctx;
1315 		hctx->ctxs[hctx->nr_ctx++] = ctx;
1316 	}
1317 }
1318 
1319 struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1320 					void *driver_data)
1321 {
1322 	struct blk_mq_hw_ctx **hctxs;
1323 	struct blk_mq_ctx *ctx;
1324 	struct request_queue *q;
1325 	int i;
1326 
1327 	if (!reg->nr_hw_queues ||
1328 	    !reg->ops->queue_rq || !reg->ops->map_queue ||
1329 	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
1330 		return ERR_PTR(-EINVAL);
1331 
1332 	if (!reg->queue_depth)
1333 		reg->queue_depth = BLK_MQ_MAX_DEPTH;
1334 	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
1335 		pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
1336 		reg->queue_depth = BLK_MQ_MAX_DEPTH;
1337 	}
1338 
1339 	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
1340 		return ERR_PTR(-EINVAL);
1341 
1342 	ctx = alloc_percpu(struct blk_mq_ctx);
1343 	if (!ctx)
1344 		return ERR_PTR(-ENOMEM);
1345 
1346 	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1347 			reg->numa_node);
1348 
1349 	if (!hctxs)
1350 		goto err_percpu;
1351 
1352 	for (i = 0; i < reg->nr_hw_queues; i++) {
1353 		hctxs[i] = reg->ops->alloc_hctx(reg, i);
1354 		if (!hctxs[i])
1355 			goto err_hctxs;
1356 
1357 		hctxs[i]->numa_node = NUMA_NO_NODE;
1358 		hctxs[i]->queue_num = i;
1359 	}
1360 
1361 	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
1362 	if (!q)
1363 		goto err_hctxs;
1364 
1365 	q->mq_map = blk_mq_make_queue_map(reg);
1366 	if (!q->mq_map)
1367 		goto err_map;
1368 
1369 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1370 	blk_queue_rq_timeout(q, 30000);
1371 
1372 	q->nr_queues = nr_cpu_ids;
1373 	q->nr_hw_queues = reg->nr_hw_queues;
1374 
1375 	q->queue_ctx = ctx;
1376 	q->queue_hw_ctx = hctxs;
1377 
1378 	q->mq_ops = reg->ops;
1379 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1380 
1381 	q->sg_reserved_size = INT_MAX;
1382 
1383 	blk_queue_make_request(q, blk_mq_make_request);
1384 	blk_queue_rq_timed_out(q, reg->ops->timeout);
1385 	if (reg->timeout)
1386 		blk_queue_rq_timeout(q, reg->timeout);
1387 
1388 	if (reg->ops->complete)
1389 		blk_queue_softirq_done(q, reg->ops->complete);
1390 
1391 	blk_mq_init_flush(q);
1392 	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
1393 
1394 	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
1395 				cache_line_size()), GFP_KERNEL);
1396 	if (!q->flush_rq)
1397 		goto err_hw;
1398 
1399 	if (blk_mq_init_hw_queues(q, reg, driver_data))
1400 		goto err_flush_rq;
1401 
1402 	blk_mq_map_swqueue(q);
1403 
1404 	mutex_lock(&all_q_mutex);
1405 	list_add_tail(&q->all_q_node, &all_q_list);
1406 	mutex_unlock(&all_q_mutex);
1407 
1408 	return q;
1409 
1410 err_flush_rq:
1411 	kfree(q->flush_rq);
1412 err_hw:
1413 	kfree(q->mq_map);
1414 err_map:
1415 	blk_cleanup_queue(q);
1416 err_hctxs:
1417 	for (i = 0; i < reg->nr_hw_queues; i++) {
1418 		if (!hctxs[i])
1419 			break;
1420 		reg->ops->free_hctx(hctxs[i], i);
1421 	}
1422 	kfree(hctxs);
1423 err_percpu:
1424 	free_percpu(ctx);
1425 	return ERR_PTR(-ENOMEM);
1426 }
1427 EXPORT_SYMBOL(blk_mq_init_queue);
1428 
1429 void blk_mq_free_queue(struct request_queue *q)
1430 {
1431 	struct blk_mq_hw_ctx *hctx;
1432 	int i;
1433 
1434 	queue_for_each_hw_ctx(q, hctx, i) {
1435 		kfree(hctx->ctx_map);
1436 		kfree(hctx->ctxs);
1437 		blk_mq_free_rq_map(hctx);
1438 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1439 		if (q->mq_ops->exit_hctx)
1440 			q->mq_ops->exit_hctx(hctx, i);
1441 		q->mq_ops->free_hctx(hctx, i);
1442 	}
1443 
1444 	free_percpu(q->queue_ctx);
1445 	kfree(q->queue_hw_ctx);
1446 	kfree(q->mq_map);
1447 
1448 	q->queue_ctx = NULL;
1449 	q->queue_hw_ctx = NULL;
1450 	q->mq_map = NULL;
1451 
1452 	mutex_lock(&all_q_mutex);
1453 	list_del_init(&q->all_q_node);
1454 	mutex_unlock(&all_q_mutex);
1455 }
1456 
1457 /* Basically redo blk_mq_init_queue with queue frozen */
1458 static void blk_mq_queue_reinit(struct request_queue *q)
1459 {
1460 	blk_mq_freeze_queue(q);
1461 
1462 	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1463 
1464 	/*
1465 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1466 	 * we should change hctx numa_node according to new topology (this
1467 	 * involves free and re-allocate memory, worthy doing?)
1468 	 */
1469 
1470 	blk_mq_map_swqueue(q);
1471 
1472 	blk_mq_unfreeze_queue(q);
1473 }
1474 
1475 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1476 				      unsigned long action, void *hcpu)
1477 {
1478 	struct request_queue *q;
1479 
1480 	/*
1481 	 * Before new mapping is established, hotadded cpu might already start
1482 	 * handling requests. This doesn't break anything as we map offline
1483 	 * CPUs to first hardware queue. We will re-init queue below to get
1484 	 * optimal settings.
1485 	 */
1486 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1487 	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1488 		return NOTIFY_OK;
1489 
1490 	mutex_lock(&all_q_mutex);
1491 	list_for_each_entry(q, &all_q_list, all_q_node)
1492 		blk_mq_queue_reinit(q);
1493 	mutex_unlock(&all_q_mutex);
1494 	return NOTIFY_OK;
1495 }
1496 
1497 static int __init blk_mq_init(void)
1498 {
1499 	blk_mq_cpu_init();
1500 
1501 	/* Must be called after percpu_counter_hotcpu_callback() */
1502 	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
1503 
1504 	return 0;
1505 }
1506 subsys_initcall(blk_mq_init);
1507