xref: /openbmc/linux/block/blk-mq.c (revision e6dec923)
1 /*
2  * Block multiqueue core code
3  *
4  * Copyright (C) 2013-2014 Jens Axboe
5  * Copyright (C) 2013-2014 Christoph Hellwig
6  */
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/backing-dev.h>
10 #include <linux/bio.h>
11 #include <linux/blkdev.h>
12 #include <linux/kmemleak.h>
13 #include <linux/mm.h>
14 #include <linux/init.h>
15 #include <linux/slab.h>
16 #include <linux/workqueue.h>
17 #include <linux/smp.h>
18 #include <linux/llist.h>
19 #include <linux/list_sort.h>
20 #include <linux/cpu.h>
21 #include <linux/cache.h>
22 #include <linux/sched/sysctl.h>
23 #include <linux/sched/topology.h>
24 #include <linux/sched/signal.h>
25 #include <linux/delay.h>
26 #include <linux/crash_dump.h>
27 #include <linux/prefetch.h>
28 
29 #include <trace/events/block.h>
30 
31 #include <linux/blk-mq.h>
32 #include "blk.h"
33 #include "blk-mq.h"
34 #include "blk-mq-debugfs.h"
35 #include "blk-mq-tag.h"
36 #include "blk-stat.h"
37 #include "blk-wbt.h"
38 #include "blk-mq-sched.h"
39 
40 static void blk_mq_poll_stats_start(struct request_queue *q);
41 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
42 
43 static int blk_mq_poll_stats_bkt(const struct request *rq)
44 {
45 	int ddir, bytes, bucket;
46 
47 	ddir = rq_data_dir(rq);
48 	bytes = blk_rq_bytes(rq);
49 
50 	bucket = ddir + 2*(ilog2(bytes) - 9);
51 
52 	if (bucket < 0)
53 		return -1;
54 	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
55 		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
56 
57 	return bucket;
58 }
59 
60 /*
61  * Check if any of the ctx's have pending work in this hardware queue
62  */
63 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
64 {
65 	return sbitmap_any_bit_set(&hctx->ctx_map) ||
66 			!list_empty_careful(&hctx->dispatch) ||
67 			blk_mq_sched_has_work(hctx);
68 }
69 
70 /*
71  * Mark this ctx as having pending work in this hardware queue
72  */
73 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
74 				     struct blk_mq_ctx *ctx)
75 {
76 	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
77 		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
78 }
79 
80 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
81 				      struct blk_mq_ctx *ctx)
82 {
83 	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
84 }
85 
86 void blk_freeze_queue_start(struct request_queue *q)
87 {
88 	int freeze_depth;
89 
90 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
91 	if (freeze_depth == 1) {
92 		percpu_ref_kill(&q->q_usage_counter);
93 		blk_mq_run_hw_queues(q, false);
94 	}
95 }
96 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
97 
98 void blk_mq_freeze_queue_wait(struct request_queue *q)
99 {
100 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
101 }
102 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
103 
104 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
105 				     unsigned long timeout)
106 {
107 	return wait_event_timeout(q->mq_freeze_wq,
108 					percpu_ref_is_zero(&q->q_usage_counter),
109 					timeout);
110 }
111 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
112 
/*
 * Make sure no request is in use any more, so that any data structure of
 * the queue may be changed once this returns.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * For a !blk_mq queue the only purpose of this call is to kill
	 * q_usage_counter.  For blk_mq it raises the freeze depth and then
	 * waits for the usage counter to drain to zero.  That asymmetry is
	 * why no blk_unfreeze_queue() exists and why this helper is not
	 * exported to drivers: blk_mq is the sole user of unfreeze.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
129 
/*
 * Thin alias for blk_freeze_queue(), provided so that freeze and unfreeze
 * both have a name in the blk_mq_* namespace.
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
139 
140 void blk_mq_unfreeze_queue(struct request_queue *q)
141 {
142 	int freeze_depth;
143 
144 	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
145 	WARN_ON_ONCE(freeze_depth < 0);
146 	if (!freeze_depth) {
147 		percpu_ref_reinit(&q->q_usage_counter);
148 		wake_up_all(&q->mq_freeze_wq);
149 	}
150 }
151 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
152 
153 /*
154  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
155  * mpt3sas driver such that this function can be removed.
156  */
157 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
158 {
159 	unsigned long flags;
160 
161 	spin_lock_irqsave(q->queue_lock, flags);
162 	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
163 	spin_unlock_irqrestore(q->queue_lock, flags);
164 }
165 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
166 
167 /**
168  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
169  * @q: request queue.
170  *
171  * Note: this function does not prevent that the struct request end_io()
172  * callback function is invoked. Once this function is returned, we make
173  * sure no dispatch can happen until the queue is unquiesced via
174  * blk_mq_unquiesce_queue().
175  */
176 void blk_mq_quiesce_queue(struct request_queue *q)
177 {
178 	struct blk_mq_hw_ctx *hctx;
179 	unsigned int i;
180 	bool rcu = false;
181 
182 	blk_mq_quiesce_queue_nowait(q);
183 
184 	queue_for_each_hw_ctx(q, hctx, i) {
185 		if (hctx->flags & BLK_MQ_F_BLOCKING)
186 			synchronize_srcu(hctx->queue_rq_srcu);
187 		else
188 			rcu = true;
189 	}
190 	if (rcu)
191 		synchronize_rcu();
192 }
193 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
194 
195 /*
196  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
197  * @q: request queue.
198  *
199  * This function recovers queue into the state before quiescing
200  * which is done by blk_mq_quiesce_queue.
201  */
202 void blk_mq_unquiesce_queue(struct request_queue *q)
203 {
204 	unsigned long flags;
205 
206 	spin_lock_irqsave(q->queue_lock, flags);
207 	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
208 	spin_unlock_irqrestore(q->queue_lock, flags);
209 
210 	/* dispatch requests which are inserted during quiescing */
211 	blk_mq_run_hw_queues(q, true);
212 }
213 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
214 
215 void blk_mq_wake_waiters(struct request_queue *q)
216 {
217 	struct blk_mq_hw_ctx *hctx;
218 	unsigned int i;
219 
220 	queue_for_each_hw_ctx(q, hctx, i)
221 		if (blk_mq_hw_queue_mapped(hctx))
222 			blk_mq_tag_wakeup_all(hctx->tags, true);
223 
224 	/*
225 	 * If we are called because the queue has now been marked as
226 	 * dying, we need to ensure that processes currently waiting on
227 	 * the queue are notified as well.
228 	 */
229 	wake_up_all(&q->mq_freeze_wq);
230 }
231 
232 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
233 {
234 	return blk_mq_has_free_tags(hctx->tags);
235 }
236 EXPORT_SYMBOL(blk_mq_can_queue);
237 
238 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
239 		unsigned int tag, unsigned int op)
240 {
241 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
242 	struct request *rq = tags->static_rqs[tag];
243 
244 	rq->rq_flags = 0;
245 
246 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
247 		rq->tag = -1;
248 		rq->internal_tag = tag;
249 	} else {
250 		if (blk_mq_tag_busy(data->hctx)) {
251 			rq->rq_flags = RQF_MQ_INFLIGHT;
252 			atomic_inc(&data->hctx->nr_active);
253 		}
254 		rq->tag = tag;
255 		rq->internal_tag = -1;
256 		data->hctx->tags->rqs[rq->tag] = rq;
257 	}
258 
259 	INIT_LIST_HEAD(&rq->queuelist);
260 	/* csd/requeue_work/fifo_time is initialized before use */
261 	rq->q = data->q;
262 	rq->mq_ctx = data->ctx;
263 	rq->cmd_flags = op;
264 	if (blk_queue_io_stat(data->q))
265 		rq->rq_flags |= RQF_IO_STAT;
266 	/* do not touch atomic flags, it needs atomic ops against the timer */
267 	rq->cpu = -1;
268 	INIT_HLIST_NODE(&rq->hash);
269 	RB_CLEAR_NODE(&rq->rb_node);
270 	rq->rq_disk = NULL;
271 	rq->part = NULL;
272 	rq->start_time = jiffies;
273 #ifdef CONFIG_BLK_CGROUP
274 	rq->rl = NULL;
275 	set_start_time_ns(rq);
276 	rq->io_start_time_ns = 0;
277 #endif
278 	rq->nr_phys_segments = 0;
279 #if defined(CONFIG_BLK_DEV_INTEGRITY)
280 	rq->nr_integrity_segments = 0;
281 #endif
282 	rq->special = NULL;
283 	/* tag was already set */
284 	rq->extra_len = 0;
285 
286 	INIT_LIST_HEAD(&rq->timeout_list);
287 	rq->timeout = 0;
288 
289 	rq->end_io = NULL;
290 	rq->end_io_data = NULL;
291 	rq->next_rq = NULL;
292 
293 	data->ctx->rq_dispatched[op_is_sync(op)]++;
294 	return rq;
295 }
296 
297 static struct request *blk_mq_get_request(struct request_queue *q,
298 		struct bio *bio, unsigned int op,
299 		struct blk_mq_alloc_data *data)
300 {
301 	struct elevator_queue *e = q->elevator;
302 	struct request *rq;
303 	unsigned int tag;
304 
305 	blk_queue_enter_live(q);
306 	data->q = q;
307 	if (likely(!data->ctx))
308 		data->ctx = blk_mq_get_ctx(q);
309 	if (likely(!data->hctx))
310 		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
311 	if (op & REQ_NOWAIT)
312 		data->flags |= BLK_MQ_REQ_NOWAIT;
313 
314 	if (e) {
315 		data->flags |= BLK_MQ_REQ_INTERNAL;
316 
317 		/*
318 		 * Flush requests are special and go directly to the
319 		 * dispatch list.
320 		 */
321 		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
322 			e->type->ops.mq.limit_depth(op, data);
323 	}
324 
325 	tag = blk_mq_get_tag(data);
326 	if (tag == BLK_MQ_TAG_FAIL) {
327 		blk_queue_exit(q);
328 		return NULL;
329 	}
330 
331 	rq = blk_mq_rq_ctx_init(data, tag, op);
332 	if (!op_is_flush(op)) {
333 		rq->elv.icq = NULL;
334 		if (e && e->type->ops.mq.prepare_request) {
335 			if (e->type->icq_cache && rq_ioc(bio))
336 				blk_mq_sched_assign_ioc(rq, bio);
337 
338 			e->type->ops.mq.prepare_request(rq, bio);
339 			rq->rq_flags |= RQF_ELVPRIV;
340 		}
341 	}
342 	data->hctx->queued++;
343 	return rq;
344 }
345 
346 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
347 		unsigned int flags)
348 {
349 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
350 	struct request *rq;
351 	int ret;
352 
353 	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
354 	if (ret)
355 		return ERR_PTR(ret);
356 
357 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
358 
359 	blk_mq_put_ctx(alloc_data.ctx);
360 	blk_queue_exit(q);
361 
362 	if (!rq)
363 		return ERR_PTR(-EWOULDBLOCK);
364 
365 	rq->__data_len = 0;
366 	rq->__sector = (sector_t) -1;
367 	rq->bio = rq->biotail = NULL;
368 	return rq;
369 }
370 EXPORT_SYMBOL(blk_mq_alloc_request);
371 
372 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
373 		unsigned int op, unsigned int flags, unsigned int hctx_idx)
374 {
375 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
376 	struct request *rq;
377 	unsigned int cpu;
378 	int ret;
379 
380 	/*
381 	 * If the tag allocator sleeps we could get an allocation for a
382 	 * different hardware context.  No need to complicate the low level
383 	 * allocator for this for the rare use case of a command tied to
384 	 * a specific queue.
385 	 */
386 	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
387 		return ERR_PTR(-EINVAL);
388 
389 	if (hctx_idx >= q->nr_hw_queues)
390 		return ERR_PTR(-EIO);
391 
392 	ret = blk_queue_enter(q, true);
393 	if (ret)
394 		return ERR_PTR(ret);
395 
396 	/*
397 	 * Check if the hardware context is actually mapped to anything.
398 	 * If not tell the caller that it should skip this queue.
399 	 */
400 	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
401 	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
402 		blk_queue_exit(q);
403 		return ERR_PTR(-EXDEV);
404 	}
405 	cpu = cpumask_first(alloc_data.hctx->cpumask);
406 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
407 
408 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
409 
410 	blk_queue_exit(q);
411 
412 	if (!rq)
413 		return ERR_PTR(-EWOULDBLOCK);
414 
415 	return rq;
416 }
417 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
418 
419 void blk_mq_free_request(struct request *rq)
420 {
421 	struct request_queue *q = rq->q;
422 	struct elevator_queue *e = q->elevator;
423 	struct blk_mq_ctx *ctx = rq->mq_ctx;
424 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
425 	const int sched_tag = rq->internal_tag;
426 
427 	if (rq->rq_flags & RQF_ELVPRIV) {
428 		if (e && e->type->ops.mq.finish_request)
429 			e->type->ops.mq.finish_request(rq);
430 		if (rq->elv.icq) {
431 			put_io_context(rq->elv.icq->ioc);
432 			rq->elv.icq = NULL;
433 		}
434 	}
435 
436 	ctx->rq_completed[rq_is_sync(rq)]++;
437 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
438 		atomic_dec(&hctx->nr_active);
439 
440 	wbt_done(q->rq_wb, &rq->issue_stat);
441 
442 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
443 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
444 	if (rq->tag != -1)
445 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
446 	if (sched_tag != -1)
447 		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
448 	blk_mq_sched_restart(hctx);
449 	blk_queue_exit(q);
450 }
451 EXPORT_SYMBOL_GPL(blk_mq_free_request);
452 
453 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
454 {
455 	blk_account_io_done(rq);
456 
457 	if (rq->end_io) {
458 		wbt_done(rq->q->rq_wb, &rq->issue_stat);
459 		rq->end_io(rq, error);
460 	} else {
461 		if (unlikely(blk_bidi_rq(rq)))
462 			blk_mq_free_request(rq->next_rq);
463 		blk_mq_free_request(rq);
464 	}
465 }
466 EXPORT_SYMBOL(__blk_mq_end_request);
467 
468 void blk_mq_end_request(struct request *rq, blk_status_t error)
469 {
470 	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
471 		BUG();
472 	__blk_mq_end_request(rq, error);
473 }
474 EXPORT_SYMBOL(blk_mq_end_request);
475 
476 static void __blk_mq_complete_request_remote(void *data)
477 {
478 	struct request *rq = data;
479 
480 	rq->q->softirq_done_fn(rq);
481 }
482 
483 static void __blk_mq_complete_request(struct request *rq)
484 {
485 	struct blk_mq_ctx *ctx = rq->mq_ctx;
486 	bool shared = false;
487 	int cpu;
488 
489 	if (rq->internal_tag != -1)
490 		blk_mq_sched_completed_request(rq);
491 	if (rq->rq_flags & RQF_STATS) {
492 		blk_mq_poll_stats_start(rq->q);
493 		blk_stat_add(rq);
494 	}
495 
496 	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
497 		rq->q->softirq_done_fn(rq);
498 		return;
499 	}
500 
501 	cpu = get_cpu();
502 	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
503 		shared = cpus_share_cache(cpu, ctx->cpu);
504 
505 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
506 		rq->csd.func = __blk_mq_complete_request_remote;
507 		rq->csd.info = rq;
508 		rq->csd.flags = 0;
509 		smp_call_function_single_async(ctx->cpu, &rq->csd);
510 	} else {
511 		rq->q->softirq_done_fn(rq);
512 	}
513 	put_cpu();
514 }
515 
516 /**
517  * blk_mq_complete_request - end I/O on a request
518  * @rq:		the request being processed
519  *
520  * Description:
521  *	Ends all I/O on a request. It does not handle partial completions.
522  *	The actual completion happens out-of-order, through a IPI handler.
523  **/
524 void blk_mq_complete_request(struct request *rq)
525 {
526 	struct request_queue *q = rq->q;
527 
528 	if (unlikely(blk_should_fake_timeout(q)))
529 		return;
530 	if (!blk_mark_rq_complete(rq))
531 		__blk_mq_complete_request(rq);
532 }
533 EXPORT_SYMBOL(blk_mq_complete_request);
534 
535 int blk_mq_request_started(struct request *rq)
536 {
537 	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
538 }
539 EXPORT_SYMBOL_GPL(blk_mq_request_started);
540 
541 void blk_mq_start_request(struct request *rq)
542 {
543 	struct request_queue *q = rq->q;
544 
545 	blk_mq_sched_started_request(rq);
546 
547 	trace_block_rq_issue(q, rq);
548 
549 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
550 		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
551 		rq->rq_flags |= RQF_STATS;
552 		wbt_issue(q->rq_wb, &rq->issue_stat);
553 	}
554 
555 	blk_add_timer(rq);
556 
557 	/*
558 	 * Ensure that ->deadline is visible before set the started
559 	 * flag and clear the completed flag.
560 	 */
561 	smp_mb__before_atomic();
562 
563 	/*
564 	 * Mark us as started and clear complete. Complete might have been
565 	 * set if requeue raced with timeout, which then marked it as
566 	 * complete. So be sure to clear complete again when we start
567 	 * the request, otherwise we'll ignore the completion event.
568 	 */
569 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
570 		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
571 	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
572 		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
573 
574 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
575 		/*
576 		 * Make sure space for the drain appears.  We know we can do
577 		 * this because max_hw_segments has been adjusted to be one
578 		 * fewer than the device can handle.
579 		 */
580 		rq->nr_phys_segments++;
581 	}
582 }
583 EXPORT_SYMBOL(blk_mq_start_request);
584 
585 /*
586  * When we reach here because queue is busy, REQ_ATOM_COMPLETE
587  * flag isn't set yet, so there may be race with timeout handler,
588  * but given rq->deadline is just set in .queue_rq() under
589  * this situation, the race won't be possible in reality because
590  * rq->timeout should be set as big enough to cover the window
591  * between blk_mq_start_request() called from .queue_rq() and
592  * clearing REQ_ATOM_STARTED here.
593  */
594 static void __blk_mq_requeue_request(struct request *rq)
595 {
596 	struct request_queue *q = rq->q;
597 
598 	trace_block_rq_requeue(q, rq);
599 	wbt_requeue(q->rq_wb, &rq->issue_stat);
600 	blk_mq_sched_requeue_request(rq);
601 
602 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
603 		if (q->dma_drain_size && blk_rq_bytes(rq))
604 			rq->nr_phys_segments--;
605 	}
606 }
607 
/*
 * Requeue @rq: reset its started state and put it at the head of the
 * queue's requeue list, kicking the list when @kick_requeue_list is set.
 * The request must not be linked on any list when this is called.
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
616 
617 static void blk_mq_requeue_work(struct work_struct *work)
618 {
619 	struct request_queue *q =
620 		container_of(work, struct request_queue, requeue_work.work);
621 	LIST_HEAD(rq_list);
622 	struct request *rq, *next;
623 	unsigned long flags;
624 
625 	spin_lock_irqsave(&q->requeue_lock, flags);
626 	list_splice_init(&q->requeue_list, &rq_list);
627 	spin_unlock_irqrestore(&q->requeue_lock, flags);
628 
629 	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
630 		if (!(rq->rq_flags & RQF_SOFTBARRIER))
631 			continue;
632 
633 		rq->rq_flags &= ~RQF_SOFTBARRIER;
634 		list_del_init(&rq->queuelist);
635 		blk_mq_sched_insert_request(rq, true, false, false, true);
636 	}
637 
638 	while (!list_empty(&rq_list)) {
639 		rq = list_entry(rq_list.next, struct request, queuelist);
640 		list_del_init(&rq->queuelist);
641 		blk_mq_sched_insert_request(rq, false, false, false, true);
642 	}
643 
644 	blk_mq_run_hw_queues(q, false);
645 }
646 
647 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
648 				bool kick_requeue_list)
649 {
650 	struct request_queue *q = rq->q;
651 	unsigned long flags;
652 
653 	/*
654 	 * We abuse this flag that is otherwise used by the I/O scheduler to
655 	 * request head insertation from the workqueue.
656 	 */
657 	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
658 
659 	spin_lock_irqsave(&q->requeue_lock, flags);
660 	if (at_head) {
661 		rq->rq_flags |= RQF_SOFTBARRIER;
662 		list_add(&rq->queuelist, &q->requeue_list);
663 	} else {
664 		list_add_tail(&rq->queuelist, &q->requeue_list);
665 	}
666 	spin_unlock_irqrestore(&q->requeue_lock, flags);
667 
668 	if (kick_requeue_list)
669 		blk_mq_kick_requeue_list(q);
670 }
671 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
672 
673 void blk_mq_kick_requeue_list(struct request_queue *q)
674 {
675 	kblockd_schedule_delayed_work(&q->requeue_work, 0);
676 }
677 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
678 
679 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
680 				    unsigned long msecs)
681 {
682 	kblockd_schedule_delayed_work(&q->requeue_work,
683 				      msecs_to_jiffies(msecs));
684 }
685 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
686 
687 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
688 {
689 	if (tag < tags->nr_tags) {
690 		prefetch(tags->rqs[tag]);
691 		return tags->rqs[tag];
692 	}
693 
694 	return NULL;
695 }
696 EXPORT_SYMBOL(blk_mq_tag_to_rq);
697 
/* Accumulator filled by blk_mq_check_expired() during a timeout scan. */
struct blk_mq_timeout_data {
	/* earliest deadline seen among requests that have not expired yet */
	unsigned long next;
	/* non-zero once @next holds a valid value */
	unsigned int next_set;
};
702 
703 void blk_mq_rq_timed_out(struct request *req, bool reserved)
704 {
705 	const struct blk_mq_ops *ops = req->q->mq_ops;
706 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
707 
708 	/*
709 	 * We know that complete is set at this point. If STARTED isn't set
710 	 * anymore, then the request isn't active and the "timeout" should
711 	 * just be ignored. This can happen due to the bitflag ordering.
712 	 * Timeout first checks if STARTED is set, and if it is, assumes
713 	 * the request is active. But if we race with completion, then
714 	 * both flags will get cleared. So check here again, and ignore
715 	 * a timeout event with a request that isn't active.
716 	 */
717 	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
718 		return;
719 
720 	if (ops->timeout)
721 		ret = ops->timeout(req, reserved);
722 
723 	switch (ret) {
724 	case BLK_EH_HANDLED:
725 		__blk_mq_complete_request(req);
726 		break;
727 	case BLK_EH_RESET_TIMER:
728 		blk_add_timer(req);
729 		blk_clear_rq_complete(req);
730 		break;
731 	case BLK_EH_NOT_HANDLED:
732 		break;
733 	default:
734 		printk(KERN_ERR "block: bad eh return: %d\n", ret);
735 		break;
736 	}
737 }
738 
739 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
740 		struct request *rq, void *priv, bool reserved)
741 {
742 	struct blk_mq_timeout_data *data = priv;
743 
744 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
745 		return;
746 
747 	/*
748 	 * The rq being checked may have been freed and reallocated
749 	 * out already here, we avoid this race by checking rq->deadline
750 	 * and REQ_ATOM_COMPLETE flag together:
751 	 *
752 	 * - if rq->deadline is observed as new value because of
753 	 *   reusing, the rq won't be timed out because of timing.
754 	 * - if rq->deadline is observed as previous value,
755 	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
756 	 *   because we put a barrier between setting rq->deadline
757 	 *   and clearing the flag in blk_mq_start_request(), so
758 	 *   this rq won't be timed out too.
759 	 */
760 	if (time_after_eq(jiffies, rq->deadline)) {
761 		if (!blk_mark_rq_complete(rq))
762 			blk_mq_rq_timed_out(rq, reserved);
763 	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
764 		data->next = rq->deadline;
765 		data->next_set = 1;
766 	}
767 }
768 
769 static void blk_mq_timeout_work(struct work_struct *work)
770 {
771 	struct request_queue *q =
772 		container_of(work, struct request_queue, timeout_work);
773 	struct blk_mq_timeout_data data = {
774 		.next		= 0,
775 		.next_set	= 0,
776 	};
777 	int i;
778 
779 	/* A deadlock might occur if a request is stuck requiring a
780 	 * timeout at the same time a queue freeze is waiting
781 	 * completion, since the timeout code would not be able to
782 	 * acquire the queue reference here.
783 	 *
784 	 * That's why we don't use blk_queue_enter here; instead, we use
785 	 * percpu_ref_tryget directly, because we need to be able to
786 	 * obtain a reference even in the short window between the queue
787 	 * starting to freeze, by dropping the first reference in
788 	 * blk_freeze_queue_start, and the moment the last request is
789 	 * consumed, marked by the instant q_usage_counter reaches
790 	 * zero.
791 	 */
792 	if (!percpu_ref_tryget(&q->q_usage_counter))
793 		return;
794 
795 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
796 
797 	if (data.next_set) {
798 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
799 		mod_timer(&q->timeout, data.next);
800 	} else {
801 		struct blk_mq_hw_ctx *hctx;
802 
803 		queue_for_each_hw_ctx(q, hctx, i) {
804 			/* the hctx may be unmapped, so check it here */
805 			if (blk_mq_hw_queue_mapped(hctx))
806 				blk_mq_tag_idle(hctx);
807 		}
808 	}
809 	blk_queue_exit(q);
810 }
811 
/* Argument bundle handed to flush_busy_ctx() through the sbitmap walk. */
struct flush_busy_ctx_data {
	/* hardware queue whose software queues are being flushed */
	struct blk_mq_hw_ctx *hctx;
	/* list that collects the flushed requests */
	struct list_head *list;
};
816 
817 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
818 {
819 	struct flush_busy_ctx_data *flush_data = data;
820 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
821 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
822 
823 	sbitmap_clear_bit(sb, bitnr);
824 	spin_lock(&ctx->lock);
825 	list_splice_tail_init(&ctx->rq_list, flush_data->list);
826 	spin_unlock(&ctx->lock);
827 	return true;
828 }
829 
830 /*
831  * Process software queues that have been marked busy, splicing them
832  * to the for-dispatch
833  */
834 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
835 {
836 	struct flush_busy_ctx_data data = {
837 		.hctx = hctx,
838 		.list = list,
839 	};
840 
841 	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
842 }
843 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
844 
845 static inline unsigned int queued_to_index(unsigned int queued)
846 {
847 	if (!queued)
848 		return 0;
849 
850 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
851 }
852 
853 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
854 			   bool wait)
855 {
856 	struct blk_mq_alloc_data data = {
857 		.q = rq->q,
858 		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
859 		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
860 	};
861 
862 	might_sleep_if(wait);
863 
864 	if (rq->tag != -1)
865 		goto done;
866 
867 	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
868 		data.flags |= BLK_MQ_REQ_RESERVED;
869 
870 	rq->tag = blk_mq_get_tag(&data);
871 	if (rq->tag >= 0) {
872 		if (blk_mq_tag_busy(data.hctx)) {
873 			rq->rq_flags |= RQF_MQ_INFLIGHT;
874 			atomic_inc(&data.hctx->nr_active);
875 		}
876 		data.hctx->tags->rqs[rq->tag] = rq;
877 	}
878 
879 done:
880 	if (hctx)
881 		*hctx = data.hctx;
882 	return rq->tag != -1;
883 }
884 
885 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
886 				    struct request *rq)
887 {
888 	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
889 	rq->tag = -1;
890 
891 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
892 		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
893 		atomic_dec(&hctx->nr_active);
894 	}
895 }
896 
897 static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
898 				       struct request *rq)
899 {
900 	if (rq->tag == -1 || rq->internal_tag == -1)
901 		return;
902 
903 	__blk_mq_put_driver_tag(hctx, rq);
904 }
905 
906 static void blk_mq_put_driver_tag(struct request *rq)
907 {
908 	struct blk_mq_hw_ctx *hctx;
909 
910 	if (rq->tag == -1 || rq->internal_tag == -1)
911 		return;
912 
913 	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
914 	__blk_mq_put_driver_tag(hctx, rq);
915 }
916 
917 /*
918  * If we fail getting a driver tag because all the driver tags are already
919  * assigned and on the dispatch list, BUT the first entry does not have a
920  * tag, then we could deadlock. For that case, move entries with assigned
921  * driver tags to the front, leaving the set of tagged requests in the
922  * same order, and the untagged set in the same order.
923  */
924 static bool reorder_tags_to_front(struct list_head *list)
925 {
926 	struct request *rq, *tmp, *first = NULL;
927 
928 	list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
929 		if (rq == first)
930 			break;
931 		if (rq->tag != -1) {
932 			list_move(&rq->queuelist, list);
933 			if (!first)
934 				first = rq;
935 		}
936 	}
937 
938 	return first != NULL;
939 }
940 
941 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
942 				void *key)
943 {
944 	struct blk_mq_hw_ctx *hctx;
945 
946 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
947 
948 	list_del(&wait->entry);
949 	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
950 	blk_mq_run_hw_queue(hctx, true);
951 	return 1;
952 }
953 
954 static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
955 {
956 	struct sbq_wait_state *ws;
957 
958 	/*
959 	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
960 	 * The thread which wins the race to grab this bit adds the hardware
961 	 * queue to the wait queue.
962 	 */
963 	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
964 	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
965 		return false;
966 
967 	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
968 	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
969 
970 	/*
971 	 * As soon as this returns, it's no longer safe to fiddle with
972 	 * hctx->dispatch_wait, since a completion can wake up the wait queue
973 	 * and unlock the bit.
974 	 */
975 	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
976 	return true;
977 }
978 
979 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
980 {
981 	struct blk_mq_hw_ctx *hctx;
982 	struct request *rq;
983 	int errors, queued;
984 
985 	if (list_empty(list))
986 		return false;
987 
988 	/*
989 	 * Now process all the entries, sending them to the driver.
990 	 */
991 	errors = queued = 0;
992 	do {
993 		struct blk_mq_queue_data bd;
994 		blk_status_t ret;
995 
996 		rq = list_first_entry(list, struct request, queuelist);
997 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
998 			if (!queued && reorder_tags_to_front(list))
999 				continue;
1000 
1001 			/*
1002 			 * The initial allocation attempt failed, so we need to
1003 			 * rerun the hardware queue when a tag is freed.
1004 			 */
1005 			if (!blk_mq_dispatch_wait_add(hctx))
1006 				break;
1007 
1008 			/*
1009 			 * It's possible that a tag was freed in the window
1010 			 * between the allocation failure and adding the
1011 			 * hardware queue to the wait queue.
1012 			 */
1013 			if (!blk_mq_get_driver_tag(rq, &hctx, false))
1014 				break;
1015 		}
1016 
1017 		list_del_init(&rq->queuelist);
1018 
1019 		bd.rq = rq;
1020 
1021 		/*
1022 		 * Flag last if we have no more requests, or if we have more
1023 		 * but can't assign a driver tag to it.
1024 		 */
1025 		if (list_empty(list))
1026 			bd.last = true;
1027 		else {
1028 			struct request *nxt;
1029 
1030 			nxt = list_first_entry(list, struct request, queuelist);
1031 			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1032 		}
1033 
1034 		ret = q->mq_ops->queue_rq(hctx, &bd);
1035 		if (ret == BLK_STS_RESOURCE) {
1036 			blk_mq_put_driver_tag_hctx(hctx, rq);
1037 			list_add(&rq->queuelist, list);
1038 			__blk_mq_requeue_request(rq);
1039 			break;
1040 		}
1041 
1042 		if (unlikely(ret != BLK_STS_OK)) {
1043 			errors++;
1044 			blk_mq_end_request(rq, BLK_STS_IOERR);
1045 			continue;
1046 		}
1047 
1048 		queued++;
1049 	} while (!list_empty(list));
1050 
1051 	hctx->dispatched[queued_to_index(queued)]++;
1052 
1053 	/*
1054 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
1055 	 * that is where we will continue on next queue run.
1056 	 */
1057 	if (!list_empty(list)) {
1058 		/*
1059 		 * If an I/O scheduler has been configured and we got a driver
1060 		 * tag for the next request already, free it again.
1061 		 */
1062 		rq = list_first_entry(list, struct request, queuelist);
1063 		blk_mq_put_driver_tag(rq);
1064 
1065 		spin_lock(&hctx->lock);
1066 		list_splice_init(list, &hctx->dispatch);
1067 		spin_unlock(&hctx->lock);
1068 
1069 		/*
1070 		 * If SCHED_RESTART was set by the caller of this function and
1071 		 * it is no longer set that means that it was cleared by another
1072 		 * thread and hence that a queue rerun is needed.
1073 		 *
1074 		 * If TAG_WAITING is set that means that an I/O scheduler has
1075 		 * been configured and another thread is waiting for a driver
1076 		 * tag. To guarantee fairness, do not rerun this hardware queue
1077 		 * but let the other thread grab the driver tag.
1078 		 *
1079 		 * If no I/O scheduler has been configured it is possible that
1080 		 * the hardware queue got stopped and restarted before requests
1081 		 * were pushed back onto the dispatch list. Rerun the queue to
1082 		 * avoid starvation. Notes:
1083 		 * - blk_mq_run_hw_queue() checks whether or not a queue has
1084 		 *   been stopped before rerunning a queue.
1085 		 * - Some but not all block drivers stop a queue before
1086 		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1087 		 *   and dm-rq.
1088 		 */
1089 		if (!blk_mq_sched_needs_restart(hctx) &&
1090 		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
1091 			blk_mq_run_hw_queue(hctx, true);
1092 	}
1093 
1094 	return (queued + errors) != 0;
1095 }
1096 
1097 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1098 {
1099 	int srcu_idx;
1100 
1101 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1102 		cpu_online(hctx->next_cpu));
1103 
1104 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1105 		rcu_read_lock();
1106 		blk_mq_sched_dispatch_requests(hctx);
1107 		rcu_read_unlock();
1108 	} else {
1109 		might_sleep();
1110 
1111 		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
1112 		blk_mq_sched_dispatch_requests(hctx);
1113 		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
1114 	}
1115 }
1116 
1117 /*
1118  * It'd be great if the workqueue API had a way to pass
1119  * in a mask and had some smarts for more clever placement.
1120  * For now we just round-robin here, switching for every
1121  * BLK_MQ_CPU_WORK_BATCH queued items.
1122  */
1123 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1124 {
1125 	if (hctx->queue->nr_hw_queues == 1)
1126 		return WORK_CPU_UNBOUND;
1127 
1128 	if (--hctx->next_cpu_batch <= 0) {
1129 		int next_cpu;
1130 
1131 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
1132 		if (next_cpu >= nr_cpu_ids)
1133 			next_cpu = cpumask_first(hctx->cpumask);
1134 
1135 		hctx->next_cpu = next_cpu;
1136 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1137 	}
1138 
1139 	return hctx->next_cpu;
1140 }
1141 
1142 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1143 					unsigned long msecs)
1144 {
1145 	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1146 		return;
1147 
1148 	if (unlikely(blk_mq_hctx_stopped(hctx)))
1149 		return;
1150 
1151 	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1152 		int cpu = get_cpu();
1153 		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1154 			__blk_mq_run_hw_queue(hctx);
1155 			put_cpu();
1156 			return;
1157 		}
1158 
1159 		put_cpu();
1160 	}
1161 
1162 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1163 					 &hctx->run_work,
1164 					 msecs_to_jiffies(msecs));
1165 }
1166 
/*
 * Request an asynchronous run of @hctx after a delay of @msecs
 * milliseconds.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1172 
/*
 * Run @hctx with no delay. @async selects whether the run must be handed
 * to kblockd (true) or may be performed from the caller's context (false).
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	__blk_mq_delay_run_hw_queue(hctx, async, 0);
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
1178 
1179 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1180 {
1181 	struct blk_mq_hw_ctx *hctx;
1182 	int i;
1183 
1184 	queue_for_each_hw_ctx(q, hctx, i) {
1185 		if (!blk_mq_hctx_has_pending(hctx) ||
1186 		    blk_mq_hctx_stopped(hctx))
1187 			continue;
1188 
1189 		blk_mq_run_hw_queue(hctx, async);
1190 	}
1191 }
1192 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1193 
1194 /**
1195  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1196  * @q: request queue.
1197  *
1198  * The caller is responsible for serializing this function against
1199  * blk_mq_{start,stop}_hw_queue().
1200  */
1201 bool blk_mq_queue_stopped(struct request_queue *q)
1202 {
1203 	struct blk_mq_hw_ctx *hctx;
1204 	int i;
1205 
1206 	queue_for_each_hw_ctx(q, hctx, i)
1207 		if (blk_mq_hctx_stopped(hctx))
1208 			return true;
1209 
1210 	return false;
1211 }
1212 EXPORT_SYMBOL(blk_mq_queue_stopped);
1213 
/*
 * This function is often used by a driver to pause .queue_rq() when
 * there aren't enough resources or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	/* Cancel a queued delayed run, then flag the hctx as stopped. */
	cancel_delayed_work(&hctx->run_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1230 
/*
 * Stop all hardware queues of @q by calling blk_mq_stop_hw_queue() on
 * each of them.
 *
 * This function is often used by a driver to pause .queue_rq() when
 * there aren't enough resources or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1249 
/*
 * Clear the STOPPED state of @hctx unconditionally and request a
 * synchronous (async == false) run of it.
 */
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
1257 
/*
 * Start all hardware queues of @q by calling blk_mq_start_hw_queue() on
 * each of them.
 */
void blk_mq_start_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
1267 
1268 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1269 {
1270 	if (!blk_mq_hctx_stopped(hctx))
1271 		return;
1272 
1273 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1274 	blk_mq_run_hw_queue(hctx, async);
1275 }
1276 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1277 
/*
 * Restart every hardware queue of @q that is currently stopped; @async is
 * passed through to blk_mq_start_stopped_hw_queue().
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1287 
1288 static void blk_mq_run_work_fn(struct work_struct *work)
1289 {
1290 	struct blk_mq_hw_ctx *hctx;
1291 
1292 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1293 
1294 	/*
1295 	 * If we are stopped, don't run the queue. The exception is if
1296 	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
1297 	 * the STOPPED bit and run it.
1298 	 */
1299 	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
1300 		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
1301 			return;
1302 
1303 		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1304 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1305 	}
1306 
1307 	__blk_mq_run_hw_queue(hctx);
1308 }
1309 
1310 
1311 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1312 {
1313 	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1314 		return;
1315 
1316 	/*
1317 	 * Stop the hw queue, then modify currently delayed work.
1318 	 * This should prevent us from running the queue prematurely.
1319 	 * Mark the queue as auto-clearing STOPPED when it runs.
1320 	 */
1321 	blk_mq_stop_hw_queue(hctx);
1322 	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1323 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1324 					&hctx->run_work,
1325 					msecs_to_jiffies(msecs));
1326 }
1327 EXPORT_SYMBOL(blk_mq_delay_queue);
1328 
1329 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1330 					    struct request *rq,
1331 					    bool at_head)
1332 {
1333 	struct blk_mq_ctx *ctx = rq->mq_ctx;
1334 
1335 	lockdep_assert_held(&ctx->lock);
1336 
1337 	trace_block_rq_insert(hctx->queue, rq);
1338 
1339 	if (at_head)
1340 		list_add(&rq->queuelist, &ctx->rq_list);
1341 	else
1342 		list_add_tail(&rq->queuelist, &ctx->rq_list);
1343 }
1344 
1345 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1346 			     bool at_head)
1347 {
1348 	struct blk_mq_ctx *ctx = rq->mq_ctx;
1349 
1350 	lockdep_assert_held(&ctx->lock);
1351 
1352 	__blk_mq_insert_req_list(hctx, rq, at_head);
1353 	blk_mq_hctx_mark_pending(hctx, ctx);
1354 }
1355 
1356 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1357 			    struct list_head *list)
1358 
1359 {
1360 	/*
1361 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1362 	 * offline now
1363 	 */
1364 	spin_lock(&ctx->lock);
1365 	while (!list_empty(list)) {
1366 		struct request *rq;
1367 
1368 		rq = list_first_entry(list, struct request, queuelist);
1369 		BUG_ON(rq->mq_ctx != ctx);
1370 		list_del_init(&rq->queuelist);
1371 		__blk_mq_insert_req_list(hctx, rq, false);
1372 	}
1373 	blk_mq_hctx_mark_pending(hctx, ctx);
1374 	spin_unlock(&ctx->lock);
1375 }
1376 
1377 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1378 {
1379 	struct request *rqa = container_of(a, struct request, queuelist);
1380 	struct request *rqb = container_of(b, struct request, queuelist);
1381 
1382 	return !(rqa->mq_ctx < rqb->mq_ctx ||
1383 		 (rqa->mq_ctx == rqb->mq_ctx &&
1384 		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1385 }
1386 
/*
 * Flush the blk-mq requests collected on @plug->mq_list.
 *
 * The requests are sorted so that those of the same software queue (ctx)
 * become adjacent, then they are handed to blk_mq_sched_insert_requests()
 * one per-ctx batch at a time. @from_schedule is reported by the unplug
 * tracepoint and passed on to the insert call.
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	/* Take the whole plug list private; plug->mq_list is left empty. */
	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			/* New ctx: submit the batch gathered for the old one. */
			if (this_ctx) {
				trace_block_unplug(this_q, depth, from_schedule);
				blk_mq_sched_insert_requests(this_q, this_ctx,
								&ctx_list,
								from_schedule);
			}

			/* Start a fresh batch for the ctx of this request. */
			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		trace_block_unplug(this_q, depth, from_schedule);
		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
						from_schedule);
	}
}
1435 
/*
 * Initialise @rq from @bio and start I/O accounting for the new request.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	blk_init_request_from_bio(rq, bio);

	blk_account_io_start(rq, true);
}
1442 
1443 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1444 {
1445 	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1446 		!blk_queue_nomerges(hctx->queue);
1447 }
1448 
/*
 * Append @rq to its software queue and mark it pending in @hctx, taking
 * and releasing @ctx->lock around the insertion.
 */
static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
				   struct blk_mq_ctx *ctx,
				   struct request *rq)
{
	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq, false);
	spin_unlock(&ctx->lock);
}
1457 
1458 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1459 {
1460 	if (rq->tag != -1)
1461 		return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1462 
1463 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1464 }
1465 
/*
 * Try to hand @rq straight to the driver's ->queue_rq(), bypassing the
 * software queues.
 *
 * On success *@cookie is set to the cookie of @rq. On a driver error other
 * than BLK_STS_RESOURCE the request is ended with that status and *@cookie
 * becomes BLK_QC_T_NONE. In all remaining cases (queue stopped or quiesced,
 * an elevator is attached, no driver tag available, or the driver reported
 * BLK_STS_RESOURCE) the request is inserted through
 * blk_mq_sched_insert_request() and *@cookie is left untouched.
 *
 * @may_sleep is passed on to blk_mq_sched_insert_request(). The caller
 * holds the RCU or SRCU read lock (see blk_mq_try_issue_directly()).
 */
static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
					struct request *rq,
					blk_qc_t *cookie, bool may_sleep)
{
	struct request_queue *q = rq->q;
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = true,
	};
	blk_qc_t new_cookie;
	blk_status_t ret;
	bool run_queue = true;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
		/* Insert only; do not ask for a queue run. */
		run_queue = false;
		goto insert;
	}

	/* With an I/O scheduler attached, requests go through it. */
	if (q->elevator)
		goto insert;

	/* Non-waiting attempt to get a driver tag. */
	if (!blk_mq_get_driver_tag(rq, NULL, false))
		goto insert;

	new_cookie = request_to_qc_t(hctx, rq);

	/*
	 * For OK queue, we are done. For error, kill it. Any other
	 * error (busy), just add it to our list as we previously
	 * would have done
	 */
	ret = q->mq_ops->queue_rq(hctx, &bd);
	switch (ret) {
	case BLK_STS_OK:
		*cookie = new_cookie;
		return;
	case BLK_STS_RESOURCE:
		__blk_mq_requeue_request(rq);
		goto insert;
	default:
		*cookie = BLK_QC_T_NONE;
		blk_mq_end_request(rq, ret);
		return;
	}

insert:
	blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
}
1515 
1516 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1517 		struct request *rq, blk_qc_t *cookie)
1518 {
1519 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1520 		rcu_read_lock();
1521 		__blk_mq_try_issue_directly(hctx, rq, cookie, false);
1522 		rcu_read_unlock();
1523 	} else {
1524 		unsigned int srcu_idx;
1525 
1526 		might_sleep();
1527 
1528 		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
1529 		__blk_mq_try_issue_directly(hctx, rq, cookie, true);
1530 		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
1531 	}
1532 }
1533 
/*
 * Submit @bio to the blk-mq queue @q.
 *
 * The bio is bounced and split as needed, then either merged into an
 * existing request (plug list or scheduler) or turned into a new request
 * that is plugged, inserted or issued directly depending on the request
 * type, the current plug, the number of hardware queues and whether an
 * I/O scheduler is attached.
 *
 * Returns BLK_QC_T_NONE when integrity preparation fails, when the bio was
 * merged, or when no request could be allocated; otherwise the cookie
 * computed for the new request, possibly updated by a direct issue.
 */
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	const int is_sync = op_is_sync(bio->bi_opf);
	const int is_flush_fua = op_is_flush(bio->bi_opf);
	struct blk_mq_alloc_data data = { .flags = 0 };
	struct request *rq;
	unsigned int request_count = 0;
	struct blk_plug *plug;
	struct request *same_queue_rq = NULL;
	blk_qc_t cookie;
	unsigned int wb_acct;

	blk_queue_bounce(q, &bio);

	blk_queue_split(q, &bio);

	if (!bio_integrity_prep(bio))
		return BLK_QC_T_NONE;

	/* Try the plug list first, unless flush/FUA or merges are disabled. */
	if (!is_flush_fua && !blk_queue_nomerges(q) &&
	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
		return BLK_QC_T_NONE;

	if (blk_mq_sched_bio_merge(q, bio))
		return BLK_QC_T_NONE;

	wb_acct = wbt_wait(q->rq_wb, bio, NULL);

	trace_block_getrq(q, bio, bio->bi_opf);

	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
	if (unlikely(!rq)) {
		/* Undo the writeback-throttling accounting taken above. */
		__wbt_done(q->rq_wb, wb_acct);
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		return BLK_QC_T_NONE;
	}

	wbt_track(&rq->issue_stat, wb_acct);

	cookie = request_to_qc_t(data.hctx, rq);

	plug = current->plug;
	if (unlikely(is_flush_fua)) {
		/* Flush/FUA: via the scheduler if present, else the flush code. */
		blk_mq_put_ctx(data.ctx);
		blk_mq_bio_to_request(rq, bio);
		if (q->elevator) {
			blk_mq_sched_insert_request(rq, false, true, true,
					true);
		} else {
			blk_insert_flush(rq);
			blk_mq_run_hw_queue(data.hctx, true);
		}
	} else if (plug && q->nr_hw_queues == 1) {
		/* Single hardware queue with a plug: batch on the plug list. */
		struct request *last = NULL;

		blk_mq_put_ctx(data.ctx);
		blk_mq_bio_to_request(rq, bio);

		/*
		 * @request_count may become stale because of schedule
		 * out, so check the list again.
		 */
		if (list_empty(&plug->mq_list))
			request_count = 0;
		else if (blk_queue_nomerges(q))
			request_count = blk_plug_queued_count(q);

		if (!request_count)
			trace_block_plug(q);
		else
			last = list_entry_rq(plug->mq_list.prev);

		/* Flush the plug when it is full or its last request is large. */
		if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
			blk_flush_plug_list(plug, false);
			trace_block_plug(q);
		}

		list_add_tail(&rq->queuelist, &plug->mq_list);
	} else if (plug && !blk_queue_nomerges(q)) {
		blk_mq_bio_to_request(rq, bio);

		/*
		 * We do limited plugging. If the bio can be merged, do that.
		 * Otherwise the existing request in the plug list will be
		 * issued. So the plug list will have one request at most
		 * The plug list might get flushed before this. If that happens,
		 * the plug list is empty, and same_queue_rq is invalid.
		 */
		if (list_empty(&plug->mq_list))
			same_queue_rq = NULL;
		if (same_queue_rq)
			list_del_init(&same_queue_rq->queuelist);
		list_add_tail(&rq->queuelist, &plug->mq_list);

		blk_mq_put_ctx(data.ctx);

		/* Issue the request that was previously plugged, if any. */
		if (same_queue_rq) {
			data.hctx = blk_mq_map_queue(q,
					same_queue_rq->mq_ctx->cpu);
			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
					&cookie);
		}
	} else if (q->nr_hw_queues > 1 && is_sync) {
		/* Sync I/O on a multi-queue device: try to issue right away. */
		blk_mq_put_ctx(data.ctx);
		blk_mq_bio_to_request(rq, bio);
		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
	} else if (q->elevator) {
		blk_mq_put_ctx(data.ctx);
		blk_mq_bio_to_request(rq, bio);
		blk_mq_sched_insert_request(rq, false, true, true, true);
	} else {
		/* No scheduler: queue on the software queue and kick the hctx. */
		blk_mq_put_ctx(data.ctx);
		blk_mq_bio_to_request(rq, bio);
		blk_mq_queue_io(data.hctx, data.ctx, rq);
		blk_mq_run_hw_queue(data.hctx, true);
	}

	return cookie;
}
1655 
1656 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1657 		     unsigned int hctx_idx)
1658 {
1659 	struct page *page;
1660 
1661 	if (tags->rqs && set->ops->exit_request) {
1662 		int i;
1663 
1664 		for (i = 0; i < tags->nr_tags; i++) {
1665 			struct request *rq = tags->static_rqs[i];
1666 
1667 			if (!rq)
1668 				continue;
1669 			set->ops->exit_request(set, rq, hctx_idx);
1670 			tags->static_rqs[i] = NULL;
1671 		}
1672 	}
1673 
1674 	while (!list_empty(&tags->page_list)) {
1675 		page = list_first_entry(&tags->page_list, struct page, lru);
1676 		list_del_init(&page->lru);
1677 		/*
1678 		 * Remove kmemleak object previously allocated in
1679 		 * blk_mq_init_rq_map().
1680 		 */
1681 		kmemleak_free(page_address(page));
1682 		__free_pages(page, page->private);
1683 	}
1684 }
1685 
/*
 * Free the request pointer arrays of @tags and then @tags itself. The
 * requests are not released here; see blk_mq_free_rqs().
 */
void blk_mq_free_rq_map(struct blk_mq_tags *tags)
{
	kfree(tags->rqs);
	tags->rqs = NULL;
	kfree(tags->static_rqs);
	tags->static_rqs = NULL;

	blk_mq_free_tags(tags);
}
1695 
1696 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1697 					unsigned int hctx_idx,
1698 					unsigned int nr_tags,
1699 					unsigned int reserved_tags)
1700 {
1701 	struct blk_mq_tags *tags;
1702 	int node;
1703 
1704 	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
1705 	if (node == NUMA_NO_NODE)
1706 		node = set->numa_node;
1707 
1708 	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
1709 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1710 	if (!tags)
1711 		return NULL;
1712 
1713 	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1714 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1715 				 node);
1716 	if (!tags->rqs) {
1717 		blk_mq_free_tags(tags);
1718 		return NULL;
1719 	}
1720 
1721 	tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1722 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1723 				 node);
1724 	if (!tags->static_rqs) {
1725 		kfree(tags->rqs);
1726 		blk_mq_free_tags(tags);
1727 		return NULL;
1728 	}
1729 
1730 	return tags;
1731 }
1732 
/* Size in bytes of a page allocation of the given @order. */
static size_t order_to_size(unsigned int order)
{
	return (size_t)PAGE_SIZE << order;
}
1737 
/*
 * Allocate and initialise @depth requests for @tags.
 *
 * Requests are carved out of zeroed page allocations of order at most 4,
 * each request being cacheline-aligned and followed by set->cmd_size bytes
 * of driver payload. Every request is recorded in tags->static_rqs[] and,
 * when the driver provides ->init_request(), initialised through it.
 *
 * Returns 0 on success. On failure everything allocated so far is released
 * with blk_mq_free_rqs() and -ENOMEM is returned.
 */
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
		     unsigned int hctx_idx, unsigned int depth)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;
	int node;

	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
	if (node == NUMA_NO_NODE)
		node = set->numa_node;

	INIT_LIST_HEAD(&tags->page_list);

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				cache_line_size());
	/* Bytes still needed for the requests not yet placed. */
	left = rq_size * depth;

	for (i = 0; i < depth; ) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		/* Do not allocate a larger order than the remainder needs. */
		while (this_order && left < order_to_size(this_order - 1))
			this_order--;

		/*
		 * Try the chosen order and fall back to smaller ones on
		 * failure, giving up once order 0 failed or the next smaller
		 * order could not hold even a single request.
		 */
		do {
			page = alloc_pages_node(node,
				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
				this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			goto fail;

		/* Remember the order so blk_mq_free_rqs() can free the page. */
		page->private = this_order;
		list_add_tail(&page->lru, &tags->page_list);

		p = page_address(page);
		/*
		 * Allow kmemleak to scan these pages as they contain pointers
		 * to additional allocations like via ops->init_request().
		 */
		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			struct request *rq = p;

			tags->static_rqs[i] = rq;
			if (set->ops->init_request) {
				if (set->ops->init_request(set, rq, hctx_idx,
						node)) {
					/*
					 * Clear the slot so the cleanup does
					 * not call ->exit_request() on it.
					 */
					tags->static_rqs[i] = NULL;
					goto fail;
				}
			}

			p += rq_size;
			i++;
		}
	}
	return 0;

fail:
	blk_mq_free_rqs(set, tags, hctx_idx);
	return -ENOMEM;
}
1817 
1818 /*
1819  * 'cpu' is going away. splice any existing rq_list entries from this
1820  * software queue to the hw queue dispatch list, and ensure that it
1821  * gets run.
1822  */
1823 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
1824 {
1825 	struct blk_mq_hw_ctx *hctx;
1826 	struct blk_mq_ctx *ctx;
1827 	LIST_HEAD(tmp);
1828 
1829 	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
1830 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1831 
1832 	spin_lock(&ctx->lock);
1833 	if (!list_empty(&ctx->rq_list)) {
1834 		list_splice_init(&ctx->rq_list, &tmp);
1835 		blk_mq_hctx_clear_pending(hctx, ctx);
1836 	}
1837 	spin_unlock(&ctx->lock);
1838 
1839 	if (list_empty(&tmp))
1840 		return 0;
1841 
1842 	spin_lock(&hctx->lock);
1843 	list_splice_tail_init(&tmp, &hctx->dispatch);
1844 	spin_unlock(&hctx->lock);
1845 
1846 	blk_mq_run_hw_queue(hctx, true);
1847 	return 0;
1848 }
1849 
/*
 * Unregister @hctx from the CPUHP_BLK_MQ_DEAD hotplug state without
 * invoking the callback.
 */
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
					    &hctx->cpuhp_dead);
}
1855 
/*
 * Tear down hardware context @hctx (index @hctx_idx) of @q: debugfs entry,
 * tag activity state, the flush request's driver data, scheduler and driver
 * per-hctx state, SRCU state for blocking queues, the CPU hotplug
 * registration, the flush queue and the ctx bitmap.
 *
 * hctx->ctxs will be freed in queue's release handler.
 */
static void blk_mq_exit_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	blk_mq_debugfs_unregister_hctx(hctx);

	blk_mq_tag_idle(hctx);

	/* The flush request got ->init_request() in blk_mq_init_hctx(). */
	if (set->ops->exit_request)
		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);

	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);

	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);

	/* SRCU state is only initialised for BLK_MQ_F_BLOCKING queues. */
	if (hctx->flags & BLK_MQ_F_BLOCKING)
		cleanup_srcu_struct(hctx->queue_rq_srcu);

	blk_mq_remove_cpuhp(hctx);
	blk_free_flush_queue(hctx->fq);
	sbitmap_free(&hctx->ctx_map);
}
1880 
/*
 * Tear down the first @nr_queue hardware contexts of @q (indices
 * 0 .. @nr_queue - 1); the walk stops when index @nr_queue is reached.
 */
static void blk_mq_exit_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set, int nr_queue)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (i == nr_queue)
			break;
		blk_mq_exit_hctx(q, set, hctx, i);
	}
}
1893 
1894 static int blk_mq_init_hctx(struct request_queue *q,
1895 		struct blk_mq_tag_set *set,
1896 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1897 {
1898 	int node;
1899 
1900 	node = hctx->numa_node;
1901 	if (node == NUMA_NO_NODE)
1902 		node = hctx->numa_node = set->numa_node;
1903 
1904 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1905 	spin_lock_init(&hctx->lock);
1906 	INIT_LIST_HEAD(&hctx->dispatch);
1907 	hctx->queue = q;
1908 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1909 
1910 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
1911 
1912 	hctx->tags = set->tags[hctx_idx];
1913 
1914 	/*
1915 	 * Allocate space for all possible cpus to avoid allocation at
1916 	 * runtime
1917 	 */
1918 	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1919 					GFP_KERNEL, node);
1920 	if (!hctx->ctxs)
1921 		goto unregister_cpu_notifier;
1922 
1923 	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1924 			      node))
1925 		goto free_ctxs;
1926 
1927 	hctx->nr_ctx = 0;
1928 
1929 	if (set->ops->init_hctx &&
1930 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1931 		goto free_bitmap;
1932 
1933 	if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
1934 		goto exit_hctx;
1935 
1936 	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1937 	if (!hctx->fq)
1938 		goto sched_exit_hctx;
1939 
1940 	if (set->ops->init_request &&
1941 	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
1942 				   node))
1943 		goto free_fq;
1944 
1945 	if (hctx->flags & BLK_MQ_F_BLOCKING)
1946 		init_srcu_struct(hctx->queue_rq_srcu);
1947 
1948 	blk_mq_debugfs_register_hctx(q, hctx);
1949 
1950 	return 0;
1951 
1952  free_fq:
1953 	kfree(hctx->fq);
1954  sched_exit_hctx:
1955 	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1956  exit_hctx:
1957 	if (set->ops->exit_hctx)
1958 		set->ops->exit_hctx(hctx, hctx_idx);
1959  free_bitmap:
1960 	sbitmap_free(&hctx->ctx_map);
1961  free_ctxs:
1962 	kfree(hctx->ctxs);
1963  unregister_cpu_notifier:
1964 	blk_mq_remove_cpuhp(hctx);
1965 	return -1;
1966 }
1967 
1968 static void blk_mq_init_cpu_queues(struct request_queue *q,
1969 				   unsigned int nr_hw_queues)
1970 {
1971 	unsigned int i;
1972 
1973 	for_each_possible_cpu(i) {
1974 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1975 		struct blk_mq_hw_ctx *hctx;
1976 
1977 		__ctx->cpu = i;
1978 		spin_lock_init(&__ctx->lock);
1979 		INIT_LIST_HEAD(&__ctx->rq_list);
1980 		__ctx->queue = q;
1981 
1982 		/* If the cpu isn't present, the cpu is mapped to first hctx */
1983 		if (!cpu_present(i))
1984 			continue;
1985 
1986 		hctx = blk_mq_map_queue(q, i);
1987 
1988 		/*
1989 		 * Set local node, IFF we have more than one hw queue. If
1990 		 * not, we remain on the home node of the device
1991 		 */
1992 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1993 			hctx->numa_node = local_memory_node(cpu_to_node(i));
1994 	}
1995 }
1996 
1997 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
1998 {
1999 	int ret = 0;
2000 
2001 	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2002 					set->queue_depth, set->reserved_tags);
2003 	if (!set->tags[hctx_idx])
2004 		return false;
2005 
2006 	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2007 				set->queue_depth);
2008 	if (!ret)
2009 		return true;
2010 
2011 	blk_mq_free_rq_map(set->tags[hctx_idx]);
2012 	set->tags[hctx_idx] = NULL;
2013 	return false;
2014 }
2015 
2016 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2017 					 unsigned int hctx_idx)
2018 {
2019 	if (set->tags[hctx_idx]) {
2020 		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2021 		blk_mq_free_rq_map(set->tags[hctx_idx]);
2022 		set->tags[hctx_idx] = NULL;
2023 	}
2024 }
2025 
/*
 * (Re)build the mapping from software queues to hardware queues for @q.
 *
 * Every present CPU's ctx is attached to the hctx selected by q->mq_map,
 * allocating that hctx's tags on demand and falling back to hctx 0 if the
 * allocation fails. Hardware queues that end up with no software queue
 * lose their tags (except queue 0); the others get their tags pointer,
 * ctx bitmap size and round-robin state set up.
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i, hctx_idx;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct blk_mq_tag_set *set = q->tag_set;

	/*
	 * Avoid others reading incomplete hctx->cpumask through sysfs
	 */
	mutex_lock(&q->sysfs_lock);

	/* Start from an empty mapping. */
	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues.
	 *
	 * If the cpu isn't present, the cpu is mapped to first hctx.
	 */
	for_each_present_cpu(i) {
		hctx_idx = q->mq_map[i];
		/* unmapped hw queue can be remapped after CPU topo changed */
		if (!set->tags[hctx_idx] &&
		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
			/*
			 * If tags initialization fail for some hctx,
			 * that hctx won't be brought online.  In this
			 * case, remap the current ctx to hctx[0] which
			 * is guaranteed to always have tags allocated
			 */
			q->mq_map[i] = 0;
		}

		ctx = per_cpu_ptr(q->queue_ctx, i);
		hctx = blk_mq_map_queue(q, i);

		/* Record the ctx in the hctx and its slot index in the ctx. */
		cpumask_set_cpu(i, hctx->cpumask);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}

	mutex_unlock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			/* Never unmap queue 0.  We need it as a
			 * fallback in case of a new remap fails
			 * allocation
			 */
			if (i && set->tags[i])
				blk_mq_free_map_and_requests(set, i);

			hctx->tags = NULL;
			continue;
		}

		hctx->tags = set->tags[i];
		WARN_ON(!hctx->tags);

		/*
		 * Set the map size to the number of mapped software queues.
		 * This is more accurate and more efficient than looping
		 * over all possibly mapped software queues.
		 */
		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

		/*
		 * Initialize batch roundrobin counts
		 */
		hctx->next_cpu = cpumask_first(hctx->cpumask);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}
2106 
2107 /*
2108  * Caller needs to ensure that we're either frozen/quiesced, or that
2109  * the queue isn't live yet.
2110  */
2111 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2112 {
2113 	struct blk_mq_hw_ctx *hctx;
2114 	int i;
2115 
2116 	queue_for_each_hw_ctx(q, hctx, i) {
2117 		if (shared) {
2118 			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2119 				atomic_inc(&q->shared_hctx_restart);
2120 			hctx->flags |= BLK_MQ_F_TAG_SHARED;
2121 		} else {
2122 			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2123 				atomic_dec(&q->shared_hctx_restart);
2124 			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2125 		}
2126 	}
2127 }
2128 
/*
 * Apply the shared/unshared state @shared to every request queue on the
 * tag list of @set. Each queue is frozen while its hardware queues are
 * updated. The caller must hold set->tag_list_lock.
 */
static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
					bool shared)
{
	struct request_queue *q;

	lockdep_assert_held(&set->tag_list_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_freeze_queue(q);
		queue_set_hctx_shared(q, shared);
		blk_mq_unfreeze_queue(q);
	}
}
2142 
2143 static void blk_mq_del_queue_tag_set(struct request_queue *q)
2144 {
2145 	struct blk_mq_tag_set *set = q->tag_set;
2146 
2147 	mutex_lock(&set->tag_list_lock);
2148 	list_del_rcu(&q->tag_set_list);
2149 	INIT_LIST_HEAD(&q->tag_set_list);
2150 	if (list_is_singular(&set->tag_list)) {
2151 		/* just transitioned to unshared */
2152 		set->flags &= ~BLK_MQ_F_TAG_SHARED;
2153 		/* update existing queue */
2154 		blk_mq_update_tag_set_depth(set, false);
2155 	}
2156 	mutex_unlock(&set->tag_list_lock);
2157 
2158 	synchronize_rcu();
2159 }
2160 
2161 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2162 				     struct request_queue *q)
2163 {
2164 	q->tag_set = set;
2165 
2166 	mutex_lock(&set->tag_list_lock);
2167 
2168 	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
2169 	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2170 		set->flags |= BLK_MQ_F_TAG_SHARED;
2171 		/* update existing queue */
2172 		blk_mq_update_tag_set_depth(set, true);
2173 	}
2174 	if (set->flags & BLK_MQ_F_TAG_SHARED)
2175 		queue_set_hctx_shared(q, true);
2176 	list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2177 
2178 	mutex_unlock(&set->tag_list_lock);
2179 }
2180 
2181 /*
2182  * It is the actual release handler for mq, but we do it from
2183  * request queue's release handler for avoiding use-after-free
2184  * and headache because q->mq_kobj shouldn't have been introduced,
2185  * but we can't group ctx/kctx kobj without it.
2186  */
2187 void blk_mq_release(struct request_queue *q)
2188 {
2189 	struct blk_mq_hw_ctx *hctx;
2190 	unsigned int i;
2191 
2192 	/* hctx kobj stays in hctx */
2193 	queue_for_each_hw_ctx(q, hctx, i) {
2194 		if (!hctx)
2195 			continue;
2196 		kobject_put(&hctx->kobj);
2197 	}
2198 
2199 	q->mq_map = NULL;
2200 
2201 	kfree(q->queue_hw_ctx);
2202 
2203 	/*
2204 	 * release .mq_kobj and sw queue's kobject now because
2205 	 * both share lifetime with request queue.
2206 	 */
2207 	blk_mq_sysfs_deinit(q);
2208 
2209 	free_percpu(q->queue_ctx);
2210 }
2211 
2212 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2213 {
2214 	struct request_queue *uninit_q, *q;
2215 
2216 	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2217 	if (!uninit_q)
2218 		return ERR_PTR(-ENOMEM);
2219 
2220 	q = blk_mq_init_allocated_queue(set, uninit_q);
2221 	if (IS_ERR(q))
2222 		blk_cleanup_queue(uninit_q);
2223 
2224 	return q;
2225 }
2226 EXPORT_SYMBOL(blk_mq_init_queue);
2227 
2228 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2229 {
2230 	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2231 
2232 	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
2233 			   __alignof__(struct blk_mq_hw_ctx)) !=
2234 		     sizeof(struct blk_mq_hw_ctx));
2235 
2236 	if (tag_set->flags & BLK_MQ_F_BLOCKING)
2237 		hw_ctx_size += sizeof(struct srcu_struct);
2238 
2239 	return hw_ctx_size;
2240 }
2241 
2242 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2243 						struct request_queue *q)
2244 {
2245 	int i, j;
2246 	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2247 
2248 	blk_mq_sysfs_unregister(q);
2249 	for (i = 0; i < set->nr_hw_queues; i++) {
2250 		int node;
2251 
2252 		if (hctxs[i])
2253 			continue;
2254 
2255 		node = blk_mq_hw_queue_to_node(q->mq_map, i);
2256 		hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2257 					GFP_KERNEL, node);
2258 		if (!hctxs[i])
2259 			break;
2260 
2261 		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2262 						node)) {
2263 			kfree(hctxs[i]);
2264 			hctxs[i] = NULL;
2265 			break;
2266 		}
2267 
2268 		atomic_set(&hctxs[i]->nr_active, 0);
2269 		hctxs[i]->numa_node = node;
2270 		hctxs[i]->queue_num = i;
2271 
2272 		if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2273 			free_cpumask_var(hctxs[i]->cpumask);
2274 			kfree(hctxs[i]);
2275 			hctxs[i] = NULL;
2276 			break;
2277 		}
2278 		blk_mq_hctx_kobj_init(hctxs[i]);
2279 	}
2280 	for (j = i; j < q->nr_hw_queues; j++) {
2281 		struct blk_mq_hw_ctx *hctx = hctxs[j];
2282 
2283 		if (hctx) {
2284 			if (hctx->tags)
2285 				blk_mq_free_map_and_requests(set, j);
2286 			blk_mq_exit_hctx(q, set, hctx, j);
2287 			kobject_put(&hctx->kobj);
2288 			hctxs[j] = NULL;
2289 
2290 		}
2291 	}
2292 	q->nr_hw_queues = i;
2293 	blk_mq_sysfs_register(q);
2294 }
2295 
2296 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2297 						  struct request_queue *q)
2298 {
2299 	/* mark the queue as mq asap */
2300 	q->mq_ops = set->ops;
2301 
2302 	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2303 					     blk_mq_poll_stats_bkt,
2304 					     BLK_MQ_POLL_STATS_BKTS, q);
2305 	if (!q->poll_cb)
2306 		goto err_exit;
2307 
2308 	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2309 	if (!q->queue_ctx)
2310 		goto err_exit;
2311 
2312 	/* init q->mq_kobj and sw queues' kobjects */
2313 	blk_mq_sysfs_init(q);
2314 
2315 	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2316 						GFP_KERNEL, set->numa_node);
2317 	if (!q->queue_hw_ctx)
2318 		goto err_percpu;
2319 
2320 	q->mq_map = set->mq_map;
2321 
2322 	blk_mq_realloc_hw_ctxs(set, q);
2323 	if (!q->nr_hw_queues)
2324 		goto err_hctxs;
2325 
2326 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2327 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2328 
2329 	q->nr_queues = nr_cpu_ids;
2330 
2331 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2332 
2333 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
2334 		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2335 
2336 	q->sg_reserved_size = INT_MAX;
2337 
2338 	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2339 	INIT_LIST_HEAD(&q->requeue_list);
2340 	spin_lock_init(&q->requeue_lock);
2341 
2342 	blk_queue_make_request(q, blk_mq_make_request);
2343 
2344 	/*
2345 	 * Do this after blk_queue_make_request() overrides it...
2346 	 */
2347 	q->nr_requests = set->queue_depth;
2348 
2349 	/*
2350 	 * Default to classic polling
2351 	 */
2352 	q->poll_nsec = -1;
2353 
2354 	if (set->ops->complete)
2355 		blk_queue_softirq_done(q, set->ops->complete);
2356 
2357 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2358 	blk_mq_add_queue_tag_set(set, q);
2359 	blk_mq_map_swqueue(q);
2360 
2361 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2362 		int ret;
2363 
2364 		ret = blk_mq_sched_init(q);
2365 		if (ret)
2366 			return ERR_PTR(ret);
2367 	}
2368 
2369 	return q;
2370 
2371 err_hctxs:
2372 	kfree(q->queue_hw_ctx);
2373 err_percpu:
2374 	free_percpu(q->queue_ctx);
2375 err_exit:
2376 	q->mq_ops = NULL;
2377 	return ERR_PTR(-ENOMEM);
2378 }
2379 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2380 
2381 void blk_mq_free_queue(struct request_queue *q)
2382 {
2383 	struct blk_mq_tag_set	*set = q->tag_set;
2384 
2385 	blk_mq_del_queue_tag_set(q);
2386 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2387 }
2388 
2389 /* Basically redo blk_mq_init_queue with queue frozen */
2390 static void blk_mq_queue_reinit(struct request_queue *q)
2391 {
2392 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2393 
2394 	blk_mq_debugfs_unregister_hctxs(q);
2395 	blk_mq_sysfs_unregister(q);
2396 
2397 	/*
2398 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2399 	 * we should change hctx numa_node according to new topology (this
2400 	 * involves free and re-allocate memory, worthy doing?)
2401 	 */
2402 
2403 	blk_mq_map_swqueue(q);
2404 
2405 	blk_mq_sysfs_register(q);
2406 	blk_mq_debugfs_register_hctxs(q);
2407 }
2408 
2409 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2410 {
2411 	int i;
2412 
2413 	for (i = 0; i < set->nr_hw_queues; i++)
2414 		if (!__blk_mq_alloc_rq_map(set, i))
2415 			goto out_unwind;
2416 
2417 	return 0;
2418 
2419 out_unwind:
2420 	while (--i >= 0)
2421 		blk_mq_free_rq_map(set->tags[i]);
2422 
2423 	return -ENOMEM;
2424 }
2425 
2426 /*
2427  * Allocate the request maps associated with this tag_set. Note that this
2428  * may reduce the depth asked for, if memory is tight. set->queue_depth
2429  * will be updated to reflect the allocated depth.
2430  */
2431 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2432 {
2433 	unsigned int depth;
2434 	int err;
2435 
2436 	depth = set->queue_depth;
2437 	do {
2438 		err = __blk_mq_alloc_rq_maps(set);
2439 		if (!err)
2440 			break;
2441 
2442 		set->queue_depth >>= 1;
2443 		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2444 			err = -ENOMEM;
2445 			break;
2446 		}
2447 	} while (set->queue_depth);
2448 
2449 	if (!set->queue_depth || err) {
2450 		pr_err("blk-mq: failed to allocate request map\n");
2451 		return -ENOMEM;
2452 	}
2453 
2454 	if (depth != set->queue_depth)
2455 		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2456 						depth, set->queue_depth);
2457 
2458 	return 0;
2459 }
2460 
2461 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2462 {
2463 	if (set->ops->map_queues)
2464 		return set->ops->map_queues(set);
2465 	else
2466 		return blk_mq_map_queues(set);
2467 }
2468 
2469 /*
2470  * Alloc a tag set to be associated with one or more request queues.
2471  * May fail with EINVAL for various error conditions. May adjust the
2472  * requested depth down, if if it too large. In that case, the set
2473  * value will be stored in set->queue_depth.
2474  */
2475 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2476 {
2477 	int ret;
2478 
2479 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2480 
2481 	if (!set->nr_hw_queues)
2482 		return -EINVAL;
2483 	if (!set->queue_depth)
2484 		return -EINVAL;
2485 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2486 		return -EINVAL;
2487 
2488 	if (!set->ops->queue_rq)
2489 		return -EINVAL;
2490 
2491 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2492 		pr_info("blk-mq: reduced tag depth to %u\n",
2493 			BLK_MQ_MAX_DEPTH);
2494 		set->queue_depth = BLK_MQ_MAX_DEPTH;
2495 	}
2496 
2497 	/*
2498 	 * If a crashdump is active, then we are potentially in a very
2499 	 * memory constrained environment. Limit us to 1 queue and
2500 	 * 64 tags to prevent using too much memory.
2501 	 */
2502 	if (is_kdump_kernel()) {
2503 		set->nr_hw_queues = 1;
2504 		set->queue_depth = min(64U, set->queue_depth);
2505 	}
2506 	/*
2507 	 * There is no use for more h/w queues than cpus.
2508 	 */
2509 	if (set->nr_hw_queues > nr_cpu_ids)
2510 		set->nr_hw_queues = nr_cpu_ids;
2511 
2512 	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2513 				 GFP_KERNEL, set->numa_node);
2514 	if (!set->tags)
2515 		return -ENOMEM;
2516 
2517 	ret = -ENOMEM;
2518 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2519 			GFP_KERNEL, set->numa_node);
2520 	if (!set->mq_map)
2521 		goto out_free_tags;
2522 
2523 	ret = blk_mq_update_queue_map(set);
2524 	if (ret)
2525 		goto out_free_mq_map;
2526 
2527 	ret = blk_mq_alloc_rq_maps(set);
2528 	if (ret)
2529 		goto out_free_mq_map;
2530 
2531 	mutex_init(&set->tag_list_lock);
2532 	INIT_LIST_HEAD(&set->tag_list);
2533 
2534 	return 0;
2535 
2536 out_free_mq_map:
2537 	kfree(set->mq_map);
2538 	set->mq_map = NULL;
2539 out_free_tags:
2540 	kfree(set->tags);
2541 	set->tags = NULL;
2542 	return ret;
2543 }
2544 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2545 
2546 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2547 {
2548 	int i;
2549 
2550 	for (i = 0; i < nr_cpu_ids; i++)
2551 		blk_mq_free_map_and_requests(set, i);
2552 
2553 	kfree(set->mq_map);
2554 	set->mq_map = NULL;
2555 
2556 	kfree(set->tags);
2557 	set->tags = NULL;
2558 }
2559 EXPORT_SYMBOL(blk_mq_free_tag_set);
2560 
2561 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2562 {
2563 	struct blk_mq_tag_set *set = q->tag_set;
2564 	struct blk_mq_hw_ctx *hctx;
2565 	int i, ret;
2566 
2567 	if (!set)
2568 		return -EINVAL;
2569 
2570 	blk_mq_freeze_queue(q);
2571 
2572 	ret = 0;
2573 	queue_for_each_hw_ctx(q, hctx, i) {
2574 		if (!hctx->tags)
2575 			continue;
2576 		/*
2577 		 * If we're using an MQ scheduler, just update the scheduler
2578 		 * queue depth. This is similar to what the old code would do.
2579 		 */
2580 		if (!hctx->sched_tags) {
2581 			ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
2582 							min(nr, set->queue_depth),
2583 							false);
2584 		} else {
2585 			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
2586 							nr, true);
2587 		}
2588 		if (ret)
2589 			break;
2590 	}
2591 
2592 	if (!ret)
2593 		q->nr_requests = nr;
2594 
2595 	blk_mq_unfreeze_queue(q);
2596 
2597 	return ret;
2598 }
2599 
2600 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
2601 							int nr_hw_queues)
2602 {
2603 	struct request_queue *q;
2604 
2605 	lockdep_assert_held(&set->tag_list_lock);
2606 
2607 	if (nr_hw_queues > nr_cpu_ids)
2608 		nr_hw_queues = nr_cpu_ids;
2609 	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2610 		return;
2611 
2612 	list_for_each_entry(q, &set->tag_list, tag_set_list)
2613 		blk_mq_freeze_queue(q);
2614 
2615 	set->nr_hw_queues = nr_hw_queues;
2616 	blk_mq_update_queue_map(set);
2617 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
2618 		blk_mq_realloc_hw_ctxs(set, q);
2619 		blk_mq_queue_reinit(q);
2620 	}
2621 
2622 	list_for_each_entry(q, &set->tag_list, tag_set_list)
2623 		blk_mq_unfreeze_queue(q);
2624 }
2625 
2626 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2627 {
2628 	mutex_lock(&set->tag_list_lock);
2629 	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
2630 	mutex_unlock(&set->tag_list_lock);
2631 }
2632 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2633 
2634 /* Enable polling stats and return whether they were already enabled. */
2635 static bool blk_poll_stats_enable(struct request_queue *q)
2636 {
2637 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2638 	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
2639 		return true;
2640 	blk_stat_add_callback(q, q->poll_cb);
2641 	return false;
2642 }
2643 
2644 static void blk_mq_poll_stats_start(struct request_queue *q)
2645 {
2646 	/*
2647 	 * We don't arm the callback if polling stats are not enabled or the
2648 	 * callback is already active.
2649 	 */
2650 	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2651 	    blk_stat_is_active(q->poll_cb))
2652 		return;
2653 
2654 	blk_stat_activate_msecs(q->poll_cb, 100);
2655 }
2656 
2657 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
2658 {
2659 	struct request_queue *q = cb->data;
2660 	int bucket;
2661 
2662 	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
2663 		if (cb->stat[bucket].nr_samples)
2664 			q->poll_stat[bucket] = cb->stat[bucket];
2665 	}
2666 }
2667 
2668 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
2669 				       struct blk_mq_hw_ctx *hctx,
2670 				       struct request *rq)
2671 {
2672 	unsigned long ret = 0;
2673 	int bucket;
2674 
2675 	/*
2676 	 * If stats collection isn't on, don't sleep but turn it on for
2677 	 * future users
2678 	 */
2679 	if (!blk_poll_stats_enable(q))
2680 		return 0;
2681 
2682 	/*
2683 	 * As an optimistic guess, use half of the mean service time
2684 	 * for this type of request. We can (and should) make this smarter.
2685 	 * For instance, if the completion latencies are tight, we can
2686 	 * get closer than just half the mean. This is especially
2687 	 * important on devices where the completion latencies are longer
2688 	 * than ~10 usec. We do use the stats for the relevant IO size
2689 	 * if available which does lead to better estimates.
2690 	 */
2691 	bucket = blk_mq_poll_stats_bkt(rq);
2692 	if (bucket < 0)
2693 		return ret;
2694 
2695 	if (q->poll_stat[bucket].nr_samples)
2696 		ret = (q->poll_stat[bucket].mean + 1) / 2;
2697 
2698 	return ret;
2699 }
2700 
2701 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2702 				     struct blk_mq_hw_ctx *hctx,
2703 				     struct request *rq)
2704 {
2705 	struct hrtimer_sleeper hs;
2706 	enum hrtimer_mode mode;
2707 	unsigned int nsecs;
2708 	ktime_t kt;
2709 
2710 	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
2711 		return false;
2712 
2713 	/*
2714 	 * poll_nsec can be:
2715 	 *
2716 	 * -1:	don't ever hybrid sleep
2717 	 *  0:	use half of prev avg
2718 	 * >0:	use this specific value
2719 	 */
2720 	if (q->poll_nsec == -1)
2721 		return false;
2722 	else if (q->poll_nsec > 0)
2723 		nsecs = q->poll_nsec;
2724 	else
2725 		nsecs = blk_mq_poll_nsecs(q, hctx, rq);
2726 
2727 	if (!nsecs)
2728 		return false;
2729 
2730 	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
2731 
2732 	/*
2733 	 * This will be replaced with the stats tracking code, using
2734 	 * 'avg_completion_time / 2' as the pre-sleep target.
2735 	 */
2736 	kt = nsecs;
2737 
2738 	mode = HRTIMER_MODE_REL;
2739 	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
2740 	hrtimer_set_expires(&hs.timer, kt);
2741 
2742 	hrtimer_init_sleeper(&hs, current);
2743 	do {
2744 		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
2745 			break;
2746 		set_current_state(TASK_UNINTERRUPTIBLE);
2747 		hrtimer_start_expires(&hs.timer, mode);
2748 		if (hs.task)
2749 			io_schedule();
2750 		hrtimer_cancel(&hs.timer);
2751 		mode = HRTIMER_MODE_ABS;
2752 	} while (hs.task && !signal_pending(current));
2753 
2754 	__set_current_state(TASK_RUNNING);
2755 	destroy_hrtimer_on_stack(&hs.timer);
2756 	return true;
2757 }
2758 
2759 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
2760 {
2761 	struct request_queue *q = hctx->queue;
2762 	long state;
2763 
2764 	/*
2765 	 * If we sleep, have the caller restart the poll loop to reset
2766 	 * the state. Like for the other success return cases, the
2767 	 * caller is responsible for checking if the IO completed. If
2768 	 * the IO isn't complete, we'll get called again and will go
2769 	 * straight to the busy poll loop.
2770 	 */
2771 	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
2772 		return true;
2773 
2774 	hctx->poll_considered++;
2775 
2776 	state = current->state;
2777 	while (!need_resched()) {
2778 		int ret;
2779 
2780 		hctx->poll_invoked++;
2781 
2782 		ret = q->mq_ops->poll(hctx, rq->tag);
2783 		if (ret > 0) {
2784 			hctx->poll_success++;
2785 			set_current_state(TASK_RUNNING);
2786 			return true;
2787 		}
2788 
2789 		if (signal_pending_state(state, current))
2790 			set_current_state(TASK_RUNNING);
2791 
2792 		if (current->state == TASK_RUNNING)
2793 			return true;
2794 		if (ret < 0)
2795 			break;
2796 		cpu_relax();
2797 	}
2798 
2799 	return false;
2800 }
2801 
2802 bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
2803 {
2804 	struct blk_mq_hw_ctx *hctx;
2805 	struct blk_plug *plug;
2806 	struct request *rq;
2807 
2808 	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
2809 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
2810 		return false;
2811 
2812 	plug = current->plug;
2813 	if (plug)
2814 		blk_flush_plug_list(plug, false);
2815 
2816 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
2817 	if (!blk_qc_t_is_internal(cookie))
2818 		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
2819 	else {
2820 		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
2821 		/*
2822 		 * With scheduling, if the request has completed, we'll
2823 		 * get a NULL return here, as we clear the sched tag when
2824 		 * that happens. The request still remains valid, like always,
2825 		 * so we should be safe with just the NULL check.
2826 		 */
2827 		if (!rq)
2828 			return false;
2829 	}
2830 
2831 	return __blk_mq_poll(hctx, rq);
2832 }
2833 EXPORT_SYMBOL_GPL(blk_mq_poll);
2834 
2835 static int __init blk_mq_init(void)
2836 {
2837 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2838 				blk_mq_hctx_notify_dead);
2839 	return 0;
2840 }
2841 subsys_initcall(blk_mq_init);
2842