1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/sched/topology.h> 24 #include <linux/sched/signal.h> 25 #include <linux/delay.h> 26 #include <linux/crash_dump.h> 27 #include <linux/prefetch.h> 28 29 #include <trace/events/block.h> 30 31 #include <linux/blk-mq.h> 32 #include "blk.h" 33 #include "blk-mq.h" 34 #include "blk-mq-debugfs.h" 35 #include "blk-mq-tag.h" 36 #include "blk-pm.h" 37 #include "blk-stat.h" 38 #include "blk-mq-sched.h" 39 #include "blk-rq-qos.h" 40 41 static void blk_mq_poll_stats_start(struct request_queue *q); 42 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 43 44 static int blk_mq_poll_stats_bkt(const struct request *rq) 45 { 46 int ddir, bytes, bucket; 47 48 ddir = rq_data_dir(rq); 49 bytes = blk_rq_bytes(rq); 50 51 bucket = ddir + 2*(ilog2(bytes) - 9); 52 53 if (bucket < 0) 54 return -1; 55 else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 56 return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 57 58 return bucket; 59 } 60 61 /* 62 * Check if any of the ctx's have pending work in this hardware queue 63 */ 64 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 65 { 66 return !list_empty_careful(&hctx->dispatch) || 67 sbitmap_any_bit_set(&hctx->ctx_map) || 68 blk_mq_sched_has_work(hctx); 69 } 70 71 /* 72 * Mark this ctx as having pending work in this hardware queue 73 */ 74 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 75 struct blk_mq_ctx *ctx) 76 { 77 const int bit = ctx->index_hw[hctx->type]; 78 79 if (!sbitmap_test_bit(&hctx->ctx_map, bit)) 80 sbitmap_set_bit(&hctx->ctx_map, bit); 81 } 82 83 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 84 struct blk_mq_ctx *ctx) 85 { 86 const int bit = ctx->index_hw[hctx->type]; 87 88 sbitmap_clear_bit(&hctx->ctx_map, bit); 89 } 90 91 struct mq_inflight { 92 struct hd_struct *part; 93 unsigned int *inflight; 94 }; 95 96 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 97 struct request *rq, void *priv, 98 bool reserved) 99 { 100 struct mq_inflight *mi = priv; 101 102 /* 103 * index[0] counts the specific partition that was asked for. 
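	 *
	 * Note that this callback only ever bumps index[0]; the per-direction
	 * split is done by blk_mq_check_inflight_rw() below, which indexes by
	 * rq_data_dir() instead.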
104 */ 105 if (rq->part == mi->part) 106 mi->inflight[0]++; 107 108 return true; 109 } 110 111 unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) 112 { 113 unsigned inflight[2]; 114 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 115 116 inflight[0] = inflight[1] = 0; 117 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 118 119 return inflight[0]; 120 } 121 122 static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, 123 struct request *rq, void *priv, 124 bool reserved) 125 { 126 struct mq_inflight *mi = priv; 127 128 if (rq->part == mi->part) 129 mi->inflight[rq_data_dir(rq)]++; 130 131 return true; 132 } 133 134 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 135 unsigned int inflight[2]) 136 { 137 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 138 139 inflight[0] = inflight[1] = 0; 140 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); 141 } 142 143 void blk_freeze_queue_start(struct request_queue *q) 144 { 145 int freeze_depth; 146 147 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 148 if (freeze_depth == 1) { 149 percpu_ref_kill(&q->q_usage_counter); 150 if (queue_is_mq(q)) 151 blk_mq_run_hw_queues(q, false); 152 } 153 } 154 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 155 156 void blk_mq_freeze_queue_wait(struct request_queue *q) 157 { 158 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 159 } 160 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 161 162 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 163 unsigned long timeout) 164 { 165 return wait_event_timeout(q->mq_freeze_wq, 166 percpu_ref_is_zero(&q->q_usage_counter), 167 timeout); 168 } 169 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 170 171 /* 172 * Guarantee no request is in use, so we can change any data structure of 173 * the queue afterward. 174 */ 175 void blk_freeze_queue(struct request_queue *q) 176 { 177 /* 178 * In the !blk_mq case we are only calling this to kill the 179 * q_usage_counter, otherwise this increases the freeze depth 180 * and waits for it to return to zero. For this reason there is 181 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 182 * exported to drivers as the only user for unfreeze is blk_mq. 183 */ 184 blk_freeze_queue_start(q); 185 blk_mq_freeze_queue_wait(q); 186 } 187 188 void blk_mq_freeze_queue(struct request_queue *q) 189 { 190 /* 191 * ...just an alias to keep freeze and unfreeze actions balanced 192 * in the blk_mq_* namespace 193 */ 194 blk_freeze_queue(q); 195 } 196 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 197 198 void blk_mq_unfreeze_queue(struct request_queue *q) 199 { 200 int freeze_depth; 201 202 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 203 WARN_ON_ONCE(freeze_depth < 0); 204 if (!freeze_depth) { 205 percpu_ref_resurrect(&q->q_usage_counter); 206 wake_up_all(&q->mq_freeze_wq); 207 } 208 } 209 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 210 211 /* 212 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the 213 * mpt3sas driver such that this function can be removed. 214 */ 215 void blk_mq_quiesce_queue_nowait(struct request_queue *q) 216 { 217 blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); 218 } 219 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); 220 221 /** 222 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished 223 * @q: request queue. 224 * 225 * Note: this function does not prevent that the struct request end_io() 226 * callback function is invoked. 
Once this function is returned, we make 227 * sure no dispatch can happen until the queue is unquiesced via 228 * blk_mq_unquiesce_queue(). 229 */ 230 void blk_mq_quiesce_queue(struct request_queue *q) 231 { 232 struct blk_mq_hw_ctx *hctx; 233 unsigned int i; 234 bool rcu = false; 235 236 blk_mq_quiesce_queue_nowait(q); 237 238 queue_for_each_hw_ctx(q, hctx, i) { 239 if (hctx->flags & BLK_MQ_F_BLOCKING) 240 synchronize_srcu(hctx->srcu); 241 else 242 rcu = true; 243 } 244 if (rcu) 245 synchronize_rcu(); 246 } 247 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 248 249 /* 250 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 251 * @q: request queue. 252 * 253 * This function recovers queue into the state before quiescing 254 * which is done by blk_mq_quiesce_queue. 255 */ 256 void blk_mq_unquiesce_queue(struct request_queue *q) 257 { 258 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 259 260 /* dispatch requests which are inserted during quiescing */ 261 blk_mq_run_hw_queues(q, true); 262 } 263 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 264 265 void blk_mq_wake_waiters(struct request_queue *q) 266 { 267 struct blk_mq_hw_ctx *hctx; 268 unsigned int i; 269 270 queue_for_each_hw_ctx(q, hctx, i) 271 if (blk_mq_hw_queue_mapped(hctx)) 272 blk_mq_tag_wakeup_all(hctx->tags, true); 273 } 274 275 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 276 { 277 return blk_mq_has_free_tags(hctx->tags); 278 } 279 EXPORT_SYMBOL(blk_mq_can_queue); 280 281 /* 282 * Only need start/end time stamping if we have stats enabled, or using 283 * an IO scheduler. 284 */ 285 static inline bool blk_mq_need_time_stamp(struct request *rq) 286 { 287 return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; 288 } 289 290 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 291 unsigned int tag, unsigned int op) 292 { 293 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 294 struct request *rq = tags->static_rqs[tag]; 295 req_flags_t rq_flags = 0; 296 297 if (data->flags & BLK_MQ_REQ_INTERNAL) { 298 rq->tag = -1; 299 rq->internal_tag = tag; 300 } else { 301 if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { 302 rq_flags = RQF_MQ_INFLIGHT; 303 atomic_inc(&data->hctx->nr_active); 304 } 305 rq->tag = tag; 306 rq->internal_tag = -1; 307 data->hctx->tags->rqs[rq->tag] = rq; 308 } 309 310 /* csd/requeue_work/fifo_time is initialized before use */ 311 rq->q = data->q; 312 rq->mq_ctx = data->ctx; 313 rq->mq_hctx = data->hctx; 314 rq->rq_flags = rq_flags; 315 rq->cmd_flags = op; 316 if (data->flags & BLK_MQ_REQ_PREEMPT) 317 rq->rq_flags |= RQF_PREEMPT; 318 if (blk_queue_io_stat(data->q)) 319 rq->rq_flags |= RQF_IO_STAT; 320 INIT_LIST_HEAD(&rq->queuelist); 321 INIT_HLIST_NODE(&rq->hash); 322 RB_CLEAR_NODE(&rq->rb_node); 323 rq->rq_disk = NULL; 324 rq->part = NULL; 325 if (blk_mq_need_time_stamp(rq)) 326 rq->start_time_ns = ktime_get_ns(); 327 else 328 rq->start_time_ns = 0; 329 rq->io_start_time_ns = 0; 330 rq->nr_phys_segments = 0; 331 #if defined(CONFIG_BLK_DEV_INTEGRITY) 332 rq->nr_integrity_segments = 0; 333 #endif 334 /* tag was already set */ 335 rq->extra_len = 0; 336 WRITE_ONCE(rq->deadline, 0); 337 338 rq->timeout = 0; 339 340 rq->end_io = NULL; 341 rq->end_io_data = NULL; 342 343 data->ctx->rq_dispatched[op_is_sync(op)]++; 344 refcount_set(&rq->ref, 1); 345 return rq; 346 } 347 348 static struct request *blk_mq_get_request(struct request_queue *q, 349 struct bio *bio, 350 struct blk_mq_alloc_data *data) 351 { 352 struct elevator_queue *e = q->elevator; 353 struct request *rq; 354 unsigned int tag; 
355 bool put_ctx_on_error = false; 356 357 blk_queue_enter_live(q); 358 data->q = q; 359 if (likely(!data->ctx)) { 360 data->ctx = blk_mq_get_ctx(q); 361 put_ctx_on_error = true; 362 } 363 if (likely(!data->hctx)) 364 data->hctx = blk_mq_map_queue(q, data->cmd_flags, 365 data->ctx); 366 if (data->cmd_flags & REQ_NOWAIT) 367 data->flags |= BLK_MQ_REQ_NOWAIT; 368 369 if (e) { 370 data->flags |= BLK_MQ_REQ_INTERNAL; 371 372 /* 373 * Flush requests are special and go directly to the 374 * dispatch list. Don't include reserved tags in the 375 * limiting, as it isn't useful. 376 */ 377 if (!op_is_flush(data->cmd_flags) && 378 e->type->ops.limit_depth && 379 !(data->flags & BLK_MQ_REQ_RESERVED)) 380 e->type->ops.limit_depth(data->cmd_flags, data); 381 } else { 382 blk_mq_tag_busy(data->hctx); 383 } 384 385 tag = blk_mq_get_tag(data); 386 if (tag == BLK_MQ_TAG_FAIL) { 387 if (put_ctx_on_error) { 388 blk_mq_put_ctx(data->ctx); 389 data->ctx = NULL; 390 } 391 blk_queue_exit(q); 392 return NULL; 393 } 394 395 rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); 396 if (!op_is_flush(data->cmd_flags)) { 397 rq->elv.icq = NULL; 398 if (e && e->type->ops.prepare_request) { 399 if (e->type->icq_cache) 400 blk_mq_sched_assign_ioc(rq); 401 402 e->type->ops.prepare_request(rq, bio); 403 rq->rq_flags |= RQF_ELVPRIV; 404 } 405 } 406 data->hctx->queued++; 407 return rq; 408 } 409 410 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 411 blk_mq_req_flags_t flags) 412 { 413 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; 414 struct request *rq; 415 int ret; 416 417 ret = blk_queue_enter(q, flags); 418 if (ret) 419 return ERR_PTR(ret); 420 421 rq = blk_mq_get_request(q, NULL, &alloc_data); 422 blk_queue_exit(q); 423 424 if (!rq) 425 return ERR_PTR(-EWOULDBLOCK); 426 427 blk_mq_put_ctx(alloc_data.ctx); 428 429 rq->__data_len = 0; 430 rq->__sector = (sector_t) -1; 431 rq->bio = rq->biotail = NULL; 432 return rq; 433 } 434 EXPORT_SYMBOL(blk_mq_alloc_request); 435 436 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 437 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 438 { 439 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; 440 struct request *rq; 441 unsigned int cpu; 442 int ret; 443 444 /* 445 * If the tag allocator sleeps we could get an allocation for a 446 * different hardware context. No need to complicate the low level 447 * allocator for this for the rare use case of a command tied to 448 * a specific queue. 449 */ 450 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 451 return ERR_PTR(-EINVAL); 452 453 if (hctx_idx >= q->nr_hw_queues) 454 return ERR_PTR(-EIO); 455 456 ret = blk_queue_enter(q, flags); 457 if (ret) 458 return ERR_PTR(ret); 459 460 /* 461 * Check if the hardware context is actually mapped to anything. 462 * If not tell the caller that it should skip this queue. 
	 */
	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
		blk_queue_exit(q);
		return ERR_PTR(-EXDEV);
	}
	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);

	rq = blk_mq_get_request(q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != -1)
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	if (sched_tag != -1)
		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.finish_request)
			e->type->ops.finish_request(rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	ctx->rq_completed[rq_is_sync(rq)]++;
	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->backing_dev_info);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	u64 now = 0;

	if (blk_mq_need_time_stamp(rq))
		now = ktime_get_ns();

	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq, now);
	}

	if (rq->internal_tag != -1)
		blk_mq_sched_completed_request(rq, now);

	blk_account_io_done(rq, now);

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;
	struct request_queue *q = rq->q;

	q->mq_ops->complete(rq);
}

static void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct request_queue *q = rq->q;
	bool shared = false;
	int cpu;

	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	/*
	 * On most single queue controllers there is only one irq vector
	 * for handling IO completion, and its affinity is set to cover all
	 * possible CPUs. On most architectures that means the irq is handled
	 * on one specific CPU.
	 *
	 * So complete the IO request in softirq context in the single queue
	 * case, so that IO performance is not degraded by irqs-off latency.
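	 *
	 * To summarise the routing below: a single hw queue completes via
	 * softirq (__blk_complete_request()); polled (REQ_HIPRI) requests and
	 * queues without QUEUE_FLAG_SAME_COMP complete locally; otherwise the
	 * completion is IPI'd to the submitting ctx->cpu, unless that CPU
	 * shares a cache with the completing CPU and QUEUE_FLAG_SAME_FORCE is
	 * not set.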
	 */
	if (q->nr_hw_queues == 1) {
		__blk_complete_request(rq);
		return;
	}

	/*
	 * For a polled request, always complete locally, it's pointless
	 * to redirect the completion.
	 */
	if ((rq->cmd_flags & REQ_HIPRI) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
		q->mq_ops->complete(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		q->mq_ops->complete(rq);
	}
	put_cpu();
}

static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
	__releases(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
		rcu_read_unlock();
	else
		srcu_read_unlock(hctx->srcu, srcu_idx);
}

static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
	__acquires(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* shut up gcc false positive */
		*srcu_idx = 0;
		rcu_read_lock();
	} else
		*srcu_idx = srcu_read_lock(hctx->srcu);
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq: the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
bool blk_mq_complete_request(struct request *rq)
{
	if (unlikely(blk_should_fake_timeout(rq->q)))
		return false;
	__blk_mq_complete_request(rq);
	return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);

int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_sched_started_request(rq);

	trace_block_rq_issue(q, rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		rq->io_start_time_ns = ktime_get_ns();
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
		rq->throtl_size = blk_rq_sectors(rq);
#endif
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
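		 * The extra drain segment accounted for here is dropped again
		 * in __blk_mq_requeue_request() if the request is unwound.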
689 */ 690 rq->nr_phys_segments++; 691 } 692 } 693 EXPORT_SYMBOL(blk_mq_start_request); 694 695 static void __blk_mq_requeue_request(struct request *rq) 696 { 697 struct request_queue *q = rq->q; 698 699 blk_mq_put_driver_tag(rq); 700 701 trace_block_rq_requeue(q, rq); 702 rq_qos_requeue(q, rq); 703 704 if (blk_mq_request_started(rq)) { 705 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 706 rq->rq_flags &= ~RQF_TIMED_OUT; 707 if (q->dma_drain_size && blk_rq_bytes(rq)) 708 rq->nr_phys_segments--; 709 } 710 } 711 712 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 713 { 714 __blk_mq_requeue_request(rq); 715 716 /* this request will be re-inserted to io scheduler queue */ 717 blk_mq_sched_requeue_request(rq); 718 719 BUG_ON(!list_empty(&rq->queuelist)); 720 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 721 } 722 EXPORT_SYMBOL(blk_mq_requeue_request); 723 724 static void blk_mq_requeue_work(struct work_struct *work) 725 { 726 struct request_queue *q = 727 container_of(work, struct request_queue, requeue_work.work); 728 LIST_HEAD(rq_list); 729 struct request *rq, *next; 730 731 spin_lock_irq(&q->requeue_lock); 732 list_splice_init(&q->requeue_list, &rq_list); 733 spin_unlock_irq(&q->requeue_lock); 734 735 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 736 if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) 737 continue; 738 739 rq->rq_flags &= ~RQF_SOFTBARRIER; 740 list_del_init(&rq->queuelist); 741 /* 742 * If RQF_DONTPREP, rq has contained some driver specific 743 * data, so insert it to hctx dispatch list to avoid any 744 * merge. 745 */ 746 if (rq->rq_flags & RQF_DONTPREP) 747 blk_mq_request_bypass_insert(rq, false); 748 else 749 blk_mq_sched_insert_request(rq, true, false, false); 750 } 751 752 while (!list_empty(&rq_list)) { 753 rq = list_entry(rq_list.next, struct request, queuelist); 754 list_del_init(&rq->queuelist); 755 blk_mq_sched_insert_request(rq, false, false, false); 756 } 757 758 blk_mq_run_hw_queues(q, false); 759 } 760 761 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, 762 bool kick_requeue_list) 763 { 764 struct request_queue *q = rq->q; 765 unsigned long flags; 766 767 /* 768 * We abuse this flag that is otherwise used by the I/O scheduler to 769 * request head insertion from the workqueue. 
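	 * Here RQF_SOFTBARRIER simply means "insert at the head of the requeue
	 * list"; blk_mq_requeue_work() clears the flag again before
	 * re-inserting the request.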
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       void *priv, bool reserved)
{
	/*
	 * If we find a request that is inflight and the queue matches,
	 * we know the queue is busy. Return false to stop the iteration.
	 */
	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req, reserved);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(jiffies, deadline))
		return true;

	if (*next == 0)
		*next = deadline;
	else if (time_after(*next, deadline))
		*next = deadline;
	return false;
}

static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/*
	 * Just do a quick check if it is expired before locking the request in
	 * so we're not unnecessarily synchronizing across CPUs.
	 */
	if (!blk_mq_req_expired(rq, next))
		return true;

	/*
	 * We have reason to believe the request may be expired. Take a
	 * reference on the request to lock this request lifetime into its
	 * currently allocated context to prevent it from being reallocated in
	 * the event the completion by-passes this timeout handler.
	 *
	 * If the reference was already released, then the driver beat the
	 * timeout handler to posting a natural completion.
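	 *
	 * Whoever drops the final reference, this path or the normal
	 * completion path, frees the request via __blk_mq_free_request(),
	 * so the two can race safely.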
	 */
	if (!refcount_inc_not_zero(&rq->ref))
		return true;

	/*
	 * The request is now locked and cannot be reallocated underneath the
	 * timeout handler's processing. Re-verify this exact request is truly
	 * expired; if it is not expired, then the request was completed and
	 * reallocated as a new request.
	 */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);

	return true;
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	unsigned long next = 0;
	struct blk_mq_hw_ctx *hctx;
	int i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting for
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);

	if (next != 0) {
		mod_timer(&q->timeout, next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ?
start->index_hw[hctx->type] : 0; 1018 struct dispatch_rq_data data = { 1019 .hctx = hctx, 1020 .rq = NULL, 1021 }; 1022 1023 __sbitmap_for_each_set(&hctx->ctx_map, off, 1024 dispatch_rq_from_ctx, &data); 1025 1026 return data.rq; 1027 } 1028 1029 static inline unsigned int queued_to_index(unsigned int queued) 1030 { 1031 if (!queued) 1032 return 0; 1033 1034 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 1035 } 1036 1037 bool blk_mq_get_driver_tag(struct request *rq) 1038 { 1039 struct blk_mq_alloc_data data = { 1040 .q = rq->q, 1041 .hctx = rq->mq_hctx, 1042 .flags = BLK_MQ_REQ_NOWAIT, 1043 .cmd_flags = rq->cmd_flags, 1044 }; 1045 bool shared; 1046 1047 if (rq->tag != -1) 1048 goto done; 1049 1050 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 1051 data.flags |= BLK_MQ_REQ_RESERVED; 1052 1053 shared = blk_mq_tag_busy(data.hctx); 1054 rq->tag = blk_mq_get_tag(&data); 1055 if (rq->tag >= 0) { 1056 if (shared) { 1057 rq->rq_flags |= RQF_MQ_INFLIGHT; 1058 atomic_inc(&data.hctx->nr_active); 1059 } 1060 data.hctx->tags->rqs[rq->tag] = rq; 1061 } 1062 1063 done: 1064 return rq->tag != -1; 1065 } 1066 1067 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1068 int flags, void *key) 1069 { 1070 struct blk_mq_hw_ctx *hctx; 1071 1072 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1073 1074 spin_lock(&hctx->dispatch_wait_lock); 1075 list_del_init(&wait->entry); 1076 spin_unlock(&hctx->dispatch_wait_lock); 1077 1078 blk_mq_run_hw_queue(hctx, true); 1079 return 1; 1080 } 1081 1082 /* 1083 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1084 * the tag wakeups. For non-shared tags, we can simply mark us needing a 1085 * restart. For both cases, take care to check the condition again after 1086 * marking us as waiting. 1087 */ 1088 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1089 struct request *rq) 1090 { 1091 struct wait_queue_head *wq; 1092 wait_queue_entry_t *wait; 1093 bool ret; 1094 1095 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { 1096 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 1097 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 1098 1099 /* 1100 * It's possible that a tag was freed in the window between the 1101 * allocation failure and adding the hardware queue to the wait 1102 * queue. 1103 * 1104 * Don't clear RESTART here, someone else could have set it. 1105 * At most this will cost an extra queue run. 1106 */ 1107 return blk_mq_get_driver_tag(rq); 1108 } 1109 1110 wait = &hctx->dispatch_wait; 1111 if (!list_empty_careful(&wait->entry)) 1112 return false; 1113 1114 wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; 1115 1116 spin_lock_irq(&wq->lock); 1117 spin_lock(&hctx->dispatch_wait_lock); 1118 if (!list_empty(&wait->entry)) { 1119 spin_unlock(&hctx->dispatch_wait_lock); 1120 spin_unlock_irq(&wq->lock); 1121 return false; 1122 } 1123 1124 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 1125 __add_wait_queue(wq, wait); 1126 1127 /* 1128 * It's possible that a tag was freed in the window between the 1129 * allocation failure and adding the hardware queue to the wait 1130 * queue. 1131 */ 1132 ret = blk_mq_get_driver_tag(rq); 1133 if (!ret) { 1134 spin_unlock(&hctx->dispatch_wait_lock); 1135 spin_unlock_irq(&wq->lock); 1136 return false; 1137 } 1138 1139 /* 1140 * We got a tag, remove ourselves from the wait queue to ensure 1141 * someone else gets the wakeup. 
	 */
	list_del_init(&wait->entry);
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Update dispatch_busy with an Exponential Weighted Moving Average (EWMA):
 * - EWMA is a simple way to compute a running average
 * - the weights (7/8 of the old value, 1/8 of the new sample) make old
 *   samples decay exponentially
 * - the new sample is scaled up by a factor of 1 << 4 so that the average
 *   does not immediately collapse to 0; the exact factor hardly matters
 *   because the EWMA decays exponentially anyway
 * - in effect: ewma = (ewma * 7 + (busy ? 16 : 0)) / 8
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma;

	if (hctx->queue->elevator)
		return;

	ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}

#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */

/*
 * Returns true if we did some work AND can potentially do more.
 */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
			     bool got_budget)
{
	struct blk_mq_hw_ctx *hctx;
	struct request *rq, *nxt;
	bool no_tag = false;
	int errors, queued;
	blk_status_t ret = BLK_STS_OK;

	if (list_empty(list))
		return false;

	WARN_ON(!list_is_singular(list) && got_budget);

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);

		hctx = rq->mq_hctx;
		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
			break;

		if (!blk_mq_get_driver_tag(rq)) {
			/*
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed. The
			 * waitqueue takes care of that. If the queue is run
			 * before we add this entry back on the dispatch list,
			 * we'll re-run it below.
			 */
			if (!blk_mq_mark_tag_wait(hctx, rq)) {
				blk_mq_put_dispatch_budget(hctx);
				/*
				 * For non-shared tags, the RESTART check
				 * will suffice.
				 */
				if (hctx->flags & BLK_MQ_F_TAG_SHARED)
					no_tag = true;
				break;
			}
		}

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt);
		}

		ret = q->mq_ops->queue_rq(hctx, &bd);
		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
			/*
			 * If an I/O scheduler has been configured and we got a
			 * driver tag for the next request already, free it
			 * again.
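			 * That tag was only taken to compute bd.last above;
			 * since we stop dispatching here, it must not be
			 * held on to.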
1252 */ 1253 if (!list_empty(list)) { 1254 nxt = list_first_entry(list, struct request, queuelist); 1255 blk_mq_put_driver_tag(nxt); 1256 } 1257 list_add(&rq->queuelist, list); 1258 __blk_mq_requeue_request(rq); 1259 break; 1260 } 1261 1262 if (unlikely(ret != BLK_STS_OK)) { 1263 errors++; 1264 blk_mq_end_request(rq, BLK_STS_IOERR); 1265 continue; 1266 } 1267 1268 queued++; 1269 } while (!list_empty(list)); 1270 1271 hctx->dispatched[queued_to_index(queued)]++; 1272 1273 /* 1274 * Any items that need requeuing? Stuff them into hctx->dispatch, 1275 * that is where we will continue on next queue run. 1276 */ 1277 if (!list_empty(list)) { 1278 bool needs_restart; 1279 1280 /* 1281 * If we didn't flush the entire list, we could have told 1282 * the driver there was more coming, but that turned out to 1283 * be a lie. 1284 */ 1285 if (q->mq_ops->commit_rqs) 1286 q->mq_ops->commit_rqs(hctx); 1287 1288 spin_lock(&hctx->lock); 1289 list_splice_init(list, &hctx->dispatch); 1290 spin_unlock(&hctx->lock); 1291 1292 /* 1293 * If SCHED_RESTART was set by the caller of this function and 1294 * it is no longer set that means that it was cleared by another 1295 * thread and hence that a queue rerun is needed. 1296 * 1297 * If 'no_tag' is set, that means that we failed getting 1298 * a driver tag with an I/O scheduler attached. If our dispatch 1299 * waitqueue is no longer active, ensure that we run the queue 1300 * AFTER adding our entries back to the list. 1301 * 1302 * If no I/O scheduler has been configured it is possible that 1303 * the hardware queue got stopped and restarted before requests 1304 * were pushed back onto the dispatch list. Rerun the queue to 1305 * avoid starvation. Notes: 1306 * - blk_mq_run_hw_queue() checks whether or not a queue has 1307 * been stopped before rerunning a queue. 1308 * - Some but not all block drivers stop a queue before 1309 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1310 * and dm-rq. 1311 * 1312 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 1313 * bit is set, run queue after a delay to avoid IO stalls 1314 * that could otherwise occur if the queue is idle. 1315 */ 1316 needs_restart = blk_mq_sched_needs_restart(hctx); 1317 if (!needs_restart || 1318 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1319 blk_mq_run_hw_queue(hctx, true); 1320 else if (needs_restart && (ret == BLK_STS_RESOURCE)) 1321 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 1322 1323 blk_mq_update_dispatch_busy(hctx, true); 1324 return false; 1325 } else 1326 blk_mq_update_dispatch_busy(hctx, false); 1327 1328 /* 1329 * If the host/device is unable to accept more work, inform the 1330 * caller of that. 1331 */ 1332 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) 1333 return false; 1334 1335 return (queued + errors) != 0; 1336 } 1337 1338 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1339 { 1340 int srcu_idx; 1341 1342 /* 1343 * We should be running this queue from one of the CPUs that 1344 * are mapped to it. 
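	 * (The check below is only a diagnostic: it warns and dumps a stack
	 * trace, but the dispatch still proceeds on the current CPU.)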
	 *
	 * There are at least two related races now between setting
	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
	 * __blk_mq_run_hw_queue():
	 *
	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
	 *   but later it becomes online, in which case this warning is
	 *   harmless
	 *
	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
	 *   but later it becomes offline, in which case the warning can't
	 *   be triggered, and we depend on the blk-mq timeout handler to
	 *   handle requests dispatched to this hctx
	 */
	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
		cpu_online(hctx->next_cpu)) {
		printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
			raw_smp_processor_id(),
			cpumask_empty(hctx->cpumask) ? "inactive": "active");
		dump_stack();
	}

	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
	WARN_ON_ONCE(in_interrupt());

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	hctx_lock(hctx, &srcu_idx);
	blk_mq_sched_dispatch_requests(hctx);
	hctx_unlock(hctx, srcu_idx);
}

static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(hctx->cpumask);
	return cpu;
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	bool tried = false;
	int next_cpu = hctx->next_cpu;

	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	if (--hctx->next_cpu_batch <= 0) {
select_cpu:
		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				cpu_online_mask);
		if (next_cpu >= nr_cpu_ids)
			next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}

	/*
	 * Do an unbound schedule if we can't find an online CPU for this
	 * hctx; this should only happen in the path of handling CPU DEAD.
	 */
	if (!cpu_online(next_cpu)) {
		if (!tried) {
			tried = true;
			goto select_cpu;
		}

		/*
		 * Make sure to re-select the CPU next time once CPUs in
		 * hctx->cpumask become online again.
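		 * (next_cpu_batch is set to 1 below so that the very next
		 * run goes through the selection loop again.)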
1425 */ 1426 hctx->next_cpu = next_cpu; 1427 hctx->next_cpu_batch = 1; 1428 return WORK_CPU_UNBOUND; 1429 } 1430 1431 hctx->next_cpu = next_cpu; 1432 return next_cpu; 1433 } 1434 1435 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1436 unsigned long msecs) 1437 { 1438 if (unlikely(blk_mq_hctx_stopped(hctx))) 1439 return; 1440 1441 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 1442 int cpu = get_cpu(); 1443 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1444 __blk_mq_run_hw_queue(hctx); 1445 put_cpu(); 1446 return; 1447 } 1448 1449 put_cpu(); 1450 } 1451 1452 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 1453 msecs_to_jiffies(msecs)); 1454 } 1455 1456 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1457 { 1458 __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1459 } 1460 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1461 1462 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1463 { 1464 int srcu_idx; 1465 bool need_run; 1466 1467 /* 1468 * When queue is quiesced, we may be switching io scheduler, or 1469 * updating nr_hw_queues, or other things, and we can't run queue 1470 * any more, even __blk_mq_hctx_has_pending() can't be called safely. 1471 * 1472 * And queue will be rerun in blk_mq_unquiesce_queue() if it is 1473 * quiesced. 1474 */ 1475 hctx_lock(hctx, &srcu_idx); 1476 need_run = !blk_queue_quiesced(hctx->queue) && 1477 blk_mq_hctx_has_pending(hctx); 1478 hctx_unlock(hctx, srcu_idx); 1479 1480 if (need_run) { 1481 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1482 return true; 1483 } 1484 1485 return false; 1486 } 1487 EXPORT_SYMBOL(blk_mq_run_hw_queue); 1488 1489 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1490 { 1491 struct blk_mq_hw_ctx *hctx; 1492 int i; 1493 1494 queue_for_each_hw_ctx(q, hctx, i) { 1495 if (blk_mq_hctx_stopped(hctx)) 1496 continue; 1497 1498 blk_mq_run_hw_queue(hctx, async); 1499 } 1500 } 1501 EXPORT_SYMBOL(blk_mq_run_hw_queues); 1502 1503 /** 1504 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1505 * @q: request queue. 1506 * 1507 * The caller is responsible for serializing this function against 1508 * blk_mq_{start,stop}_hw_queue(). 1509 */ 1510 bool blk_mq_queue_stopped(struct request_queue *q) 1511 { 1512 struct blk_mq_hw_ctx *hctx; 1513 int i; 1514 1515 queue_for_each_hw_ctx(q, hctx, i) 1516 if (blk_mq_hctx_stopped(hctx)) 1517 return true; 1518 1519 return false; 1520 } 1521 EXPORT_SYMBOL(blk_mq_queue_stopped); 1522 1523 /* 1524 * This function is often used for pausing .queue_rq() by driver when 1525 * there isn't enough resource or some conditions aren't satisfied, and 1526 * BLK_STS_RESOURCE is usually returned. 1527 * 1528 * We do not guarantee that dispatch can be drained or blocked 1529 * after blk_mq_stop_hw_queue() returns. Please use 1530 * blk_mq_quiesce_queue() for that requirement. 1531 */ 1532 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1533 { 1534 cancel_delayed_work(&hctx->run_work); 1535 1536 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1537 } 1538 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1539 1540 /* 1541 * This function is often used for pausing .queue_rq() by driver when 1542 * there isn't enough resource or some conditions aren't satisfied, and 1543 * BLK_STS_RESOURCE is usually returned. 1544 * 1545 * We do not guarantee that dispatch can be drained or blocked 1546 * after blk_mq_stop_hw_queues() returns. Please use 1547 * blk_mq_quiesce_queue() for that requirement. 
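 *
 * A purely illustrative sketch of the usual driver-side pattern in
 * ->queue_rq() (my_device_out_of_resources() is a made-up placeholder for
 * whatever resource check the driver performs):
 *
 *	if (my_device_out_of_resources(hctx)) {
 *		blk_mq_stop_hw_queue(hctx);
 *		return BLK_STS_RESOURCE;
 *	}
 *
 * with blk_mq_start_stopped_hw_queues() called once resources free up.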
1548 */ 1549 void blk_mq_stop_hw_queues(struct request_queue *q) 1550 { 1551 struct blk_mq_hw_ctx *hctx; 1552 int i; 1553 1554 queue_for_each_hw_ctx(q, hctx, i) 1555 blk_mq_stop_hw_queue(hctx); 1556 } 1557 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 1558 1559 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 1560 { 1561 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1562 1563 blk_mq_run_hw_queue(hctx, false); 1564 } 1565 EXPORT_SYMBOL(blk_mq_start_hw_queue); 1566 1567 void blk_mq_start_hw_queues(struct request_queue *q) 1568 { 1569 struct blk_mq_hw_ctx *hctx; 1570 int i; 1571 1572 queue_for_each_hw_ctx(q, hctx, i) 1573 blk_mq_start_hw_queue(hctx); 1574 } 1575 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1576 1577 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1578 { 1579 if (!blk_mq_hctx_stopped(hctx)) 1580 return; 1581 1582 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1583 blk_mq_run_hw_queue(hctx, async); 1584 } 1585 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 1586 1587 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1588 { 1589 struct blk_mq_hw_ctx *hctx; 1590 int i; 1591 1592 queue_for_each_hw_ctx(q, hctx, i) 1593 blk_mq_start_stopped_hw_queue(hctx, async); 1594 } 1595 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1596 1597 static void blk_mq_run_work_fn(struct work_struct *work) 1598 { 1599 struct blk_mq_hw_ctx *hctx; 1600 1601 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1602 1603 /* 1604 * If we are stopped, don't run the queue. 1605 */ 1606 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1607 return; 1608 1609 __blk_mq_run_hw_queue(hctx); 1610 } 1611 1612 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1613 struct request *rq, 1614 bool at_head) 1615 { 1616 struct blk_mq_ctx *ctx = rq->mq_ctx; 1617 enum hctx_type type = hctx->type; 1618 1619 lockdep_assert_held(&ctx->lock); 1620 1621 trace_block_rq_insert(hctx->queue, rq); 1622 1623 if (at_head) 1624 list_add(&rq->queuelist, &ctx->rq_lists[type]); 1625 else 1626 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); 1627 } 1628 1629 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1630 bool at_head) 1631 { 1632 struct blk_mq_ctx *ctx = rq->mq_ctx; 1633 1634 lockdep_assert_held(&ctx->lock); 1635 1636 __blk_mq_insert_req_list(hctx, rq, at_head); 1637 blk_mq_hctx_mark_pending(hctx, ctx); 1638 } 1639 1640 /* 1641 * Should only be used carefully, when the caller knows we want to 1642 * bypass a potential IO scheduler on the target device. 
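 * The request is appended directly to hctx->dispatch, so any merging or
 * scheduler insertion is skipped entirely.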
1643 */ 1644 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) 1645 { 1646 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1647 1648 spin_lock(&hctx->lock); 1649 list_add_tail(&rq->queuelist, &hctx->dispatch); 1650 spin_unlock(&hctx->lock); 1651 1652 if (run_queue) 1653 blk_mq_run_hw_queue(hctx, false); 1654 } 1655 1656 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1657 struct list_head *list) 1658 1659 { 1660 struct request *rq; 1661 enum hctx_type type = hctx->type; 1662 1663 /* 1664 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1665 * offline now 1666 */ 1667 list_for_each_entry(rq, list, queuelist) { 1668 BUG_ON(rq->mq_ctx != ctx); 1669 trace_block_rq_insert(hctx->queue, rq); 1670 } 1671 1672 spin_lock(&ctx->lock); 1673 list_splice_tail_init(list, &ctx->rq_lists[type]); 1674 blk_mq_hctx_mark_pending(hctx, ctx); 1675 spin_unlock(&ctx->lock); 1676 } 1677 1678 static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) 1679 { 1680 struct request *rqa = container_of(a, struct request, queuelist); 1681 struct request *rqb = container_of(b, struct request, queuelist); 1682 1683 if (rqa->mq_ctx < rqb->mq_ctx) 1684 return -1; 1685 else if (rqa->mq_ctx > rqb->mq_ctx) 1686 return 1; 1687 else if (rqa->mq_hctx < rqb->mq_hctx) 1688 return -1; 1689 else if (rqa->mq_hctx > rqb->mq_hctx) 1690 return 1; 1691 1692 return blk_rq_pos(rqa) > blk_rq_pos(rqb); 1693 } 1694 1695 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1696 { 1697 struct blk_mq_hw_ctx *this_hctx; 1698 struct blk_mq_ctx *this_ctx; 1699 struct request_queue *this_q; 1700 struct request *rq; 1701 LIST_HEAD(list); 1702 LIST_HEAD(rq_list); 1703 unsigned int depth; 1704 1705 list_splice_init(&plug->mq_list, &list); 1706 plug->rq_count = 0; 1707 1708 if (plug->rq_count > 2 && plug->multiple_queues) 1709 list_sort(NULL, &list, plug_rq_cmp); 1710 1711 this_q = NULL; 1712 this_hctx = NULL; 1713 this_ctx = NULL; 1714 depth = 0; 1715 1716 while (!list_empty(&list)) { 1717 rq = list_entry_rq(list.next); 1718 list_del_init(&rq->queuelist); 1719 BUG_ON(!rq->q); 1720 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { 1721 if (this_hctx) { 1722 trace_block_unplug(this_q, depth, !from_schedule); 1723 blk_mq_sched_insert_requests(this_hctx, this_ctx, 1724 &rq_list, 1725 from_schedule); 1726 } 1727 1728 this_q = rq->q; 1729 this_ctx = rq->mq_ctx; 1730 this_hctx = rq->mq_hctx; 1731 depth = 0; 1732 } 1733 1734 depth++; 1735 list_add_tail(&rq->queuelist, &rq_list); 1736 } 1737 1738 /* 1739 * If 'this_hctx' is set, we know we have entries to complete 1740 * on 'rq_list'. Do those. 1741 */ 1742 if (this_hctx) { 1743 trace_block_unplug(this_q, depth, !from_schedule); 1744 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, 1745 from_schedule); 1746 } 1747 } 1748 1749 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1750 { 1751 blk_init_request_from_bio(rq, bio); 1752 1753 blk_account_io_start(rq, true); 1754 } 1755 1756 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 1757 struct request *rq, 1758 blk_qc_t *cookie, bool last) 1759 { 1760 struct request_queue *q = rq->q; 1761 struct blk_mq_queue_data bd = { 1762 .rq = rq, 1763 .last = last, 1764 }; 1765 blk_qc_t new_cookie; 1766 blk_status_t ret; 1767 1768 new_cookie = request_to_qc_t(hctx, rq); 1769 1770 /* 1771 * For OK queue, we are done. For error, caller may kill it. 
1772 * Any other error (busy), just add it to our list as we 1773 * previously would have done. 1774 */ 1775 ret = q->mq_ops->queue_rq(hctx, &bd); 1776 switch (ret) { 1777 case BLK_STS_OK: 1778 blk_mq_update_dispatch_busy(hctx, false); 1779 *cookie = new_cookie; 1780 break; 1781 case BLK_STS_RESOURCE: 1782 case BLK_STS_DEV_RESOURCE: 1783 blk_mq_update_dispatch_busy(hctx, true); 1784 __blk_mq_requeue_request(rq); 1785 break; 1786 default: 1787 blk_mq_update_dispatch_busy(hctx, false); 1788 *cookie = BLK_QC_T_NONE; 1789 break; 1790 } 1791 1792 return ret; 1793 } 1794 1795 blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1796 struct request *rq, 1797 blk_qc_t *cookie, 1798 bool bypass, bool last) 1799 { 1800 struct request_queue *q = rq->q; 1801 bool run_queue = true; 1802 blk_status_t ret = BLK_STS_RESOURCE; 1803 int srcu_idx; 1804 bool force = false; 1805 1806 hctx_lock(hctx, &srcu_idx); 1807 /* 1808 * hctx_lock is needed before checking quiesced flag. 1809 * 1810 * When queue is stopped or quiesced, ignore 'bypass', insert 1811 * and return BLK_STS_OK to caller, and avoid driver to try to 1812 * dispatch again. 1813 */ 1814 if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { 1815 run_queue = false; 1816 bypass = false; 1817 goto out_unlock; 1818 } 1819 1820 if (unlikely(q->elevator && !bypass)) 1821 goto out_unlock; 1822 1823 if (!blk_mq_get_dispatch_budget(hctx)) 1824 goto out_unlock; 1825 1826 if (!blk_mq_get_driver_tag(rq)) { 1827 blk_mq_put_dispatch_budget(hctx); 1828 goto out_unlock; 1829 } 1830 1831 /* 1832 * Always add a request that has been through 1833 *.queue_rq() to the hardware dispatch list. 1834 */ 1835 force = true; 1836 ret = __blk_mq_issue_directly(hctx, rq, cookie, last); 1837 out_unlock: 1838 hctx_unlock(hctx, srcu_idx); 1839 switch (ret) { 1840 case BLK_STS_OK: 1841 break; 1842 case BLK_STS_DEV_RESOURCE: 1843 case BLK_STS_RESOURCE: 1844 if (force) { 1845 blk_mq_request_bypass_insert(rq, run_queue); 1846 /* 1847 * We have to return BLK_STS_OK for the DM 1848 * to avoid livelock. Otherwise, we return 1849 * the real result to indicate whether the 1850 * request is direct-issued successfully. 1851 */ 1852 ret = bypass ? BLK_STS_OK : ret; 1853 } else if (!bypass) { 1854 blk_mq_sched_insert_request(rq, false, 1855 run_queue, false); 1856 } 1857 break; 1858 default: 1859 if (!bypass) 1860 blk_mq_end_request(rq, ret); 1861 break; 1862 } 1863 1864 return ret; 1865 } 1866 1867 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 1868 struct list_head *list) 1869 { 1870 blk_qc_t unused; 1871 blk_status_t ret = BLK_STS_OK; 1872 1873 while (!list_empty(list)) { 1874 struct request *rq = list_first_entry(list, struct request, 1875 queuelist); 1876 1877 list_del_init(&rq->queuelist); 1878 if (ret == BLK_STS_OK) 1879 ret = blk_mq_try_issue_directly(hctx, rq, &unused, 1880 false, 1881 list_empty(list)); 1882 else 1883 blk_mq_sched_insert_request(rq, false, true, false); 1884 } 1885 1886 /* 1887 * If we didn't flush the entire list, we could have told 1888 * the driver there was more coming, but that turned out to 1889 * be a lie. 
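	 * ->commit_rqs() gives such a driver one final chance to kick the
	 * hardware (e.g. ring the doorbell) for requests that were queued
	 * without a bd->last hint.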
1890 */ 1891 if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) 1892 hctx->queue->mq_ops->commit_rqs(hctx); 1893 } 1894 1895 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 1896 { 1897 list_add_tail(&rq->queuelist, &plug->mq_list); 1898 plug->rq_count++; 1899 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { 1900 struct request *tmp; 1901 1902 tmp = list_first_entry(&plug->mq_list, struct request, 1903 queuelist); 1904 if (tmp->q != rq->q) 1905 plug->multiple_queues = true; 1906 } 1907 } 1908 1909 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1910 { 1911 const int is_sync = op_is_sync(bio->bi_opf); 1912 const int is_flush_fua = op_is_flush(bio->bi_opf); 1913 struct blk_mq_alloc_data data = { .flags = 0}; 1914 struct request *rq; 1915 struct blk_plug *plug; 1916 struct request *same_queue_rq = NULL; 1917 blk_qc_t cookie; 1918 1919 blk_queue_bounce(q, &bio); 1920 1921 blk_queue_split(q, &bio); 1922 1923 if (!bio_integrity_prep(bio)) 1924 return BLK_QC_T_NONE; 1925 1926 if (!is_flush_fua && !blk_queue_nomerges(q) && 1927 blk_attempt_plug_merge(q, bio, &same_queue_rq)) 1928 return BLK_QC_T_NONE; 1929 1930 if (blk_mq_sched_bio_merge(q, bio)) 1931 return BLK_QC_T_NONE; 1932 1933 rq_qos_throttle(q, bio); 1934 1935 data.cmd_flags = bio->bi_opf; 1936 rq = blk_mq_get_request(q, bio, &data); 1937 if (unlikely(!rq)) { 1938 rq_qos_cleanup(q, bio); 1939 if (bio->bi_opf & REQ_NOWAIT) 1940 bio_wouldblock_error(bio); 1941 return BLK_QC_T_NONE; 1942 } 1943 1944 trace_block_getrq(q, bio, bio->bi_opf); 1945 1946 rq_qos_track(q, rq, bio); 1947 1948 cookie = request_to_qc_t(data.hctx, rq); 1949 1950 plug = current->plug; 1951 if (unlikely(is_flush_fua)) { 1952 blk_mq_put_ctx(data.ctx); 1953 blk_mq_bio_to_request(rq, bio); 1954 1955 /* bypass scheduler for flush rq */ 1956 blk_insert_flush(rq); 1957 blk_mq_run_hw_queue(data.hctx, true); 1958 } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { 1959 /* 1960 * Use plugging if we have a ->commit_rqs() hook as well, as 1961 * we know the driver uses bd->last in a smart fashion. 1962 */ 1963 unsigned int request_count = plug->rq_count; 1964 struct request *last = NULL; 1965 1966 blk_mq_put_ctx(data.ctx); 1967 blk_mq_bio_to_request(rq, bio); 1968 1969 if (!request_count) 1970 trace_block_plug(q); 1971 else 1972 last = list_entry_rq(plug->mq_list.prev); 1973 1974 if (request_count >= BLK_MAX_REQUEST_COUNT || (last && 1975 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1976 blk_flush_plug_list(plug, false); 1977 trace_block_plug(q); 1978 } 1979 1980 blk_add_rq_to_plug(plug, rq); 1981 } else if (plug && !blk_queue_nomerges(q)) { 1982 blk_mq_bio_to_request(rq, bio); 1983 1984 /* 1985 * We do limited plugging. If the bio can be merged, do that. 1986 * Otherwise the existing request in the plug list will be 1987 * issued. So the plug list will have one request at most 1988 * The plug list might get flushed before this. If that happens, 1989 * the plug list is empty, and same_queue_rq is invalid. 
1990 */ 1991 if (list_empty(&plug->mq_list)) 1992 same_queue_rq = NULL; 1993 if (same_queue_rq) { 1994 list_del_init(&same_queue_rq->queuelist); 1995 plug->rq_count--; 1996 } 1997 blk_add_rq_to_plug(plug, rq); 1998 1999 blk_mq_put_ctx(data.ctx); 2000 2001 if (same_queue_rq) { 2002 data.hctx = same_queue_rq->mq_hctx; 2003 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 2004 &cookie, false, true); 2005 } 2006 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && 2007 !data.hctx->dispatch_busy)) { 2008 blk_mq_put_ctx(data.ctx); 2009 blk_mq_bio_to_request(rq, bio); 2010 blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); 2011 } else { 2012 blk_mq_put_ctx(data.ctx); 2013 blk_mq_bio_to_request(rq, bio); 2014 blk_mq_sched_insert_request(rq, false, true, true); 2015 } 2016 2017 return cookie; 2018 } 2019 2020 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2021 unsigned int hctx_idx) 2022 { 2023 struct page *page; 2024 2025 if (tags->rqs && set->ops->exit_request) { 2026 int i; 2027 2028 for (i = 0; i < tags->nr_tags; i++) { 2029 struct request *rq = tags->static_rqs[i]; 2030 2031 if (!rq) 2032 continue; 2033 set->ops->exit_request(set, rq, hctx_idx); 2034 tags->static_rqs[i] = NULL; 2035 } 2036 } 2037 2038 while (!list_empty(&tags->page_list)) { 2039 page = list_first_entry(&tags->page_list, struct page, lru); 2040 list_del_init(&page->lru); 2041 /* 2042 * Remove kmemleak object previously allocated in 2043 * blk_mq_init_rq_map(). 2044 */ 2045 kmemleak_free(page_address(page)); 2046 __free_pages(page, page->private); 2047 } 2048 } 2049 2050 void blk_mq_free_rq_map(struct blk_mq_tags *tags) 2051 { 2052 kfree(tags->rqs); 2053 tags->rqs = NULL; 2054 kfree(tags->static_rqs); 2055 tags->static_rqs = NULL; 2056 2057 blk_mq_free_tags(tags); 2058 } 2059 2060 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 2061 unsigned int hctx_idx, 2062 unsigned int nr_tags, 2063 unsigned int reserved_tags) 2064 { 2065 struct blk_mq_tags *tags; 2066 int node; 2067 2068 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2069 if (node == NUMA_NO_NODE) 2070 node = set->numa_node; 2071 2072 tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 2073 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 2074 if (!tags) 2075 return NULL; 2076 2077 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2078 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2079 node); 2080 if (!tags->rqs) { 2081 blk_mq_free_tags(tags); 2082 return NULL; 2083 } 2084 2085 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2086 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2087 node); 2088 if (!tags->static_rqs) { 2089 kfree(tags->rqs); 2090 blk_mq_free_tags(tags); 2091 return NULL; 2092 } 2093 2094 return tags; 2095 } 2096 2097 static size_t order_to_size(unsigned int order) 2098 { 2099 return (size_t)PAGE_SIZE << order; 2100 } 2101 2102 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 2103 unsigned int hctx_idx, int node) 2104 { 2105 int ret; 2106 2107 if (set->ops->init_request) { 2108 ret = set->ops->init_request(set, rq, hctx_idx, node); 2109 if (ret) 2110 return ret; 2111 } 2112 2113 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 2114 return 0; 2115 } 2116 2117 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2118 unsigned int hctx_idx, unsigned int depth) 2119 { 2120 unsigned int i, j, entries_per_page, max_order = 4; 2121 size_t rq_size, left; 2122 int node; 2123 2124 node = 
blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2125 if (node == NUMA_NO_NODE) 2126 node = set->numa_node; 2127 2128 INIT_LIST_HEAD(&tags->page_list); 2129 2130 /* 2131 * rq_size is the size of the request plus driver payload, rounded 2132 * to the cacheline size 2133 */ 2134 rq_size = round_up(sizeof(struct request) + set->cmd_size, 2135 cache_line_size()); 2136 left = rq_size * depth; 2137 2138 for (i = 0; i < depth; ) { 2139 int this_order = max_order; 2140 struct page *page; 2141 int to_do; 2142 void *p; 2143 2144 while (this_order && left < order_to_size(this_order - 1)) 2145 this_order--; 2146 2147 do { 2148 page = alloc_pages_node(node, 2149 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 2150 this_order); 2151 if (page) 2152 break; 2153 if (!this_order--) 2154 break; 2155 if (order_to_size(this_order) < rq_size) 2156 break; 2157 } while (1); 2158 2159 if (!page) 2160 goto fail; 2161 2162 page->private = this_order; 2163 list_add_tail(&page->lru, &tags->page_list); 2164 2165 p = page_address(page); 2166 /* 2167 * Allow kmemleak to scan these pages as they contain pointers 2168 * to additional allocations like via ops->init_request(). 2169 */ 2170 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 2171 entries_per_page = order_to_size(this_order) / rq_size; 2172 to_do = min(entries_per_page, depth - i); 2173 left -= to_do * rq_size; 2174 for (j = 0; j < to_do; j++) { 2175 struct request *rq = p; 2176 2177 tags->static_rqs[i] = rq; 2178 if (blk_mq_init_request(set, rq, hctx_idx, node)) { 2179 tags->static_rqs[i] = NULL; 2180 goto fail; 2181 } 2182 2183 p += rq_size; 2184 i++; 2185 } 2186 } 2187 return 0; 2188 2189 fail: 2190 blk_mq_free_rqs(set, tags, hctx_idx); 2191 return -ENOMEM; 2192 } 2193 2194 /* 2195 * 'cpu' is going away. splice any existing rq_list entries from this 2196 * software queue to the hw queue dispatch list, and ensure that it 2197 * gets run. 
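* As an illustrative summary of the handler below: the dying CPU's
* per-type software queue list is spliced onto the hardware queue,
*
*	ctx->rq_lists[hctx->type]  -->  hctx->dispatch
*
* and blk_mq_run_hw_queue(hctx, true) is kicked, so nothing that was
* queued on the offlined CPU gets stranded.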
2198 */ 2199 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 2200 { 2201 struct blk_mq_hw_ctx *hctx; 2202 struct blk_mq_ctx *ctx; 2203 LIST_HEAD(tmp); 2204 enum hctx_type type; 2205 2206 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2207 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2208 type = hctx->type; 2209 2210 spin_lock(&ctx->lock); 2211 if (!list_empty(&ctx->rq_lists[type])) { 2212 list_splice_init(&ctx->rq_lists[type], &tmp); 2213 blk_mq_hctx_clear_pending(hctx, ctx); 2214 } 2215 spin_unlock(&ctx->lock); 2216 2217 if (list_empty(&tmp)) 2218 return 0; 2219 2220 spin_lock(&hctx->lock); 2221 list_splice_tail_init(&tmp, &hctx->dispatch); 2222 spin_unlock(&hctx->lock); 2223 2224 blk_mq_run_hw_queue(hctx, true); 2225 return 0; 2226 } 2227 2228 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 2229 { 2230 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 2231 &hctx->cpuhp_dead); 2232 } 2233 2234 /* hctx->ctxs will be freed in queue's release handler */ 2235 static void blk_mq_exit_hctx(struct request_queue *q, 2236 struct blk_mq_tag_set *set, 2237 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 2238 { 2239 if (blk_mq_hw_queue_mapped(hctx)) 2240 blk_mq_tag_idle(hctx); 2241 2242 if (set->ops->exit_request) 2243 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 2244 2245 if (set->ops->exit_hctx) 2246 set->ops->exit_hctx(hctx, hctx_idx); 2247 2248 if (hctx->flags & BLK_MQ_F_BLOCKING) 2249 cleanup_srcu_struct(hctx->srcu); 2250 2251 blk_mq_remove_cpuhp(hctx); 2252 blk_free_flush_queue(hctx->fq); 2253 sbitmap_free(&hctx->ctx_map); 2254 } 2255 2256 static void blk_mq_exit_hw_queues(struct request_queue *q, 2257 struct blk_mq_tag_set *set, int nr_queue) 2258 { 2259 struct blk_mq_hw_ctx *hctx; 2260 unsigned int i; 2261 2262 queue_for_each_hw_ctx(q, hctx, i) { 2263 if (i == nr_queue) 2264 break; 2265 blk_mq_debugfs_unregister_hctx(hctx); 2266 blk_mq_exit_hctx(q, set, hctx, i); 2267 } 2268 } 2269 2270 static int blk_mq_init_hctx(struct request_queue *q, 2271 struct blk_mq_tag_set *set, 2272 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2273 { 2274 int node; 2275 2276 node = hctx->numa_node; 2277 if (node == NUMA_NO_NODE) 2278 node = hctx->numa_node = set->numa_node; 2279 2280 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 2281 spin_lock_init(&hctx->lock); 2282 INIT_LIST_HEAD(&hctx->dispatch); 2283 hctx->queue = q; 2284 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 2285 2286 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2287 2288 hctx->tags = set->tags[hctx_idx]; 2289 2290 /* 2291 * Allocate space for all possible cpus to avoid allocation at 2292 * runtime 2293 */ 2294 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 2295 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); 2296 if (!hctx->ctxs) 2297 goto unregister_cpu_notifier; 2298 2299 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 2300 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node)) 2301 goto free_ctxs; 2302 2303 hctx->nr_ctx = 0; 2304 2305 spin_lock_init(&hctx->dispatch_wait_lock); 2306 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2307 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2308 2309 if (set->ops->init_hctx && 2310 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2311 goto free_bitmap; 2312 2313 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, 2314 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 2315 if (!hctx->fq) 2316 goto exit_hctx; 2317 2318 if 
(blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) 2319 goto free_fq; 2320
2321 if (hctx->flags & BLK_MQ_F_BLOCKING) 2322 init_srcu_struct(hctx->srcu); 2323 2324 return 0; 2325
2326 free_fq: 2327 blk_free_flush_queue(hctx->fq); 2328 exit_hctx: 2329 if (set->ops->exit_hctx) 2330 set->ops->exit_hctx(hctx, hctx_idx);
2331 free_bitmap: 2332 sbitmap_free(&hctx->ctx_map); 2333 free_ctxs: 2334 kfree(hctx->ctxs);
2335 unregister_cpu_notifier: 2336 blk_mq_remove_cpuhp(hctx); 2337 return -1; 2338 } 2339
2340 static void blk_mq_init_cpu_queues(struct request_queue *q, 2341 unsigned int nr_hw_queues) 2342 {
2343 struct blk_mq_tag_set *set = q->tag_set; 2344 unsigned int i, j; 2345
2346 for_each_possible_cpu(i) { 2347 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 2348 struct blk_mq_hw_ctx *hctx; 2349 int k; 2350
2351 __ctx->cpu = i; 2352 spin_lock_init(&__ctx->lock); 2353 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) 2354 INIT_LIST_HEAD(&__ctx->rq_lists[k]); 2355
2356 __ctx->queue = q; 2357
2358 /* 2359 * Set local node, IFF we have more than one hw queue. If 2360 * not, we remain on the home node of the device 2361 */
2362 for (j = 0; j < set->nr_maps; j++) { 2363 hctx = blk_mq_map_queue_type(q, j, i); 2364 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2365 hctx->numa_node = local_memory_node(cpu_to_node(i)); 2366 } 2367 } 2368 } 2369
2370 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) 2371 { 2372 int ret = 0; 2373
2374 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, 2375 set->queue_depth, set->reserved_tags); 2376 if (!set->tags[hctx_idx]) 2377 return false; 2378
2379 ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, 2380 set->queue_depth); 2381 if (!ret) 2382 return true; 2383
2384 blk_mq_free_rq_map(set->tags[hctx_idx]); 2385 set->tags[hctx_idx] = NULL; 2386 return false; 2387 } 2388
2389 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2390 unsigned int hctx_idx) 2391 {
2392 if (set->tags && set->tags[hctx_idx]) { 2393 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2394 blk_mq_free_rq_map(set->tags[hctx_idx]); 2395 set->tags[hctx_idx] = NULL; 2396 } 2397 } 2398
2399 static void blk_mq_map_swqueue(struct request_queue *q) 2400 {
2401 unsigned int i, j, hctx_idx; 2402 struct blk_mq_hw_ctx *hctx; 2403 struct blk_mq_ctx *ctx; 2404 struct blk_mq_tag_set *set = q->tag_set; 2405
2406 /* 2407 * Avoid others reading incomplete hctx->cpumask through sysfs 2408 */ 2409 mutex_lock(&q->sysfs_lock); 2410
2411 queue_for_each_hw_ctx(q, hctx, i) { 2412 cpumask_clear(hctx->cpumask); 2413 hctx->nr_ctx = 0; 2414 hctx->dispatch_from = NULL; 2415 } 2416
2417 /* 2418 * Map software to hardware queues. 2419 * 2420 * If the cpu isn't present, the cpu is mapped to the first hctx. 2421 */
2422 for_each_possible_cpu(i) { 2423 hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i]; 2424 /* unmapped hw queue can be remapped after CPU topo changed */
2425 if (!set->tags[hctx_idx] && 2426 !__blk_mq_alloc_rq_map(set, hctx_idx)) { 2427 /* 2428 * If tags initialization fails for some hctx, 2429 * that hctx won't be brought online.
In this 2430 * case, remap the current ctx to hctx[0] which 2431 * is guaranteed to always have tags allocated 2432 */ 2433 set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0; 2434 } 2435 2436 ctx = per_cpu_ptr(q->queue_ctx, i); 2437 for (j = 0; j < set->nr_maps; j++) { 2438 if (!set->map[j].nr_queues) { 2439 ctx->hctxs[j] = blk_mq_map_queue_type(q, 2440 HCTX_TYPE_DEFAULT, i); 2441 continue; 2442 } 2443 2444 hctx = blk_mq_map_queue_type(q, j, i); 2445 ctx->hctxs[j] = hctx; 2446 /* 2447 * If the CPU is already set in the mask, then we've 2448 * mapped this one already. This can happen if 2449 * devices share queues across queue maps. 2450 */ 2451 if (cpumask_test_cpu(i, hctx->cpumask)) 2452 continue; 2453 2454 cpumask_set_cpu(i, hctx->cpumask); 2455 hctx->type = j; 2456 ctx->index_hw[hctx->type] = hctx->nr_ctx; 2457 hctx->ctxs[hctx->nr_ctx++] = ctx; 2458 2459 /* 2460 * If the nr_ctx type overflows, we have exceeded the 2461 * amount of sw queues we can support. 2462 */ 2463 BUG_ON(!hctx->nr_ctx); 2464 } 2465 2466 for (; j < HCTX_MAX_TYPES; j++) 2467 ctx->hctxs[j] = blk_mq_map_queue_type(q, 2468 HCTX_TYPE_DEFAULT, i); 2469 } 2470 2471 mutex_unlock(&q->sysfs_lock); 2472 2473 queue_for_each_hw_ctx(q, hctx, i) { 2474 /* 2475 * If no software queues are mapped to this hardware queue, 2476 * disable it and free the request entries. 2477 */ 2478 if (!hctx->nr_ctx) { 2479 /* Never unmap queue 0. We need it as a 2480 * fallback in case of a new remap fails 2481 * allocation 2482 */ 2483 if (i && set->tags[i]) 2484 blk_mq_free_map_and_requests(set, i); 2485 2486 hctx->tags = NULL; 2487 continue; 2488 } 2489 2490 hctx->tags = set->tags[i]; 2491 WARN_ON(!hctx->tags); 2492 2493 /* 2494 * Set the map size to the number of mapped software queues. 2495 * This is more accurate and more efficient than looping 2496 * over all possibly mapped software queues. 2497 */ 2498 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 2499 2500 /* 2501 * Initialize batch roundrobin counts 2502 */ 2503 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); 2504 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2505 } 2506 } 2507 2508 /* 2509 * Caller needs to ensure that we're either frozen/quiesced, or that 2510 * the queue isn't live yet. 
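* For example, blk_mq_update_tag_set_depth() below wraps this call in
* blk_mq_freeze_queue()/blk_mq_unfreeze_queue() for every queue in the
* tag set, and blk_mq_add_queue_tag_set() only calls it on a queue that
* is still being set up.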
2511 */ 2512 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 2513 { 2514 struct blk_mq_hw_ctx *hctx; 2515 int i; 2516 2517 queue_for_each_hw_ctx(q, hctx, i) { 2518 if (shared) 2519 hctx->flags |= BLK_MQ_F_TAG_SHARED; 2520 else 2521 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 2522 } 2523 } 2524 2525 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, 2526 bool shared) 2527 { 2528 struct request_queue *q; 2529 2530 lockdep_assert_held(&set->tag_list_lock); 2531 2532 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2533 blk_mq_freeze_queue(q); 2534 queue_set_hctx_shared(q, shared); 2535 blk_mq_unfreeze_queue(q); 2536 } 2537 } 2538 2539 static void blk_mq_del_queue_tag_set(struct request_queue *q) 2540 { 2541 struct blk_mq_tag_set *set = q->tag_set; 2542 2543 mutex_lock(&set->tag_list_lock); 2544 list_del_rcu(&q->tag_set_list); 2545 if (list_is_singular(&set->tag_list)) { 2546 /* just transitioned to unshared */ 2547 set->flags &= ~BLK_MQ_F_TAG_SHARED; 2548 /* update existing queue */ 2549 blk_mq_update_tag_set_depth(set, false); 2550 } 2551 mutex_unlock(&set->tag_list_lock); 2552 INIT_LIST_HEAD(&q->tag_set_list); 2553 } 2554 2555 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2556 struct request_queue *q) 2557 { 2558 mutex_lock(&set->tag_list_lock); 2559 2560 /* 2561 * Check to see if we're transitioning to shared (from 1 to 2 queues). 2562 */ 2563 if (!list_empty(&set->tag_list) && 2564 !(set->flags & BLK_MQ_F_TAG_SHARED)) { 2565 set->flags |= BLK_MQ_F_TAG_SHARED; 2566 /* update existing queue */ 2567 blk_mq_update_tag_set_depth(set, true); 2568 } 2569 if (set->flags & BLK_MQ_F_TAG_SHARED) 2570 queue_set_hctx_shared(q, true); 2571 list_add_tail_rcu(&q->tag_set_list, &set->tag_list); 2572 2573 mutex_unlock(&set->tag_list_lock); 2574 } 2575 2576 /* All allocations will be freed in release handler of q->mq_kobj */ 2577 static int blk_mq_alloc_ctxs(struct request_queue *q) 2578 { 2579 struct blk_mq_ctxs *ctxs; 2580 int cpu; 2581 2582 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 2583 if (!ctxs) 2584 return -ENOMEM; 2585 2586 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2587 if (!ctxs->queue_ctx) 2588 goto fail; 2589 2590 for_each_possible_cpu(cpu) { 2591 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 2592 ctx->ctxs = ctxs; 2593 } 2594 2595 q->mq_kobj = &ctxs->kobj; 2596 q->queue_ctx = ctxs->queue_ctx; 2597 2598 return 0; 2599 fail: 2600 kfree(ctxs); 2601 return -ENOMEM; 2602 } 2603 2604 /* 2605 * It is the actual release handler for mq, but we do it from 2606 * request queue's release handler for avoiding use-after-free 2607 * and headache because q->mq_kobj shouldn't have been introduced, 2608 * but we can't group ctx/kctx kobj without it. 2609 */ 2610 void blk_mq_release(struct request_queue *q) 2611 { 2612 struct blk_mq_hw_ctx *hctx; 2613 unsigned int i; 2614 2615 /* hctx kobj stays in hctx */ 2616 queue_for_each_hw_ctx(q, hctx, i) { 2617 if (!hctx) 2618 continue; 2619 kobject_put(&hctx->kobj); 2620 } 2621 2622 kfree(q->queue_hw_ctx); 2623 2624 /* 2625 * release .mq_kobj and sw queue's kobject now because 2626 * both share lifetime with request queue. 
2627 */ 2628 blk_mq_sysfs_deinit(q); 2629 } 2630 2631 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 2632 { 2633 struct request_queue *uninit_q, *q; 2634 2635 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 2636 if (!uninit_q) 2637 return ERR_PTR(-ENOMEM); 2638 2639 q = blk_mq_init_allocated_queue(set, uninit_q); 2640 if (IS_ERR(q)) 2641 blk_cleanup_queue(uninit_q); 2642 2643 return q; 2644 } 2645 EXPORT_SYMBOL(blk_mq_init_queue); 2646 2647 /* 2648 * Helper for setting up a queue with mq ops, given queue depth, and 2649 * the passed in mq ops flags. 2650 */ 2651 struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, 2652 const struct blk_mq_ops *ops, 2653 unsigned int queue_depth, 2654 unsigned int set_flags) 2655 { 2656 struct request_queue *q; 2657 int ret; 2658 2659 memset(set, 0, sizeof(*set)); 2660 set->ops = ops; 2661 set->nr_hw_queues = 1; 2662 set->nr_maps = 1; 2663 set->queue_depth = queue_depth; 2664 set->numa_node = NUMA_NO_NODE; 2665 set->flags = set_flags; 2666 2667 ret = blk_mq_alloc_tag_set(set); 2668 if (ret) 2669 return ERR_PTR(ret); 2670 2671 q = blk_mq_init_queue(set); 2672 if (IS_ERR(q)) { 2673 blk_mq_free_tag_set(set); 2674 return q; 2675 } 2676 2677 return q; 2678 } 2679 EXPORT_SYMBOL(blk_mq_init_sq_queue); 2680 2681 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) 2682 { 2683 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2684 2685 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), 2686 __alignof__(struct blk_mq_hw_ctx)) != 2687 sizeof(struct blk_mq_hw_ctx)); 2688 2689 if (tag_set->flags & BLK_MQ_F_BLOCKING) 2690 hw_ctx_size += sizeof(struct srcu_struct); 2691 2692 return hw_ctx_size; 2693 } 2694 2695 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 2696 struct blk_mq_tag_set *set, struct request_queue *q, 2697 int hctx_idx, int node) 2698 { 2699 struct blk_mq_hw_ctx *hctx; 2700 2701 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), 2702 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2703 node); 2704 if (!hctx) 2705 return NULL; 2706 2707 if (!zalloc_cpumask_var_node(&hctx->cpumask, 2708 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2709 node)) { 2710 kfree(hctx); 2711 return NULL; 2712 } 2713 2714 atomic_set(&hctx->nr_active, 0); 2715 hctx->numa_node = node; 2716 hctx->queue_num = hctx_idx; 2717 2718 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) { 2719 free_cpumask_var(hctx->cpumask); 2720 kfree(hctx); 2721 return NULL; 2722 } 2723 blk_mq_hctx_kobj_init(hctx); 2724 2725 return hctx; 2726 } 2727 2728 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 2729 struct request_queue *q) 2730 { 2731 int i, j, end; 2732 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2733 2734 /* protect against switching io scheduler */ 2735 mutex_lock(&q->sysfs_lock); 2736 for (i = 0; i < set->nr_hw_queues; i++) { 2737 int node; 2738 struct blk_mq_hw_ctx *hctx; 2739 2740 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); 2741 /* 2742 * If the hw queue has been mapped to another numa node, 2743 * we need to realloc the hctx. If allocation fails, fallback 2744 * to use the previous one. 
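* E.g. after blk_mq_update_nr_hw_queues() rebuilds the CPU mapping, a
* hctx that used to be allocated on node 0 may end up serving CPUs on
* node 1; the loop below then allocates a replacement hctx on the new
* node and only tears down the old one if that allocation succeeds.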
2745 */ 2746 if (hctxs[i] && (hctxs[i]->numa_node == node)) 2747 continue; 2748
2749 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); 2750 if (hctx) { 2751 if (hctxs[i]) { 2752 blk_mq_exit_hctx(q, set, hctxs[i], i); 2753 kobject_put(&hctxs[i]->kobj); 2754 } 2755 hctxs[i] = hctx;
2756 } else { 2757 if (hctxs[i]) 2758 pr_warn("Allocate new hctx on node %d fails, " 2759 "fallback to previous one on node %d\n", 2760 node, hctxs[i]->numa_node); 2761 else 2762 break; 2763 } 2764 }
2765 /* 2766 * Increasing nr_hw_queues failed. Free the newly allocated 2767 * hctxs and keep the previous q->nr_hw_queues. 2768 */
2769 if (i != set->nr_hw_queues) { 2770 j = q->nr_hw_queues; 2771 end = i; 2772 } else { 2773 j = i; 2774 end = q->nr_hw_queues; 2775 q->nr_hw_queues = set->nr_hw_queues; 2776 } 2777
2778 for (; j < end; j++) { 2779 struct blk_mq_hw_ctx *hctx = hctxs[j]; 2780
2781 if (hctx) { 2782 if (hctx->tags) 2783 blk_mq_free_map_and_requests(set, j); 2784 blk_mq_exit_hctx(q, set, hctx, j); 2785 kobject_put(&hctx->kobj); 2786 hctxs[j] = NULL; 2787 2788 } 2789 }
2790 mutex_unlock(&q->sysfs_lock); 2791 } 2792
2793 /* 2794 * Maximum number of hardware queues we support. For single sets, we'll never 2795 * have more than the CPUs (software queues). For multiple sets, the tag_set 2796 * user may have set ->nr_hw_queues larger. 2797 */
2798 static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) 2799 { 2800 if (set->nr_maps == 1) 2801 return nr_cpu_ids; 2802
2803 return max(set->nr_hw_queues, nr_cpu_ids); 2804 } 2805
2806 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 2807 struct request_queue *q) 2808 {
2809 /* mark the queue as mq asap */ 2810 q->mq_ops = set->ops; 2811
2812 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, 2813 blk_mq_poll_stats_bkt, 2814 BLK_MQ_POLL_STATS_BKTS, q); 2815 if (!q->poll_cb) 2816 goto err_exit; 2817
2818 if (blk_mq_alloc_ctxs(q)) 2819 goto err_exit; 2820
2821 /* init q->mq_kobj and sw queues' kobjects */ 2822 blk_mq_sysfs_init(q); 2823
2824 q->nr_queues = nr_hw_queues(set); 2825 q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), 2826 GFP_KERNEL, set->numa_node); 2827 if (!q->queue_hw_ctx) 2828 goto err_sys_init; 2829
2830 blk_mq_realloc_hw_ctxs(set, q); 2831 if (!q->nr_hw_queues) 2832 goto err_hctxs; 2833
2834 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 2835 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 2836
2837 q->tag_set = set; 2838
2839 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2840 if (set->nr_maps > HCTX_TYPE_POLL && 2841 set->map[HCTX_TYPE_POLL].nr_queues) 2842 blk_queue_flag_set(QUEUE_FLAG_POLL, q); 2843
2844 q->sg_reserved_size = INT_MAX; 2845
2846 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 2847 INIT_LIST_HEAD(&q->requeue_list); 2848 spin_lock_init(&q->requeue_lock); 2849
2850 blk_queue_make_request(q, blk_mq_make_request); 2851
2852 /* 2853 * Do this after blk_queue_make_request() overrides it...
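* (blk_queue_make_request() re-initializes queue defaults, nr_requests
* among them, which is why the tag set's queue_depth is only applied
* afterwards.)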
2854 */ 2855 q->nr_requests = set->queue_depth; 2856 2857 /* 2858 * Default to classic polling 2859 */ 2860 q->poll_nsec = -1; 2861 2862 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2863 blk_mq_add_queue_tag_set(set, q); 2864 blk_mq_map_swqueue(q); 2865 2866 if (!(set->flags & BLK_MQ_F_NO_SCHED)) { 2867 int ret; 2868 2869 ret = elevator_init_mq(q); 2870 if (ret) 2871 return ERR_PTR(ret); 2872 } 2873 2874 return q; 2875 2876 err_hctxs: 2877 kfree(q->queue_hw_ctx); 2878 err_sys_init: 2879 blk_mq_sysfs_deinit(q); 2880 err_exit: 2881 q->mq_ops = NULL; 2882 return ERR_PTR(-ENOMEM); 2883 } 2884 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2885 2886 void blk_mq_free_queue(struct request_queue *q) 2887 { 2888 struct blk_mq_tag_set *set = q->tag_set; 2889 2890 blk_mq_del_queue_tag_set(q); 2891 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2892 } 2893 2894 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2895 { 2896 int i; 2897 2898 for (i = 0; i < set->nr_hw_queues; i++) 2899 if (!__blk_mq_alloc_rq_map(set, i)) 2900 goto out_unwind; 2901 2902 return 0; 2903 2904 out_unwind: 2905 while (--i >= 0) 2906 blk_mq_free_rq_map(set->tags[i]); 2907 2908 return -ENOMEM; 2909 } 2910 2911 /* 2912 * Allocate the request maps associated with this tag_set. Note that this 2913 * may reduce the depth asked for, if memory is tight. set->queue_depth 2914 * will be updated to reflect the allocated depth. 2915 */ 2916 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2917 { 2918 unsigned int depth; 2919 int err; 2920 2921 depth = set->queue_depth; 2922 do { 2923 err = __blk_mq_alloc_rq_maps(set); 2924 if (!err) 2925 break; 2926 2927 set->queue_depth >>= 1; 2928 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2929 err = -ENOMEM; 2930 break; 2931 } 2932 } while (set->queue_depth); 2933 2934 if (!set->queue_depth || err) { 2935 pr_err("blk-mq: failed to allocate request map\n"); 2936 return -ENOMEM; 2937 } 2938 2939 if (depth != set->queue_depth) 2940 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2941 depth, set->queue_depth); 2942 2943 return 0; 2944 } 2945 2946 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2947 { 2948 if (set->ops->map_queues && !is_kdump_kernel()) { 2949 int i; 2950 2951 /* 2952 * transport .map_queues is usually done in the following 2953 * way: 2954 * 2955 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 2956 * mask = get_cpu_mask(queue) 2957 * for_each_cpu(cpu, mask) 2958 * set->map[x].mq_map[cpu] = queue; 2959 * } 2960 * 2961 * When we need to remap, the table has to be cleared for 2962 * killing stale mapping since one CPU may not be mapped 2963 * to any hw queue. 2964 */ 2965 for (i = 0; i < set->nr_maps; i++) 2966 blk_mq_clear_mq_map(&set->map[i]); 2967 2968 return set->ops->map_queues(set); 2969 } else { 2970 BUG_ON(set->nr_maps > 1); 2971 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 2972 } 2973 } 2974 2975 /* 2976 * Alloc a tag set to be associated with one or more request queues. 2977 * May fail with EINVAL for various error conditions. May adjust the 2978 * requested depth down, if it's too large. In that case, the set 2979 * value will be stored in set->queue_depth. 
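* A minimal driver-side sketch of the expected call sequence
* (illustrative only; my_mq_ops, struct my_cmd and the chosen depth are
* placeholders, not part of this API):
*
*	memset(set, 0, sizeof(*set));
*	set->ops = &my_mq_ops;
*	set->nr_hw_queues = 1;
*	set->nr_maps = 1;
*	set->queue_depth = 64;
*	set->numa_node = NUMA_NO_NODE;
*	set->cmd_size = sizeof(struct my_cmd);
*	ret = blk_mq_alloc_tag_set(set);
*	if (ret)
*		return ret;
*	q = blk_mq_init_queue(set);
*	if (IS_ERR(q)) {
*		blk_mq_free_tag_set(set);
*		return PTR_ERR(q);
*	}
*
* which mirrors what blk_mq_init_sq_queue() above does for the single
* hardware queue case.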
2980 */ 2981 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2982 { 2983 int i, ret; 2984 2985 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2986 2987 if (!set->nr_hw_queues) 2988 return -EINVAL; 2989 if (!set->queue_depth) 2990 return -EINVAL; 2991 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2992 return -EINVAL; 2993 2994 if (!set->ops->queue_rq) 2995 return -EINVAL; 2996 2997 if (!set->ops->get_budget ^ !set->ops->put_budget) 2998 return -EINVAL; 2999 3000 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 3001 pr_info("blk-mq: reduced tag depth to %u\n", 3002 BLK_MQ_MAX_DEPTH); 3003 set->queue_depth = BLK_MQ_MAX_DEPTH; 3004 } 3005 3006 if (!set->nr_maps) 3007 set->nr_maps = 1; 3008 else if (set->nr_maps > HCTX_MAX_TYPES) 3009 return -EINVAL; 3010 3011 /* 3012 * If a crashdump is active, then we are potentially in a very 3013 * memory constrained environment. Limit us to 1 queue and 3014 * 64 tags to prevent using too much memory. 3015 */ 3016 if (is_kdump_kernel()) { 3017 set->nr_hw_queues = 1; 3018 set->nr_maps = 1; 3019 set->queue_depth = min(64U, set->queue_depth); 3020 } 3021 /* 3022 * There is no use for more h/w queues than cpus if we just have 3023 * a single map 3024 */ 3025 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 3026 set->nr_hw_queues = nr_cpu_ids; 3027 3028 set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), 3029 GFP_KERNEL, set->numa_node); 3030 if (!set->tags) 3031 return -ENOMEM; 3032 3033 ret = -ENOMEM; 3034 for (i = 0; i < set->nr_maps; i++) { 3035 set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 3036 sizeof(set->map[i].mq_map[0]), 3037 GFP_KERNEL, set->numa_node); 3038 if (!set->map[i].mq_map) 3039 goto out_free_mq_map; 3040 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; 3041 } 3042 3043 ret = blk_mq_update_queue_map(set); 3044 if (ret) 3045 goto out_free_mq_map; 3046 3047 ret = blk_mq_alloc_rq_maps(set); 3048 if (ret) 3049 goto out_free_mq_map; 3050 3051 mutex_init(&set->tag_list_lock); 3052 INIT_LIST_HEAD(&set->tag_list); 3053 3054 return 0; 3055 3056 out_free_mq_map: 3057 for (i = 0; i < set->nr_maps; i++) { 3058 kfree(set->map[i].mq_map); 3059 set->map[i].mq_map = NULL; 3060 } 3061 kfree(set->tags); 3062 set->tags = NULL; 3063 return ret; 3064 } 3065 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 3066 3067 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 3068 { 3069 int i, j; 3070 3071 for (i = 0; i < nr_hw_queues(set); i++) 3072 blk_mq_free_map_and_requests(set, i); 3073 3074 for (j = 0; j < set->nr_maps; j++) { 3075 kfree(set->map[j].mq_map); 3076 set->map[j].mq_map = NULL; 3077 } 3078 3079 kfree(set->tags); 3080 set->tags = NULL; 3081 } 3082 EXPORT_SYMBOL(blk_mq_free_tag_set); 3083 3084 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 3085 { 3086 struct blk_mq_tag_set *set = q->tag_set; 3087 struct blk_mq_hw_ctx *hctx; 3088 int i, ret; 3089 3090 if (!set) 3091 return -EINVAL; 3092 3093 if (q->nr_requests == nr) 3094 return 0; 3095 3096 blk_mq_freeze_queue(q); 3097 blk_mq_quiesce_queue(q); 3098 3099 ret = 0; 3100 queue_for_each_hw_ctx(q, hctx, i) { 3101 if (!hctx->tags) 3102 continue; 3103 /* 3104 * If we're using an MQ scheduler, just update the scheduler 3105 * queue depth. This is similar to what the old code would do. 
3106 */ 3107 if (!hctx->sched_tags) { 3108 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, 3109 false); 3110 } else { 3111 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 3112 nr, true); 3113 } 3114 if (ret) 3115 break; 3116 } 3117
3118 if (!ret) 3119 q->nr_requests = nr; 3120
3121 blk_mq_unquiesce_queue(q); 3122 blk_mq_unfreeze_queue(q); 3123
3124 return ret; 3125 } 3126
3127 /* 3128 * request_queue and elevator_type pair. 3129 * It is just used by __blk_mq_update_nr_hw_queues to cache 3130 * the elevator_type associated with a request_queue. 3131 */
3132 struct blk_mq_qe_pair { 3133 struct list_head node; 3134 struct request_queue *q; 3135 struct elevator_type *type; 3136 }; 3137
3138 /* 3139 * Cache the elevator_type in the qe pair list and switch the 3140 * io scheduler to 'none'. 3141 */
3142 static bool blk_mq_elv_switch_none(struct list_head *head, 3143 struct request_queue *q) 3144 { 3145 struct blk_mq_qe_pair *qe; 3146
3147 if (!q->elevator) 3148 return true; 3149
3150 qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 3151 if (!qe) 3152 return false; 3153
3154 INIT_LIST_HEAD(&qe->node); 3155 qe->q = q; 3156 qe->type = q->elevator->type; 3157 list_add(&qe->node, head); 3158
3159 mutex_lock(&q->sysfs_lock);
3160 /* 3161 * After elevator_switch_mq, the previous elevator_queue will be 3162 * released by elevator_release. The reference to the io scheduler 3163 * module obtained by elevator_get will be put as well. So take an 3164 * extra reference to the io scheduler module here to prevent it 3165 * from being removed. 3166 */
3167 __module_get(qe->type->elevator_owner); 3168 elevator_switch_mq(q, NULL); 3169 mutex_unlock(&q->sysfs_lock); 3170
3171 return true; 3172 } 3173
3174 static void blk_mq_elv_switch_back(struct list_head *head, 3175 struct request_queue *q) 3176 { 3177 struct blk_mq_qe_pair *qe; 3178 struct elevator_type *t = NULL; 3179
3180 list_for_each_entry(qe, head, node) 3181 if (qe->q == q) { 3182 t = qe->type; 3183 break; 3184 } 3185
3186 if (!t) 3187 return; 3188
3189 list_del(&qe->node); 3190 kfree(qe); 3191
3192 mutex_lock(&q->sysfs_lock); 3193 elevator_switch_mq(q, t); 3194 mutex_unlock(&q->sysfs_lock); 3195 } 3196
3197 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 3198 int nr_hw_queues) 3199 { 3200 struct request_queue *q; 3201 LIST_HEAD(head); 3202 int prev_nr_hw_queues; 3203
3204 lockdep_assert_held(&set->tag_list_lock); 3205
3206 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) 3207 nr_hw_queues = nr_cpu_ids; 3208 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 3209 return; 3210
3211 list_for_each_entry(q, &set->tag_list, tag_set_list) 3212 blk_mq_freeze_queue(q);
3213 /* 3214 * Sync with blk_mq_queue_tag_busy_iter. 3215 */ 3216 synchronize_rcu();
3217 /* 3218 * Switch IO scheduler to 'none', cleaning up the data associated 3219 * with the previous scheduler. We will switch back once we are done 3220 * updating the new sw to hw queue mappings.
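* Together with the freeze above, the overall sequence below is
* (summary of the code that follows): switch every queue's elevator to
* 'none', unregister the per-hctx sysfs/debugfs entries, update the
* map and reallocate the hardware contexts, re-register sysfs/debugfs,
* restore the cached elevators, and finally unfreeze all queues.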
3221 */ 3222 list_for_each_entry(q, &set->tag_list, tag_set_list) 3223 if (!blk_mq_elv_switch_none(&head, q)) 3224 goto switch_back; 3225 3226 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3227 blk_mq_debugfs_unregister_hctxs(q); 3228 blk_mq_sysfs_unregister(q); 3229 } 3230 3231 prev_nr_hw_queues = set->nr_hw_queues; 3232 set->nr_hw_queues = nr_hw_queues; 3233 blk_mq_update_queue_map(set); 3234 fallback: 3235 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3236 blk_mq_realloc_hw_ctxs(set, q); 3237 if (q->nr_hw_queues != set->nr_hw_queues) { 3238 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 3239 nr_hw_queues, prev_nr_hw_queues); 3240 set->nr_hw_queues = prev_nr_hw_queues; 3241 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 3242 goto fallback; 3243 } 3244 blk_mq_map_swqueue(q); 3245 } 3246 3247 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3248 blk_mq_sysfs_register(q); 3249 blk_mq_debugfs_register_hctxs(q); 3250 } 3251 3252 switch_back: 3253 list_for_each_entry(q, &set->tag_list, tag_set_list) 3254 blk_mq_elv_switch_back(&head, q); 3255 3256 list_for_each_entry(q, &set->tag_list, tag_set_list) 3257 blk_mq_unfreeze_queue(q); 3258 } 3259 3260 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 3261 { 3262 mutex_lock(&set->tag_list_lock); 3263 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 3264 mutex_unlock(&set->tag_list_lock); 3265 } 3266 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 3267 3268 /* Enable polling stats and return whether they were already enabled. */ 3269 static bool blk_poll_stats_enable(struct request_queue *q) 3270 { 3271 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3272 blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) 3273 return true; 3274 blk_stat_add_callback(q, q->poll_cb); 3275 return false; 3276 } 3277 3278 static void blk_mq_poll_stats_start(struct request_queue *q) 3279 { 3280 /* 3281 * We don't arm the callback if polling stats are not enabled or the 3282 * callback is already active. 3283 */ 3284 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3285 blk_stat_is_active(q->poll_cb)) 3286 return; 3287 3288 blk_stat_activate_msecs(q->poll_cb, 100); 3289 } 3290 3291 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) 3292 { 3293 struct request_queue *q = cb->data; 3294 int bucket; 3295 3296 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { 3297 if (cb->stat[bucket].nr_samples) 3298 q->poll_stat[bucket] = cb->stat[bucket]; 3299 } 3300 } 3301 3302 static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 3303 struct blk_mq_hw_ctx *hctx, 3304 struct request *rq) 3305 { 3306 unsigned long ret = 0; 3307 int bucket; 3308 3309 /* 3310 * If stats collection isn't on, don't sleep but turn it on for 3311 * future users 3312 */ 3313 if (!blk_poll_stats_enable(q)) 3314 return 0; 3315 3316 /* 3317 * As an optimistic guess, use half of the mean service time 3318 * for this type of request. We can (and should) make this smarter. 3319 * For instance, if the completion latencies are tight, we can 3320 * get closer than just half the mean. This is especially 3321 * important on devices where the completion latencies are longer 3322 * than ~10 usec. We do use the stats for the relevant IO size 3323 * if available which does lead to better estimates. 
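* For example, with a tracked mean completion time of 20 usec for this
* request's bucket, the code below suggests sleeping for roughly 10
* usec ((mean + 1) / 2) before switching to busy polling.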
3324 */ 3325 bucket = blk_mq_poll_stats_bkt(rq); 3326 if (bucket < 0) 3327 return ret; 3328 3329 if (q->poll_stat[bucket].nr_samples) 3330 ret = (q->poll_stat[bucket].mean + 1) / 2; 3331 3332 return ret; 3333 } 3334 3335 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, 3336 struct blk_mq_hw_ctx *hctx, 3337 struct request *rq) 3338 { 3339 struct hrtimer_sleeper hs; 3340 enum hrtimer_mode mode; 3341 unsigned int nsecs; 3342 ktime_t kt; 3343 3344 if (rq->rq_flags & RQF_MQ_POLL_SLEPT) 3345 return false; 3346 3347 /* 3348 * If we get here, hybrid polling is enabled. Hence poll_nsec can be: 3349 * 3350 * 0: use half of prev avg 3351 * >0: use this specific value 3352 */ 3353 if (q->poll_nsec > 0) 3354 nsecs = q->poll_nsec; 3355 else 3356 nsecs = blk_mq_poll_nsecs(q, hctx, rq); 3357 3358 if (!nsecs) 3359 return false; 3360 3361 rq->rq_flags |= RQF_MQ_POLL_SLEPT; 3362 3363 /* 3364 * This will be replaced with the stats tracking code, using 3365 * 'avg_completion_time / 2' as the pre-sleep target. 3366 */ 3367 kt = nsecs; 3368 3369 mode = HRTIMER_MODE_REL; 3370 hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); 3371 hrtimer_set_expires(&hs.timer, kt); 3372 3373 hrtimer_init_sleeper(&hs, current); 3374 do { 3375 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) 3376 break; 3377 set_current_state(TASK_UNINTERRUPTIBLE); 3378 hrtimer_start_expires(&hs.timer, mode); 3379 if (hs.task) 3380 io_schedule(); 3381 hrtimer_cancel(&hs.timer); 3382 mode = HRTIMER_MODE_ABS; 3383 } while (hs.task && !signal_pending(current)); 3384 3385 __set_current_state(TASK_RUNNING); 3386 destroy_hrtimer_on_stack(&hs.timer); 3387 return true; 3388 } 3389 3390 static bool blk_mq_poll_hybrid(struct request_queue *q, 3391 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) 3392 { 3393 struct request *rq; 3394 3395 if (q->poll_nsec == -1) 3396 return false; 3397 3398 if (!blk_qc_t_is_internal(cookie)) 3399 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 3400 else { 3401 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); 3402 /* 3403 * With scheduling, if the request has completed, we'll 3404 * get a NULL return here, as we clear the sched tag when 3405 * that happens. The request still remains valid, like always, 3406 * so we should be safe with just the NULL check. 3407 */ 3408 if (!rq) 3409 return false; 3410 } 3411 3412 return blk_mq_poll_hybrid_sleep(q, hctx, rq); 3413 } 3414 3415 /** 3416 * blk_poll - poll for IO completions 3417 * @q: the queue 3418 * @cookie: cookie passed back at IO submission time 3419 * @spin: whether to spin for completions 3420 * 3421 * Description: 3422 * Poll for completions on the passed in queue. Returns number of 3423 * completed entries found. If @spin is true, then blk_poll will continue 3424 * looping until at least one completion is found, unless the task is 3425 * otherwise marked running (or we need to reschedule). 3426 */ 3427 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) 3428 { 3429 struct blk_mq_hw_ctx *hctx; 3430 long state; 3431 3432 if (!blk_qc_t_valid(cookie) || 3433 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 3434 return 0; 3435 3436 if (current->plug) 3437 blk_flush_plug_list(current->plug, false); 3438 3439 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 3440 3441 /* 3442 * If we sleep, have the caller restart the poll loop to reset 3443 * the state. Like for the other success return cases, the 3444 * caller is responsible for checking if the IO completed. 
If 3445 * the IO isn't complete, we'll get called again and will go 3446 * straight to the busy poll loop. 3447 */ 3448 if (blk_mq_poll_hybrid(q, hctx, cookie)) 3449 return 1; 3450 3451 hctx->poll_considered++; 3452 3453 state = current->state; 3454 do { 3455 int ret; 3456 3457 hctx->poll_invoked++; 3458 3459 ret = q->mq_ops->poll(hctx); 3460 if (ret > 0) { 3461 hctx->poll_success++; 3462 __set_current_state(TASK_RUNNING); 3463 return ret; 3464 } 3465 3466 if (signal_pending_state(state, current)) 3467 __set_current_state(TASK_RUNNING); 3468 3469 if (current->state == TASK_RUNNING) 3470 return 1; 3471 if (ret < 0 || !spin) 3472 break; 3473 cpu_relax(); 3474 } while (!need_resched()); 3475 3476 __set_current_state(TASK_RUNNING); 3477 return 0; 3478 } 3479 EXPORT_SYMBOL_GPL(blk_poll); 3480 3481 unsigned int blk_mq_rq_cpu(struct request *rq) 3482 { 3483 return rq->mq_ctx->cpu; 3484 } 3485 EXPORT_SYMBOL(blk_mq_rq_cpu); 3486 3487 static int __init blk_mq_init(void) 3488 { 3489 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 3490 blk_mq_hctx_notify_dead); 3491 return 0; 3492 } 3493 subsys_initcall(blk_mq_init); 3494
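/*
 * Illustrative note, not functional code: the blk_qc_t cookie that
 * blk_mq_make_request() returns encodes the hardware queue number and the
 * request's tag (see request_to_qc_t()), which is how blk_poll() above finds
 * the right hctx and request to poll for. A synchronous polling caller is
 * expected to save that cookie at submission time and keep passing it back
 * into blk_poll() until the request completes.
 */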