// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if the first allocation attempt fails, the other shared-tag users will
 * still reserve budget for it.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
		struct request_queue *q = hctx->queue;
		struct blk_mq_tag_set *set = q->tag_set;

		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
			atomic_inc(&set->active_queues_shared_sbitmap);
	} else {
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			atomic_inc(&hctx->tags->active_queues);
	}

	return true;
}

/*
 * Wake up all waiters potentially sleeping on tags.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
	sbitmap_queue_wake_all(tags->bitmap_tags);
	if (include_reserve)
		sbitmap_queue_wake_all(tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;
	struct request_queue *q = hctx->queue;
	struct blk_mq_tag_set *set = q->tag_set;

	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
					&q->queue_flags))
			return;
		atomic_dec(&set->active_queues_shared_sbitmap);
	} else {
		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			return;
		atomic_dec(&tags->active_queues);
	}

	blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
			    struct sbitmap_queue *bt)
{
	if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
			!hctx_may_queue(data->hctx, bt))
		return BLK_MQ_NO_TAG;

	if (data->shallow_depth)
		return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
	else
		return __sbitmap_queue_get(bt);
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt;
	struct sbq_wait_state *ws;
	DEFINE_SBQ_WAIT(wait);
	unsigned int tag_offset;
	int tag;

	if (data->flags & BLK_MQ_REQ_RESERVED) {
		if (unlikely(!tags->nr_reserved_tags)) {
			WARN_ON_ONCE(1);
			return BLK_MQ_NO_TAG;
		}
		bt = tags->breserved_tags;
		tag_offset = 0;
	} else {
		bt = tags->bitmap_tags;
		tag_offset = tags->nr_reserved_tags;
	}

	tag = __blk_mq_get_tag(data, bt);
	if (tag != BLK_MQ_NO_TAG)
		goto found_tag;

	if (data->flags & BLK_MQ_REQ_NOWAIT)
		return BLK_MQ_NO_TAG;

	ws = bt_wait_ptr(bt, data->hctx);
	do {
		struct sbitmap_queue *bt_prev;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(data->hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		bt_prev = bt;
		io_schedule();

		sbitmap_finish_wait(bt, ws, &wait);

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
						data->ctx);
		tags = blk_mq_tags_from_data(data);
		if (data->flags & BLK_MQ_REQ_RESERVED)
			bt = tags->breserved_tags;
		else
			bt = tags->bitmap_tags;

		/*
		 * If the destination hw queue has changed, issue a fake
		 * wakeup on the previous queue to compensate for a possibly
		 * missed wakeup, so other allocations on the previous queue
		 * won't be starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);

	sbitmap_finish_wait(bt, ws, &wait);

found_tag:
	/*
	 * Give up this allocation if the hctx is inactive. The caller will
	 * retry on an active hctx.
	 */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;
}

void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
		    unsigned int tag)
{
	if (!blk_mq_tag_is_reserved(tags, tag)) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
	}
}

struct bt_iter_data {
	struct blk_mq_hw_ctx *hctx;
	busy_iter_fn *fn;
	void *data;
	bool reserved;
};

static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
		unsigned int bitnr)
{
	struct request *rq;
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_iter_data *iter_data = data;
	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
	struct blk_mq_tags *tags = hctx->tags;
	bool reserved = iter_data->reserved;
	struct request *rq;
	bool ret = true;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;
	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	rq = blk_mq_find_and_get_req(tags, bitnr);
	if (!rq)
		return true;

	if (rq->q == hctx->queue && rq->mq_hctx == hctx)
		ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
	blk_mq_put_rq_ref(rq);
	return ret;
}
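
/*
 * Illustrative sketch, not part of the original file: a minimal busy_iter_fn
 * of the kind bt_iter() invokes for every set tag bit. It counts the requests
 * on a hardware queue that have been started but not yet completed. The
 * "example_" name and the counter passed through @data are made up for
 * illustration only; blk_mq_request_started()/blk_mq_request_completed() are
 * the public helpers from <linux/blk-mq.h>.
 */
static bool __maybe_unused example_count_inflight(struct blk_mq_hw_ctx *hctx,
						  struct request *rq,
						  void *data, bool reserved)
{
	unsigned int *inflight = data;

	if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
		(*inflight)++;

	/* Returning true keeps bt_iter()/bt_for_each() iterating. */
	return true;
}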

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:	Hardware queue to examine.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each request
 *		associated with @hctx that has been assigned a driver tag.
 *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *		where rq is a pointer to a request. Return true to continue
 *		iterating tags, false to stop.
 * @data:	Will be passed as third argument to @fn.
 * @reserved:	Indicates whether @bt is the breserved_tags member or the
 *		bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
			busy_iter_fn *fn, void *data, bool reserved)
{
	struct bt_iter_data iter_data = {
		.hctx = hctx,
		.fn = fn,
		.data = data,
		.reserved = reserved,
	};

	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
	struct blk_mq_tags *tags;
	busy_tag_iter_fn *fn;
	void *data;
	unsigned int flags;
};

#define BT_TAG_ITER_RESERVED		(1 << 0)
#define BT_TAG_ITER_STARTED		(1 << 1)
#define BT_TAG_ITER_STATIC_RQS		(1 << 2)

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_tags_iter_data *iter_data = data;
	struct blk_mq_tags *tags = iter_data->tags;
	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
	struct request *rq;
	bool ret = true;
	bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);

	if (!reserved)
		bitnr += tags->nr_reserved_tags;

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (iter_static_rqs)
		rq = tags->static_rqs[bitnr];
	else
		rq = blk_mq_find_and_get_req(tags, bitnr);
	if (!rq)
		return true;

	if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
	    blk_mq_request_started(rq))
		ret = iter_data->fn(rq, iter_data->data, reserved);
	if (!iter_static_rqs)
		blk_mq_put_rq_ref(rq);
	return ret;
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:	Tag map to iterate over.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @data,
 *		@reserved) where rq is a pointer to a request. Return true
 *		to continue iterating tags, false to stop.
 * @data:	Will be passed as second argument to @fn.
 * @flags:	BT_TAG_ITER_*
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
			     busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
	struct bt_tags_iter_data iter_data = {
		.tags = tags,
		.fn = fn,
		.data = data,
		.flags = flags,
	};

	if (tags->rqs)
		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
		busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
	WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

	if (tags->nr_reserved_tags)
		bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
				 flags | BT_TAG_ITER_RESERVED);
	bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}
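
/*
 * Illustrative sketch, not part of the original file: how a busy_tag_iter_fn
 * and blk_mq_tagset_busy_iter() (defined further below and declared in
 * <linux/blk-mq.h>) fit together. The callback counts started requests across
 * a whole tag set, mirroring the pattern used by
 * blk_mq_tagset_wait_completed_request(). The "example_" names are invented
 * for illustration only.
 */
static bool __maybe_unused example_count_started(struct request *rq, void *data,
						 bool reserved)
{
	unsigned int *count = data;

	if (blk_mq_request_started(rq))
		(*count)++;
	return true;
}

static unsigned int __maybe_unused example_nr_started(struct blk_mq_tag_set *set)
{
	unsigned int count = 0;

	blk_mq_tagset_busy_iter(set, example_count_started, &count);
	return count;
}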

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:	Tag map to iterate over.
 * @fn:		Pointer to the function that will be called for each
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * Caller has to pass the tag map from which requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv)
{
	__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:	Tag set to iterate over.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * We grab one request reference before calling @fn and release it after
 * @fn returns.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv)
{
	int i;

	for (i = 0; i < tagset->nr_hw_queues; i++) {
		if (tagset->tags && tagset->tags[i])
			__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
					      BT_TAG_ITER_STARTED);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
		void *data, bool reserved)
{
	unsigned *count = data;

	if (blk_mq_request_completed(rq))
		(*count)++;
	return true;
}

/**
 * blk_mq_tagset_wait_completed_request - Wait until all scheduled request
 * completions have finished.
 * @tagset:	Tag set to drain completed requests from.
 *
 * Note: This function has to be run after all IO queues are shut down.
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
	while (true) {
		unsigned count = 0;

		blk_mq_tagset_busy_iter(tagset,
				blk_mq_tagset_count_completed_rqs, &count);
		if (!count)
			break;
		msleep(5);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:		Request queue to examine.
 * @fn:		Pointer to the function that will be called for each request
 *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *		reserved) where rq is a pointer to a request and hctx points
 *		to the hardware queue associated with the request. 'reserved'
 *		indicates whether or not @rq is a reserved request.
 * @priv:	Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	/*
	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
	 * while the queue is frozen. So we can use q_usage_counter to avoid
	 * racing with it.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_tags *tags = hctx->tags;

		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check.
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		if (tags->nr_reserved_tags)
			bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
		bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
	}
	blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
		    bool round_robin, int node)
{
	return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
				       node);
}

int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
			struct sbitmap_queue *breserved_tags,
			unsigned int queue_depth, unsigned int reserved,
			int node, int alloc_policy)
{
	unsigned int depth = queue_depth - reserved;
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

	if (bt_alloc(bitmap_tags, depth, round_robin, node))
		return -ENOMEM;
	if (bt_alloc(breserved_tags, reserved, round_robin, node))
		goto free_bitmap_tags;

	return 0;

free_bitmap_tags:
	sbitmap_queue_free(bitmap_tags);
	return -ENOMEM;
}

static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
				   int node, int alloc_policy)
{
	int ret;

	ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
				  &tags->__breserved_tags,
				  tags->nr_tags, tags->nr_reserved_tags,
				  node, alloc_policy);
	if (ret)
		return ret;

	tags->bitmap_tags = &tags->__bitmap_tags;
	tags->breserved_tags = &tags->__breserved_tags;

	return 0;
}

int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
	int i, ret;

	ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
				  set->queue_depth, set->reserved_tags,
				  set->numa_node, alloc_policy);
	if (ret)
		return ret;

	for (i = 0; i < set->nr_hw_queues; i++) {
		struct blk_mq_tags *tags = set->tags[i];

		tags->bitmap_tags = &set->__bitmap_tags;
		tags->breserved_tags = &set->__breserved_tags;
	}

	return 0;
}

void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
	sbitmap_queue_free(&set->__bitmap_tags);
	sbitmap_queue_free(&set->__breserved_tags);
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags,
				     int node, unsigned int flags)
{
	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;
	spin_lock_init(&tags->lock);

	if (blk_mq_is_sbitmap_shared(flags))
		return tags;

	if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
		kfree(tags);
		return NULL;
	}
	return tags;
}

void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
	if (!blk_mq_is_sbitmap_shared(flags)) {
		sbitmap_queue_free(tags->bitmap_tags);
		sbitmap_queue_free(tags->breserved_tags);
	}
	kfree(tags);
}
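
/*
 * Illustrative sketch, not part of the original file: how blk_mq_init_tags()
 * and blk_mq_free_tags() pair up for a per-hctx, non-shared tag map. In the
 * real code this is driven from blk_mq_alloc_rq_map() in blk-mq.c; the depth
 * of 256 tags, the single reserved tag and the "example_" name are made up
 * for illustration only.
 */
static struct blk_mq_tags *__maybe_unused example_alloc_tag_map(int node)
{
	/*
	 * 256 tags in total, one of them reserved, and no shared-sbitmap
	 * flags. The caller would eventually release the map again with
	 * blk_mq_free_tags(tags, 0).
	 */
	return blk_mq_init_tags(256, 1, node, 0);
}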

int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
			    struct blk_mq_tags **tagsptr, unsigned int tdepth,
			    bool can_grow)
{
	struct blk_mq_tags *tags = *tagsptr;

	if (tdepth <= tags->nr_reserved_tags)
		return -EINVAL;

	/*
	 * If we are allowed to grow beyond the original size, allocate
	 * a new set of tags before freeing the old one.
	 */
	if (tdepth > tags->nr_tags) {
		struct blk_mq_tag_set *set = hctx->queue->tag_set;
		struct blk_mq_tags *new;
		bool ret;

		if (!can_grow)
			return -EINVAL;

		/*
		 * We need some sort of upper limit; set it high enough that
		 * no valid use case should require more.
		 */
		if (tdepth > MAX_SCHED_RQ)
			return -EINVAL;

		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
				tags->nr_reserved_tags, set->flags);
		if (!new)
			return -ENOMEM;
		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
		if (ret) {
			blk_mq_free_rq_map(new, set->flags);
			return -ENOMEM;
		}

		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
		blk_mq_free_rq_map(*tagsptr, set->flags);
		*tagsptr = new;
	} else {
		/*
		 * We don't need to (and can't) update reserved tags here;
		 * they remain static and should never need resizing.
		 */
		sbitmap_queue_resize(tags->bitmap_tags,
				tdepth - tags->nr_reserved_tags);
	}

	return 0;
}

void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
	sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq:		request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function, which returns a tag with the
 * hardware context index in the upper bits and the per-hardware-queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
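
/*
 * Illustrative sketch, not part of the original file: how a driver (SCSI LLDs
 * are the typical users) decodes the value returned by blk_mq_unique_tag()
 * using the helpers from <linux/blk-mq.h>. The "example_" name is made up for
 * illustration only.
 */
static void __maybe_unused example_decode_unique_tag(struct request *rq)
{
	u32 unique = blk_mq_unique_tag(rq);
	u16 hwq = blk_mq_unique_tag_to_hwq(unique);	/* upper bits */
	u16 tag = blk_mq_unique_tag_to_tag(unique);	/* lower bits */

	pr_debug("rq %p: hw queue %u, per-queue tag %u\n", rq, hwq, tag);
}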