// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if we fail to get a tag the first time, the other shared-tag users can
 * reserve budget for it.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (blk_mq_is_shared_tags(hctx->flags)) {
		struct request_queue *q = hctx->queue;

		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
			atomic_inc(&hctx->tags->active_queues);
	} else {
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			atomic_inc(&hctx->tags->active_queues);
	}

	return true;
}

/*
 * Wake up all tasks potentially sleeping on tags.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
	sbitmap_queue_wake_all(&tags->bitmap_tags);
	if (include_reserve)
		sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (blk_mq_is_shared_tags(hctx->flags)) {
		struct request_queue *q = hctx->queue;

		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
					&q->queue_flags))
			return;
	} else {
		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			return;
	}

	atomic_dec(&tags->active_queues);

	blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
			    struct sbitmap_queue *bt)
{
	if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
	    !hctx_may_queue(data->hctx, bt))
		return BLK_MQ_NO_TAG;

	if (data->shallow_depth)
		return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
	else
		return __sbitmap_queue_get(bt);
}
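/*
 * For reference: hctx_may_queue(), used by __blk_mq_get_tag() above, lives in
 * blk-mq.h rather than in this file. Roughly sketched (the exact helper may
 * differ in detail), it caps each active user of a shared tag map at its fair
 * share of the bitmap depth:
 *
 *	users = atomic_read(&hctx->tags->active_queues);
 *	depth = max((bt->sb.depth + users - 1) / users, 4U);
 *	may_queue = __blk_mq_active_requests(hctx) < depth;
 *
 * e.g. a 128-tag shared map with 4 active queues allows roughly 32 in-flight
 * requests per queue, so one busy submitter cannot starve the others.
 */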

unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
			      unsigned int *offset)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt = &tags->bitmap_tags;
	unsigned long ret;

	if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
	    data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		return 0;
	ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
	*offset += tags->nr_reserved_tags;
	return ret;
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt;
	struct sbq_wait_state *ws;
	DEFINE_SBQ_WAIT(wait);
	unsigned int tag_offset;
	int tag;

	if (data->flags & BLK_MQ_REQ_RESERVED) {
		if (unlikely(!tags->nr_reserved_tags)) {
			WARN_ON_ONCE(1);
			return BLK_MQ_NO_TAG;
		}
		bt = &tags->breserved_tags;
		tag_offset = 0;
	} else {
		bt = &tags->bitmap_tags;
		tag_offset = tags->nr_reserved_tags;
	}

	tag = __blk_mq_get_tag(data, bt);
	if (tag != BLK_MQ_NO_TAG)
		goto found_tag;

	if (data->flags & BLK_MQ_REQ_NOWAIT)
		return BLK_MQ_NO_TAG;

	ws = bt_wait_ptr(bt, data->hctx);
	do {
		struct sbitmap_queue *bt_prev;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(data->hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		bt_prev = bt;
		io_schedule();

		sbitmap_finish_wait(bt, ws, &wait);

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
					      data->ctx);
		tags = blk_mq_tags_from_data(data);
		if (data->flags & BLK_MQ_REQ_RESERVED)
			bt = &tags->breserved_tags;
		else
			bt = &tags->bitmap_tags;

		/*
		 * If the destination hw queue changed, fake a wake up on
		 * the previous queue to compensate for the missed wakeup,
		 * so other allocations on the previous queue won't be
		 * starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);

	sbitmap_finish_wait(bt, ws, &wait);

found_tag:
	/*
	 * Give up this allocation if the hctx is inactive. The caller will
	 * retry on an active hctx.
	 */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;
}

void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
		    unsigned int tag)
{
	if (!blk_mq_tag_is_reserved(tags, tag)) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
	}
}
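/*
 * Worked example of the tag numbering used above (illustrative values only):
 * with tags->nr_reserved_tags == 2, reserved tags occupy values 0..1 and
 * normal tags start at 2. If bit 0 of bitmap_tags is allocated,
 * blk_mq_get_tag() returns 0 + tag_offset == 2; blk_mq_put_tag() later sees
 * that tag 2 is not reserved and clears bit 2 - nr_reserved_tags == 0 in
 * bitmap_tags. Reserved tags are passed through unshifted.
 */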

void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
				  tag_array, nr_tags);
}

struct bt_iter_data {
	struct blk_mq_hw_ctx *hctx;
	busy_iter_fn *fn;
	void *data;
	bool reserved;
};

static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
					       unsigned int bitnr)
{
	struct request *rq;
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_iter_data *iter_data = data;
	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
	struct blk_mq_tags *tags = hctx->tags;
	bool reserved = iter_data->reserved;
	struct request *rq;
	bool ret = true;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;
	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	rq = blk_mq_find_and_get_req(tags, bitnr);
	if (!rq)
		return true;

	if (rq->q == hctx->queue && rq->mq_hctx == hctx)
		ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
	blk_mq_put_rq_ref(rq);
	return ret;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:	Hardware queue to examine.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each request
 *		associated with @hctx that has been assigned a driver tag.
 *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *		where rq is a pointer to a request. Return true to continue
 *		iterating tags, false to stop.
 * @data:	Will be passed as third argument to @fn.
 * @reserved:	Indicates whether @bt is the breserved_tags member or the
 *		bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
			busy_iter_fn *fn, void *data, bool reserved)
{
	struct bt_iter_data iter_data = {
		.hctx = hctx,
		.fn = fn,
		.data = data,
		.reserved = reserved,
	};

	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
	struct blk_mq_tags *tags;
	busy_tag_iter_fn *fn;
	void *data;
	unsigned int flags;
};

#define BT_TAG_ITER_RESERVED		(1 << 0)
#define BT_TAG_ITER_STARTED		(1 << 1)
#define BT_TAG_ITER_STATIC_RQS		(1 << 2)

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_tags_iter_data *iter_data = data;
	struct blk_mq_tags *tags = iter_data->tags;
	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
	struct request *rq;
	bool ret = true;
	bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);

	if (!reserved)
		bitnr += tags->nr_reserved_tags;

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (iter_static_rqs)
		rq = tags->static_rqs[bitnr];
	else
		rq = blk_mq_find_and_get_req(tags, bitnr);
	if (!rq)
		return true;

	if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
	    blk_mq_request_started(rq))
		ret = iter_data->fn(rq, iter_data->data, reserved);
	if (!iter_static_rqs)
		blk_mq_put_rq_ref(rq);
	return ret;
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:	Tag map to iterate over.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @data,
 *		@reserved) where rq is a pointer to a request. Return true
 *		to continue iterating tags, false to stop.
 * @data:	Will be passed as second argument to @fn.
 * @flags:	BT_TAG_ITER_*
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
			     busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
	struct bt_tags_iter_data iter_data = {
		.tags = tags,
		.fn = fn,
		.data = data,
		.flags = flags,
	};

	if (tags->rqs)
		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
		busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
	WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

	if (tags->nr_reserved_tags)
		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
				 flags | BT_TAG_ITER_RESERVED);
	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:	Tag map to iterate over.
 * @fn:		Pointer to the function that will be called for each
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * Caller has to pass the tag map from which requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv)
{
	__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}
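/*
 * Minimal usage sketch for blk_mq_all_tag_iter(); the callback name and pdu
 * type are placeholders, not part of this file. A tag map owner can walk
 * every statically allocated request, e.g. to reinitialise driver-private
 * data:
 *
 *	static bool my_reinit_rq(struct request *rq, void *priv, bool reserved)
 *	{
 *		memset(blk_mq_rq_to_pdu(rq), 0, sizeof(struct my_cmd));
 *		return true;	// keep iterating
 *	}
 *
 *	blk_mq_all_tag_iter(tags, my_reinit_rq, NULL);
 */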

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:	Tag set to iterate over.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * We grab one request reference before calling @fn and release it after
 * @fn returns.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv)
{
	unsigned int flags = tagset->flags;
	int i, nr_tags;

	nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;

	for (i = 0; i < nr_tags; i++) {
		if (tagset->tags && tagset->tags[i])
			__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
					      BT_TAG_ITER_STARTED);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
		void *data, bool reserved)
{
	unsigned *count = data;

	if (blk_mq_request_completed(rq))
		(*count)++;
	return true;
}

/**
 * blk_mq_tagset_wait_completed_request - Wait until all scheduled request
 * completions have finished.
 * @tagset:	Tag set to drain completed requests from.
 *
 * Note: This function has to be run after all IO queues have been shut down.
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
	while (true) {
		unsigned count = 0;

		blk_mq_tagset_busy_iter(tagset,
				blk_mq_tagset_count_completed_rqs, &count);
		if (!count)
			break;
		msleep(5);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
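/*
 * Rough sketch of the teardown pattern built from the two helpers above, as
 * used by drivers such as NVMe during error recovery: cancel every started
 * request via a busy_tag_iter_fn that completes it with an error, then wait
 * for those completions to finish. 'ctrl' and 'cancel_rq_fn' are placeholders
 * for driver-specific state and a driver-provided callback:
 *
 *	blk_mq_tagset_busy_iter(&ctrl->tagset, cancel_rq_fn, &ctrl);
 *	blk_mq_tagset_wait_completed_request(&ctrl->tagset);
 */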

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:		Request queue to examine.
 * @fn:		Pointer to the function that will be called for each request
 *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *		reserved) where rq is a pointer to a request and hctx points
 *		to the hardware queue associated with the request. 'reserved'
 *		indicates whether or not @rq is a reserved request.
 * @priv:	Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	/*
	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
	 * while the queue is frozen. So we can use q_usage_counter to avoid
	 * racing with it.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_tags *tags = hctx->tags;

		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check.
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		if (tags->nr_reserved_tags)
			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
	}
	blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
		    bool round_robin, int node)
{
	return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
				       node);
}

int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
			struct sbitmap_queue *breserved_tags,
			unsigned int queue_depth, unsigned int reserved,
			int node, int alloc_policy)
{
	unsigned int depth = queue_depth - reserved;
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

	if (bt_alloc(bitmap_tags, depth, round_robin, node))
		return -ENOMEM;
	if (bt_alloc(breserved_tags, reserved, round_robin, node))
		goto free_bitmap_tags;

	return 0;

free_bitmap_tags:
	sbitmap_queue_free(bitmap_tags);
	return -ENOMEM;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags,
				     int node, int alloc_policy)
{
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;
	spin_lock_init(&tags->lock);

	if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
				total_tags, reserved_tags, node,
				alloc_policy) < 0) {
		kfree(tags);
		return NULL;
	}
	return tags;
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
	sbitmap_queue_free(&tags->bitmap_tags);
	sbitmap_queue_free(&tags->breserved_tags);
	kfree(tags);
}
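/*
 * Worked example for the setup helpers above (illustrative values only): a
 * driver asking for total_tags = 64 and reserved_tags = 2 ends up with a
 * 62-deep bitmap_tags map and a 2-deep breserved_tags map:
 *
 *	tags = blk_mq_init_tags(64, 2, numa_node, BLK_TAG_ALLOC_FIFO);
 *
 * Reserved tags are only handed out for BLK_MQ_REQ_RESERVED allocations, so
 * they remain available for commands that must not fail to get a tag.
 */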

int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
			    struct blk_mq_tags **tagsptr, unsigned int tdepth,
			    bool can_grow)
{
	struct blk_mq_tags *tags = *tagsptr;

	if (tdepth <= tags->nr_reserved_tags)
		return -EINVAL;

	/*
	 * If we are allowed to grow beyond the original size, allocate
	 * a new set of tags before freeing the old one.
	 */
	if (tdepth > tags->nr_tags) {
		struct blk_mq_tag_set *set = hctx->queue->tag_set;
		struct blk_mq_tags *new;

		if (!can_grow)
			return -EINVAL;

		/*
		 * We need some sort of upper limit; set it high enough that
		 * no valid use cases should require more.
		 */
		if (tdepth > MAX_SCHED_RQ)
			return -EINVAL;

		/*
		 * Only the sbitmap needs resizing since we allocated the max
		 * initially.
		 */
		if (blk_mq_is_shared_tags(set->flags))
			return 0;

		new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
		if (!new)
			return -ENOMEM;

		blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
		*tagsptr = new;
	} else {
		/*
		 * Don't need (or can't) update reserved tags here, they
		 * remain static and should never need resizing.
		 */
		sbitmap_queue_resize(&tags->bitmap_tags,
				     tdepth - tags->nr_reserved_tags);
	}

	return 0;
}

void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{
	struct blk_mq_tags *tags = set->shared_tags;

	sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
}

void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
	sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
			     q->nr_requests - q->tag_set->reserved_tags);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function, which returns a tag with the
 * hardware context index in the upper bits and the per-hardware-queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
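/*
 * Decoding sketch for blk_mq_unique_tag(): the inverse helpers
 * blk_mq_unique_tag_to_hwq() and blk_mq_unique_tag_to_tag(), declared in
 * include/linux/blk-mq.h, split the value back apart, e.g. in a SCSI host's
 * completion path:
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);	// rq->mq_hctx->queue_num
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);	// rq->tag
 */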