// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even if
 * the first allocation attempt fails, the other shared-tag users still
 * reserve budget for this queue.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		atomic_inc(&hctx->tags->active_queues);

	return true;
}

/*
 * Wake up all tasks potentially sleeping on tags
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
	sbitmap_queue_wake_all(&tags->bitmap_tags);
	if (include_reserve)
		sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		return;

	atomic_dec(&tags->active_queues);

	blk_mq_tag_wakeup_all(tags, false);
}
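
/*
 * Illustrative sketch only (kept out of the build): roughly the kind of
 * fair-share check that hctx_may_queue() applies when the tag map is shared,
 * based on the active_queues count maintained above. The real helper lives
 * in the blk-mq headers; the exact heuristics below (ceiling division by the
 * active user count, a minimum share of 4) and the function name are
 * assumptions for illustration, not a copy of the in-tree code.
 */
#if 0
static bool example_shared_tag_budget(struct blk_mq_hw_ctx *hctx,
				      struct sbitmap_queue *bt)
{
	unsigned int users, depth;

	/* Only queues marked TAG_ACTIVE count against the shared budget. */
	users = atomic_read(&hctx->tags->active_queues);
	if (!users)
		return true;

	/* Give each active user a roughly equal slice, but never less than 4. */
	depth = max((bt->sb.depth + users - 1) / users, 4U);
	return atomic_read(&hctx->nr_active) < depth;
}
#endif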

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
			    struct sbitmap_queue *bt)
{
	if (!data->q->elevator && !hctx_may_queue(data->hctx, bt))
		return BLK_MQ_NO_TAG;

	if (data->shallow_depth)
		return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
	else
		return __sbitmap_queue_get(bt);
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt;
	struct sbq_wait_state *ws;
	DEFINE_SBQ_WAIT(wait);
	unsigned int tag_offset;
	int tag;

	if (data->flags & BLK_MQ_REQ_RESERVED) {
		if (unlikely(!tags->nr_reserved_tags)) {
			WARN_ON_ONCE(1);
			return BLK_MQ_NO_TAG;
		}
		bt = &tags->breserved_tags;
		tag_offset = 0;
	} else {
		bt = &tags->bitmap_tags;
		tag_offset = tags->nr_reserved_tags;
	}

	tag = __blk_mq_get_tag(data, bt);
	if (tag != BLK_MQ_NO_TAG)
		goto found_tag;

	if (data->flags & BLK_MQ_REQ_NOWAIT)
		return BLK_MQ_NO_TAG;

	ws = bt_wait_ptr(bt, data->hctx);
	do {
		struct sbitmap_queue *bt_prev;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(data->hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		bt_prev = bt;
		io_schedule();

		sbitmap_finish_wait(bt, ws, &wait);

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
					      data->ctx);
		tags = blk_mq_tags_from_data(data);
		if (data->flags & BLK_MQ_REQ_RESERVED)
			bt = &tags->breserved_tags;
		else
			bt = &tags->bitmap_tags;

		/*
		 * If the destination hw queue changed, fake a wake up on
		 * the previous queue to compensate for the missed wake up,
		 * so other allocations on the previous queue won't be
		 * starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);

	sbitmap_finish_wait(bt, ws, &wait);

found_tag:
	/*
	 * Give up this allocation if the hctx is inactive. The caller will
	 * retry on an active hctx.
	 */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;
}

void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
		    unsigned int tag)
{
	if (!blk_mq_tag_is_reserved(tags, tag)) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
	}
}

struct bt_iter_data {
	struct blk_mq_hw_ctx *hctx;
	busy_iter_fn *fn;
	void *data;
	bool reserved;
};

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_iter_data *iter_data = data;
	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
	struct blk_mq_tags *tags = hctx->tags;
	bool reserved = iter_data->reserved;
	struct request *rq;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;
	rq = tags->rqs[bitnr];

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (rq && rq->q == hctx->queue)
		return iter_data->fn(hctx, rq, iter_data->data, reserved);
	return true;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:	Hardware queue to examine.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each request
 *		associated with @hctx that has been assigned a driver tag.
 *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *		where rq is a pointer to a request. Return true to continue
 *		iterating tags, false to stop.
 * @data:	Will be passed as third argument to @fn.
 * @reserved:	Indicates whether @bt is the breserved_tags member or the
 *		bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
			busy_iter_fn *fn, void *data, bool reserved)
{
	struct bt_iter_data iter_data = {
		.hctx = hctx,
		.fn = fn,
		.data = data,
		.reserved = reserved,
	};

	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
	struct blk_mq_tags *tags;
	busy_tag_iter_fn *fn;
	void *data;
	unsigned int flags;
};

#define BT_TAG_ITER_RESERVED		(1 << 0)
#define BT_TAG_ITER_STARTED		(1 << 1)
#define BT_TAG_ITER_STATIC_RQS		(1 << 2)

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_tags_iter_data *iter_data = data;
	struct blk_mq_tags *tags = iter_data->tags;
	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
	struct request *rq;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
		rq = tags->static_rqs[bitnr];
	else
		rq = tags->rqs[bitnr];
	if (!rq)
		return true;
	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
	    !blk_mq_request_started(rq))
		return true;
	return iter_data->fn(rq, iter_data->data, reserved);
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:	Tag map to iterate over.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @data,
 *		@reserved) where rq is a pointer to a request. Return true
 *		to continue iterating tags, false to stop.
 * @data:	Will be passed as second argument to @fn.
 * @flags:	BT_TAG_ITER_*
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
			     busy_tag_iter_fn *fn, void *data,
			     unsigned int flags)
{
	struct bt_tags_iter_data iter_data = {
		.tags = tags,
		.fn = fn,
		.data = data,
		.flags = flags,
	};

	if (tags->rqs)
		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
		busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
	WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

	if (tags->nr_reserved_tags)
		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
				 flags | BT_TAG_ITER_RESERVED);
	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:	Tag map to iterate over.
 * @fn:		Pointer to the function that will be called for each
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * Caller has to pass the tag map from which requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv)
{
	__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:	Tag set to iterate over.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv)
{
	int i;

	for (i = 0; i < tagset->nr_hw_queues; i++) {
		if (tagset->tags && tagset->tags[i])
			__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
					      BT_TAG_ITER_STARTED);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
		void *data, bool reserved)
{
	unsigned *count = data;

	if (blk_mq_request_completed(rq))
		(*count)++;
	return true;
}

/**
 * blk_mq_tagset_wait_completed_request - wait until the completion function
 * of every completed request has run
 * @tagset:	Tag set to drain completed requests from
 *
 * Note: This function has to be run after all IO queues are shut down.
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
	while (true) {
		unsigned count = 0;

		blk_mq_tagset_busy_iter(tagset,
				blk_mq_tagset_count_completed_rqs, &count);
		if (!count)
			break;
		msleep(5);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
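
/*
 * Illustrative sketch only (kept out of the build): how a driver teardown
 * path might pair blk_mq_tagset_busy_iter() with
 * blk_mq_tagset_wait_completed_request(). The callback and the shutdown
 * helper below are hypothetical names; a real driver would also record an
 * error status in its per-request data before completing each request.
 */
#if 0
static bool example_cancel_rq(struct request *rq, void *data, bool reserved)
{
	/* Force completion of every started request. */
	blk_mq_complete_request(rq);
	return true;
}

static void example_shutdown_tagset(struct blk_mq_tag_set *set)
{
	/* Fail all in-flight requests... */
	blk_mq_tagset_busy_iter(set, example_cancel_rq, NULL);
	/* ...then wait until every completion handler has finished running. */
	blk_mq_tagset_wait_completed_request(set);
}
#endif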

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:		Request queue to examine.
 * @fn:		Pointer to the function that will be called for each request
 *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *		reserved) where rq is a pointer to a request and hctx points
 *		to the hardware queue associated with the request. 'reserved'
 *		indicates whether or not @rq is a reserved request.
 * @priv:	Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	/*
	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
	 * while the queue is frozen. So we can use q_usage_counter to avoid
	 * racing with it. __blk_mq_update_nr_hw_queues() uses
	 * synchronize_rcu() to ensure this function left the critical section
	 * below.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_tags *tags = hctx->tags;

		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		if (tags->nr_reserved_tags)
			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
	}
	blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
		    bool round_robin, int node)
{
	return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
				       node);
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
						   int node, int alloc_policy)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

	if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
		goto free_tags;
	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
		     node))
		goto free_bitmap_tags;

	return tags;
free_bitmap_tags:
	sbitmap_queue_free(&tags->bitmap_tags);
free_tags:
	kfree(tags);
	return NULL;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags,
				     int node, int alloc_policy)
{
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;

	return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
	sbitmap_queue_free(&tags->bitmap_tags);
	sbitmap_queue_free(&tags->breserved_tags);
	kfree(tags);
}

int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
			    struct blk_mq_tags **tagsptr, unsigned int tdepth,
			    bool can_grow)
{
	struct blk_mq_tags *tags = *tagsptr;

	if (tdepth <= tags->nr_reserved_tags)
		return -EINVAL;

	/*
	 * If we are allowed to grow beyond the original size, allocate
	 * a new set of tags before freeing the old one.
	 */
	if (tdepth > tags->nr_tags) {
		struct blk_mq_tag_set *set = hctx->queue->tag_set;
		struct blk_mq_tags *new;
		bool ret;

		if (!can_grow)
			return -EINVAL;

		/*
		 * We need some sort of upper limit, set it high enough that
		 * no valid use cases should require more.
		 */
		if (tdepth > 16 * BLKDEV_MAX_RQ)
			return -EINVAL;

		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
				tags->nr_reserved_tags);
		if (!new)
			return -ENOMEM;
		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
		if (ret) {
			blk_mq_free_rq_map(new);
			return -ENOMEM;
		}

		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
		blk_mq_free_rq_map(*tagsptr);
		*tagsptr = new;
	} else {
		/*
		 * Don't need (or can't) update reserved tags here, they
		 * remain static and should never need resizing.
		 */
		sbitmap_queue_resize(&tags->bitmap_tags,
				tdepth - tags->nr_reserved_tags);
	}

	return 0;
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq:		request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function that returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
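
/*
 * Illustrative sketch only (kept out of the build): how a driver completion
 * path might decode the queue-wide value returned by blk_mq_unique_tag().
 * It assumes the blk_mq_unique_tag_to_hwq() and blk_mq_unique_tag_to_tag()
 * helpers from <linux/blk-mq.h>; the function name below is hypothetical.
 */
#if 0
static void example_log_unique_tag(struct request *rq)
{
	u32 unique = blk_mq_unique_tag(rq);

	/* Upper bits: hardware queue index; lower bits: per-queue tag. */
	pr_debug("rq %p: hwq %u, tag %u\n", rq,
		 blk_mq_unique_tag_to_hwq(unique),
		 blk_mq_unique_tag_to_tag(unique));
}
#endif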