/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#include <trace/events/block.h>

/*
 * Test patch to inline a certain number of bi_io_vec's inside the bio
 * itself, to shrink a bio data allocation from two mempool calls to one
 */
#define BIO_INLINE_VECS		4

/*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
 * unsigned short
 */
#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set *fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

/*
 * Our slab pool management
 */
struct bio_slab {
	struct kmem_cache *slab;
	unsigned int slab_ref;
	unsigned int slab_size;
	char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static struct bio_slab *bio_slabs;
static unsigned int bio_slab_nr, bio_slab_max;

static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
{
	unsigned int sz = sizeof(struct bio) + extra_size;
	struct kmem_cache *slab = NULL;
	struct bio_slab *bslab, *new_bio_slabs;
	unsigned int new_bio_slab_max;
	unsigned int i, entry = -1;

	mutex_lock(&bio_slab_lock);

	i = 0;
	while (i < bio_slab_nr) {
		bslab = &bio_slabs[i];

		if (!bslab->slab && entry == -1)
			entry = i;
		else if (bslab->slab_size == sz) {
			slab = bslab->slab;
			bslab->slab_ref++;
			break;
		}
		i++;
	}

	if (slab)
		goto out_unlock;

	if (bio_slab_nr == bio_slab_max && entry == -1) {
		new_bio_slab_max = bio_slab_max << 1;
		new_bio_slabs = krealloc(bio_slabs,
					 new_bio_slab_max * sizeof(struct bio_slab),
					 GFP_KERNEL);
		if (!new_bio_slabs)
			goto out_unlock;
		bio_slab_max = new_bio_slab_max;
		bio_slabs = new_bio_slabs;
	}
	if (entry == -1)
		entry = bio_slab_nr++;

	bslab = &bio_slabs[entry];

	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
				 SLAB_HWCACHE_ALIGN, NULL);
	if (!slab)
		goto out_unlock;

	bslab->slab = slab;
	bslab->slab_ref = 1;
	bslab->slab_size = sz;
out_unlock:
	mutex_unlock(&bio_slab_lock);
	return slab;
}

static void bio_put_slab(struct bio_set *bs)
{
	struct bio_slab *bslab = NULL;
	unsigned int i;

	mutex_lock(&bio_slab_lock);

	for (i = 0; i < bio_slab_nr; i++) {
		if (bs->bio_slab == bio_slabs[i].slab) {
			bslab = &bio_slabs[i];
			break;
		}
	}

	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
		goto out;

	WARN_ON(!bslab->slab_ref);

	if (--bslab->slab_ref)
		goto out;

	kmem_cache_destroy(bslab->slab);
	bslab->slab = NULL;

out:
	mutex_unlock(&bio_slab_lock);
}

unsigned int bvec_nr_vecs(unsigned short idx)
{
	return bvec_slabs[idx].nr_vecs;
}

void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
{
	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);

	if (idx == BIOVEC_MAX_IDX)
		mempool_free(bv, pool);
	else {
		struct biovec_slab *bvs = bvec_slabs + idx;

		kmem_cache_free(bvs->slab, bv);
	}
}

struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
			   mempool_t *pool)
{
	struct bio_vec *bvl;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
	case 1:
		*idx = 0;
		break;
	case 2 ... 4:
		*idx = 1;
		break;
	case 5 ... 16:
		*idx = 2;
		break;
	case 17 ... 64:
		*idx = 3;
		break;
	case 65 ... 128:
		*idx = 4;
		break;
	case 129 ... BIO_MAX_PAGES:
		*idx = 5;
		break;
	default:
		return NULL;
	}

	/*
	 * idx now points to the pool we want to allocate from. only the
	 * 1-vec entry pool is mempool backed.
	 */
	if (*idx == BIOVEC_MAX_IDX) {
fallback:
		bvl = mempool_alloc(pool, gfp_mask);
	} else {
		struct biovec_slab *bvs = bvec_slabs + *idx;
		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);

		/*
		 * Make this allocation restricted and don't dump info on
		 * allocation failures, since we'll fallback to the mempool
		 * in case of failure.
		 */
		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

		/*
		 * Try a slab allocation. If this fails and __GFP_WAIT
		 * is set, retry with the 1-entry mempool
		 */
		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
			*idx = BIOVEC_MAX_IDX;
			goto fallback;
		}
	}

	return bvl;
}

static void __bio_free(struct bio *bio)
{
	bio_disassociate_task(bio);

	if (bio_integrity(bio))
		bio_integrity_free(bio);
}

static void bio_free(struct bio *bio)
{
	struct bio_set *bs = bio->bi_pool;
	void *p;

	__bio_free(bio);

	if (bs) {
		if (bio_flagged(bio, BIO_OWNS_VEC))
			bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));

		/*
		 * If we have front padding, adjust the bio pointer before freeing
		 */
		p = bio;
		p -= bs->front_pad;

		mempool_free(p, bs->bio_pool);
	} else {
		/* Bio was allocated by bio_kmalloc() */
		kfree(bio);
	}
}

void bio_init(struct bio *bio)
{
	memset(bio, 0, sizeof(*bio));
	bio->bi_flags = 1 << BIO_UPTODATE;
	atomic_set(&bio->bi_remaining, 1);
	atomic_set(&bio->bi_cnt, 1);
}
EXPORT_SYMBOL(bio_init);

/**
 * bio_reset - reinitialize a bio
 * @bio:	bio to reset
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned by bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio)
{
	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);

	__bio_free(bio);

	memset(bio, 0, BIO_RESET_BYTES);
	bio->bi_flags = flags|(1 << BIO_UPTODATE);
	atomic_set(&bio->bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);

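/*
 * Example: a driver that embeds a bio in a longer-lived structure can
 * recycle it with bio_reset() instead of freeing and reallocating.  This is
 * only a usage sketch; the surrounding structure and names (my_request,
 * my_end_io) are hypothetical:
 *
 *	static void my_reuse_bio(struct my_request *rq, struct block_device *bdev,
 *				 sector_t sector)
 *	{
 *		bio_reset(&rq->bio);		// back to freshly-initialized state
 *		rq->bio.bi_bdev = bdev;		// re-target the recycled bio
 *		rq->bio.bi_iter.bi_sector = sector;
 *		rq->bio.bi_end_io = my_end_io;	// endio/private must be set again
 *		rq->bio.bi_private = rq;
 *	}
 */
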
static void bio_chain_endio(struct bio *bio, int error)
{
	bio_endio(bio->bi_private, error);
	bio_put(bio);
}

/**
 * bio_chain - chain bio completions
 * @bio: the target bio
 * @parent: the @bio's parent bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
 * completed; the chained bio will also be freed when it completes.
 *
 * The caller must not set bi_private or bi_end_io in @bio.
 */
void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private = parent;
	bio->bi_end_io = bio_chain_endio;
	atomic_inc(&parent->bi_remaining);
}
EXPORT_SYMBOL(bio_chain);

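/*
 * Example: duplicating a write to a second device and tying its completion
 * to the original bio.  A sketch only, assuming a caller-owned bio_set and
 * mirror target; the clone shares the parent's biovec, which is fine for a
 * write:
 *
 *	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);
 *
 *	clone->bi_bdev = mirror_bdev;
 *	bio_chain(clone, bio);		// bio's endio now also waits for clone
 *	generic_make_request(clone);
 *	generic_make_request(bio);
 *
 * The original bio's bi_end_io runs only after both submissions complete,
 * and the chained clone is freed automatically by bio_chain_endio().
 */
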
static void bio_alloc_rescue(struct work_struct *work)
{
	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
	struct bio *bio;

	while (1) {
		spin_lock(&bs->rescue_lock);
		bio = bio_list_pop(&bs->rescue_list);
		spin_unlock(&bs->rescue_lock);

		if (!bio)
			break;

		generic_make_request(bio);
	}
}

static void punt_bios_to_rescuer(struct bio_set *bs)
{
	struct bio_list punt, nopunt;
	struct bio *bio;

	/*
	 * In order to guarantee forward progress we must punt only bios that
	 * were allocated from this bio_set; otherwise, if there was a bio on
	 * there for a stacking driver higher up in the stack, processing it
	 * could require allocating bios from this bio_set, and doing that from
	 * our own rescuer would be bad.
	 *
	 * Since bio lists are singly linked, pop them all instead of trying to
	 * remove from the middle of the list:
	 */

	bio_list_init(&punt);
	bio_list_init(&nopunt);

	while ((bio = bio_list_pop(current->bio_list)))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);

	*current->bio_list = nopunt;

	spin_lock(&bs->rescue_lock);
	bio_list_merge(&bs->rescue_list, &punt);
	spin_unlock(&bs->rescue_lock);

	queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:   the GFP_ mask given to the slab allocator
 * @nr_iovecs:	number of iovecs to pre-allocate
 * @bs:		the bio_set to allocate from.
 *
 * Description:
 *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
 *   backed by the @bs's mempool.
 *
 *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
 *   able to allocate a bio. This is due to the mempool guarantees. To make this
 *   work, callers must never allocate more than 1 bio at a time from this pool.
 *   Callers that need to allocate more than 1 bio must always submit the
 *   previously allocated bio for IO before attempting to allocate a new one.
 *   Failure to do so can cause deadlocks under memory pressure.
 *
 *   Note that when running under generic_make_request() (i.e. any block
 *   driver), bios are not submitted until after you return - see the code in
 *   generic_make_request() that converts recursion into iteration, to prevent
 *   stack overflows.
 *
 *   This would normally mean allocating multiple bios under
 *   generic_make_request() would be susceptible to deadlocks, but we have
 *   deadlock avoidance code that resubmits any blocked bios from a rescuer
 *   thread.
 *
 *   However, we do not guarantee forward progress for allocations from other
 *   mempools. Doing multiple allocations from the same mempool under
 *   generic_make_request() should be avoided - instead, use bio_set's front_pad
 *   for per bio allocations.
 *
 *   RETURNS:
 *   Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	gfp_t saved_gfp = gfp_mask;
	unsigned front_pad;
	unsigned inline_vecs;
	unsigned long idx = BIO_POOL_NONE;
	struct bio_vec *bvl = NULL;
	struct bio *bio;
	void *p;

	if (!bs) {
		if (nr_iovecs > UIO_MAXIOV)
			return NULL;

		p = kmalloc(sizeof(struct bio) +
			    nr_iovecs * sizeof(struct bio_vec),
			    gfp_mask);
		front_pad = 0;
		inline_vecs = nr_iovecs;
	} else {
		/* should not use nobvec bioset for nr_iovecs > 0 */
		if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
			return NULL;
		/*
		 * generic_make_request() converts recursion to iteration; this
		 * means if we're running beneath it, any bios we allocate and
		 * submit will not be submitted (and thus freed) until after we
		 * return.
		 *
		 * This exposes us to a potential deadlock if we allocate
		 * multiple bios from the same bio_set() while running
		 * underneath generic_make_request(). If we were to allocate
		 * multiple bios (say a stacking block driver that was splitting
		 * bios), we would deadlock if we exhausted the mempool's
		 * reserve.
		 *
		 * We solve this, and guarantee forward progress, with a rescuer
		 * workqueue per bio_set. If we go to allocate and there are
		 * bios on current->bio_list, we first try the allocation
		 * without __GFP_WAIT; if that fails, we punt those bios we
		 * would be blocking to the rescuer workqueue before we retry
		 * with the original gfp_flags.
		 */

		if (current->bio_list && !bio_list_empty(current->bio_list))
			gfp_mask &= ~__GFP_WAIT;

		p = mempool_alloc(bs->bio_pool, gfp_mask);
		if (!p && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			p = mempool_alloc(bs->bio_pool, gfp_mask);
		}

		front_pad = bs->front_pad;
		inline_vecs = BIO_INLINE_VECS;
	}

	if (unlikely(!p))
		return NULL;

	bio = p + front_pad;
	bio_init(bio);

	if (nr_iovecs > inline_vecs) {
		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
		if (!bvl && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
		}

		if (unlikely(!bvl))
			goto err_free;

		bio->bi_flags |= 1 << BIO_OWNS_VEC;
	} else if (nr_iovecs) {
		bvl = bio->bi_inline_vecs;
	}

	bio->bi_pool = bs;
	bio->bi_flags |= idx << BIO_POOL_OFFSET;
	bio->bi_max_vecs = nr_iovecs;
	bio->bi_io_vec = bvl;
	return bio;

err_free:
	mempool_free(p, bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);

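/*
 * Example: allocating a bio from a private bio_set with __GFP_WAIT set, which
 * is guaranteed to succeed as long as only one unsubmitted bio is held at a
 * time.  A usage sketch only; "my_bio_set" is an illustrative, caller-owned
 * bio_set:
 *
 *	struct bio *bio;
 *
 *	bio = bio_alloc_bioset(GFP_NOIO, nr_pages, my_bio_set);
 *	// GFP_NOIO includes __GFP_WAIT, so with a bio_set this cannot fail
 *	bio->bi_bdev = bdev;
 *	bio->bi_iter.bi_sector = sector;
 *	...add pages, set bi_end_io and bi_private...
 *	submit_bio(WRITE, bio);		// submit before allocating the next bio
 */
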
void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_segment(bv, bio, iter) {
		char *data = bvec_kmap_irq(&bv, &flags);
		memset(data, 0, bv.bv_len);
		flush_dcache_page(bv.bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio: bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt))
		bio_free(bio);
}
EXPORT_SYMBOL(bio_put);

inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}
EXPORT_SYMBOL(bio_phys_segments);

/**
 * __bio_clone_fast - clone a bio that shares the original bio's biovec
 * @bio: destination bio
 * @bio_src: bio to clone
 *
 * Clone a &bio. Caller will own the returned bio, but not
 * the actual data it points to. Reference count of returned
 * bio will be one.
 *
 * Caller must ensure that @bio_src is not freed before @bio.
 */
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
	BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);

	/*
	 * most users will be overriding ->bi_bdev with a new target,
	 * so we don't set nor calculate new physical/hw segment counts here
	 */
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_iter = bio_src->bi_iter;
	bio->bi_io_vec = bio_src->bi_io_vec;
}
EXPORT_SYMBOL(__bio_clone_fast);

/**
 * bio_clone_fast - clone a bio that shares the original bio's biovec
 * @bio: bio to clone
 * @gfp_mask: allocation priority
 * @bs: bio_set to allocate from
 *
 * Like __bio_clone_fast, only also allocates the returned bio
 */
struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
{
	struct bio *b;

	b = bio_alloc_bioset(gfp_mask, 0, bs);
	if (!b)
		return NULL;

	__bio_clone_fast(b, bio);

	if (bio_integrity(bio)) {
		int ret;

		ret = bio_integrity_clone(b, bio, gfp_mask);

		if (ret < 0) {
			bio_put(b);
			return NULL;
		}
	}

	return b;
}
EXPORT_SYMBOL(bio_clone_fast);

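/*
 * Example: a stacking driver remapping a bio to a lower device.  The clone
 * shares the original bio's biovec, so it is cheap; only the target and the
 * starting sector are changed before resubmission.  A sketch, assuming a
 * driver-private bio_set and remapping rule (names are illustrative):
 *
 *	static void my_remap_and_submit(struct bio *bio, struct bio_set *bs,
 *					struct block_device *lower_bdev,
 *					sector_t offset)
 *	{
 *		struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);
 *
 *		clone->bi_bdev = lower_bdev;
 *		clone->bi_iter.bi_sector += offset;
 *		clone->bi_end_io = my_clone_endio;	// completes bio from there
 *		clone->bi_private = bio;
 *		generic_make_request(clone);
 *	}
 */
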
/**
 * bio_clone_bioset - clone a bio
 * @bio_src: bio to clone
 * @gfp_mask: allocation priority
 * @bs: bio_set to allocate from
 *
 * Clone bio. Caller will own the returned bio, but not the actual data it
 * points to. Reference count of returned bio will be one.
 */
struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
			     struct bio_set *bs)
{
	struct bvec_iter iter;
	struct bio_vec bv;
	struct bio *bio;

	/*
	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
	 * bio_src->bi_io_vec to bio->bi_io_vec.
	 *
	 * We can't do that anymore, because:
	 *
	 *  - The point of cloning the biovec is to produce a bio with a biovec
	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
	 *
	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
	 *    But the clone should succeed as long as the number of biovecs we
	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
	 *
	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
	 *    that does not own the bio - reason being drivers don't use it for
	 *    iterating over the biovec anymore, so expecting it to be kept up
	 *    to date (i.e. for clones that share the parent biovec) is just
	 *    asking for trouble and would force extra work on
	 *    __bio_clone_fast() anyways.
	 */

	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
	if (!bio)
		return NULL;

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
	bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;

	if (bio->bi_rw & REQ_DISCARD)
		goto integrity_clone;

	if (bio->bi_rw & REQ_WRITE_SAME) {
		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
		goto integrity_clone;
	}

	bio_for_each_segment(bv, bio_src, iter)
		bio->bi_io_vec[bio->bi_vcnt++] = bv;

integrity_clone:
	if (bio_integrity(bio_src)) {
		int ret;

		ret = bio_integrity_clone(bio, bio_src, gfp_mask);
		if (ret < 0) {
			bio_put(bio);
			return NULL;
		}
	}

	return bio;
}
EXPORT_SYMBOL(bio_clone_bioset);

/**
 * bio_get_nr_vecs - return approx number of vecs
 * @bdev: I/O target
 *
 * Return the approximate number of pages we can send to this target.
 * There's no guarantee that you will be able to fit this number of pages
 * into a bio, it does not account for dynamic restrictions that vary
 * on offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = min_t(unsigned,
		     queue_max_segments(q),
		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);

	return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
}
EXPORT_SYMBOL(bio_get_nr_vecs);

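/*
 * Example: sizing a bio from the queue limits before filling it page by page.
 * bio_get_nr_vecs() is only an upper-bound hint, so bio_add_page() failure
 * still has to be handled.  Illustrative sketch:
 *
 *	int nr = min_t(int, bio_get_nr_vecs(bdev), pages_wanted);
 *	struct bio *bio = bio_alloc(GFP_NOIO, nr);
 *
 *	bio->bi_bdev = bdev;
 *	bio->bi_iter.bi_sector = sector;
 *	while (pages_left && bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE)
 *		...advance to the next page...
 */
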
static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
			  unsigned int max_sectors)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset.  Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			unsigned int prev_bv_len = prev->bv_len;
			prev->bv_len += len;

			if (q->merge_bvec_fn) {
				struct bvec_merge_data bvm = {
					/* prev_bvec is already charged in
					   bi_size, discharge it in order to
					   simulate merging updated prev_bvec
					   as new bvec. */
					.bi_bdev = bio->bi_bdev,
					.bi_sector = bio->bi_iter.bi_sector,
					.bi_size = bio->bi_iter.bi_size -
						prev_bv_len,
					.bi_rw = bio->bi_rw,
				};

				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
					prev->bv_len -= len;
					return 0;
				}
			}

			goto done;
		}

		/*
		 * If the queue doesn't support SG gaps and adding this
		 * offset would create a gap, disallow it.
		 */
		if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
		    bvec_gap_to_prev(prev, offset))
			return 0;
	}

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= queue_max_segments(q)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		struct bvec_merge_data bvm = {
			.bi_bdev = bio->bi_bdev,
			.bi_sector = bio->bi_iter.bi_sector,
			.bi_size = bio->bi_iter.bi_size,
			.bi_rw = bio->bi_rw,
		};

		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
done:
	bio->bi_iter.bi_size += len;
	return len;
}

/**
 * bio_add_pc_page - attempt to add page to bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 *
 * This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
		    unsigned int len, unsigned int offset)
{
	return __bio_add_page(q, bio, page, len, offset,
			      queue_max_hw_sectors(q));
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * bio_add_page - attempt to add page to bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	unsigned int max_sectors;

	max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
	if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size)
		max_sectors = len >> 9;

	return __bio_add_page(q, bio, page, len, offset, max_sectors);
}
EXPORT_SYMBOL(bio_add_page);

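/*
 * Example: filling a bio one page at a time and stopping when the device
 * refuses more.  bio_add_page() returns the number of bytes actually added,
 * so a short return means "submit what you have and start a new bio".
 * Illustrative sketch:
 *
 *	for (i = 0; i < nr_pages; i++) {
 *		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) < PAGE_SIZE) {
 *			submit_bio(rw, bio);	// bio is full or hit a limit
 *			bio = bio_alloc(GFP_NOIO, nr_pages - i);
 *			bio->bi_bdev = bdev;
 *			bio->bi_iter.bi_sector = next_sector;
 *			i--;			// retry this page in the new bio
 *		}
 *	}
 */
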
struct submit_bio_ret {
	struct completion event;
	int error;
};

static void submit_bio_wait_endio(struct bio *bio, int error)
{
	struct submit_bio_ret *ret = bio->bi_private;

	ret->error = error;
	complete(&ret->event);
}

/**
 * submit_bio_wait - submit a bio, and wait until it completes
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
 * @bio: The &struct bio which describes the I/O
 *
 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
 * bio_endio() on failure.
 */
int submit_bio_wait(int rw, struct bio *bio)
{
	struct submit_bio_ret ret;

	rw |= REQ_SYNC;
	init_completion(&ret.event);
	bio->bi_private = &ret;
	bio->bi_end_io = submit_bio_wait_endio;
	submit_bio(rw, bio);
	wait_for_completion(&ret.event);

	return ret.error;
}
EXPORT_SYMBOL(submit_bio_wait);

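/*
 * Example: a synchronous single-page read, as a caller might issue it from
 * process context.  A sketch only; error handling beyond the return value is
 * omitted:
 *
 *	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
 *	int err;
 *
 *	bio->bi_bdev = bdev;
 *	bio->bi_iter.bi_sector = sector;
 *	bio_add_page(bio, page, PAGE_SIZE, 0);
 *	err = submit_bio_wait(READ, bio);	// sleeps until completion
 *	bio_put(bio);
 */
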
/**
 * bio_advance - increment/complete a bio by some number of bytes
 * @bio:	bio to advance
 * @bytes:	number of bytes to complete
 *
 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
 * be updated on the last bvec as well.
 *
 * @bio will then represent the remaining, uncompleted portion of the io.
 */
void bio_advance(struct bio *bio, unsigned bytes)
{
	if (bio_integrity(bio))
		bio_integrity_advance(bio, bytes);

	bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);

/**
 * bio_alloc_pages - allocates a single page for each bvec in a bio
 * @bio: bio to allocate pages for
 * @gfp_mask: flags for allocation
 *
 * Allocates pages up to @bio->bi_vcnt.
 *
 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
 * freed.
 */
int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
	int i;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, i) {
		bv->bv_page = alloc_page(gfp_mask);
		if (!bv->bv_page) {
			while (--bv >= bio->bi_io_vec)
				__free_page(bv->bv_page);
			return -ENOMEM;
		}
	}

	return 0;
}
EXPORT_SYMBOL(bio_alloc_pages);

/**
 * bio_copy_data - copy contents of data buffers from one chain of bios to
 * another
 * @src: source bio list
 * @dst: destination bio list
 *
 * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
 * @src and @dst as linked lists of bios.
 *
 * Stops when it reaches the end of either @src or @dst - that is, copies
 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
 */
void bio_copy_data(struct bio *dst, struct bio *src)
{
	struct bvec_iter src_iter, dst_iter;
	struct bio_vec src_bv, dst_bv;
	void *src_p, *dst_p;
	unsigned bytes;

	src_iter = src->bi_iter;
	dst_iter = dst->bi_iter;

	while (1) {
		if (!src_iter.bi_size) {
			src = src->bi_next;
			if (!src)
				break;

			src_iter = src->bi_iter;
		}

		if (!dst_iter.bi_size) {
			dst = dst->bi_next;
			if (!dst)
				break;

			dst_iter = dst->bi_iter;
		}

		src_bv = bio_iter_iovec(src, src_iter);
		dst_bv = bio_iter_iovec(dst, dst_iter);

		bytes = min(src_bv.bv_len, dst_bv.bv_len);

		src_p = kmap_atomic(src_bv.bv_page);
		dst_p = kmap_atomic(dst_bv.bv_page);

		memcpy(dst_p + dst_bv.bv_offset,
		       src_p + src_bv.bv_offset,
		       bytes);

		kunmap_atomic(dst_p);
		kunmap_atomic(src_p);

		bio_advance_iter(src, &src_iter, bytes);
		bio_advance_iter(dst, &dst_iter, bytes);
	}
}
EXPORT_SYMBOL(bio_copy_data);

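/*
 * Example: bouncing a write through freshly allocated pages, the way a
 * stacking driver might when it needs a stable copy of the data.  A sketch,
 * assuming a driver-private bio_set with a bvec pool:
 *
 *	struct bio *bounce = bio_clone_bioset(orig, GFP_NOIO, bs);
 *
 *	if (bio_alloc_pages(bounce, GFP_NOIO)) {
 *		bio_put(bounce);
 *		return -ENOMEM;
 *	}
 *	bio_copy_data(bounce, orig);	// copy payload into the new pages
 *	bounce->bi_end_io = my_bounce_endio;
 *	bounce->bi_private = orig;
 *	generic_make_request(bounce);
 */
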
struct bio_map_data {
	int nr_sgvecs;
	int is_our_pages;
	struct sg_iovec sgvecs[];
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
			     const struct sg_iovec *iov, int iov_count,
			     int is_our_pages)
{
	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
	bmd->nr_sgvecs = iov_count;
	bmd->is_our_pages = is_our_pages;
	bio->bi_private = bmd;
}

static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
					       gfp_t gfp_mask)
{
	if (iov_count > UIO_MAXIOV)
		return NULL;

	return kmalloc(sizeof(struct bio_map_data) +
		       sizeof(struct sg_iovec) * iov_count, gfp_mask);
}

static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
			  int to_user, int from_user, int do_free_page)
{
	int ret = 0, i;
	struct bio_vec *bvec;
	int iov_idx = 0;
	unsigned int iov_off = 0;

	bio_for_each_segment_all(bvec, bio, i) {
		char *bv_addr = page_address(bvec->bv_page);
		unsigned int bv_len = bvec->bv_len;

		while (bv_len && iov_idx < iov_count) {
			unsigned int bytes;
			char __user *iov_addr;

			bytes = min_t(unsigned int,
				      iov[iov_idx].iov_len - iov_off, bv_len);
			iov_addr = iov[iov_idx].iov_base + iov_off;

			if (!ret) {
				if (to_user)
					ret = copy_to_user(iov_addr, bv_addr,
							   bytes);

				if (from_user)
					ret = copy_from_user(bv_addr, iov_addr,
							     bytes);

				if (ret)
					ret = -EFAULT;
			}

			bv_len -= bytes;
			bv_addr += bytes;
			iov_addr += bytes;
			iov_off += bytes;

			if (iov[iov_idx].iov_len == iov_off) {
				iov_idx++;
				iov_off = 0;
			}
		}

		if (do_free_page)
			__free_page(bvec->bv_page);
	}

	return ret;
}

/**
 * bio_uncopy_user - finish previously mapped bio
 * @bio: bio being terminated
 *
 * Free pages allocated from bio_copy_user() and write back data
 * to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	struct bio_vec *bvec;
	int ret = 0, i;

	if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
		/*
		 * if we're in a workqueue, the request is orphaned, so
		 * don't copy into a random user address space, just free.
		 */
		if (current->mm)
			ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
					     bio_data_dir(bio) == READ,
					     0, bmd->is_our_pages);
		else if (bmd->is_our_pages)
			bio_for_each_segment_all(bvec, bio, i)
				__free_page(bvec->bv_page);
	}
	kfree(bmd);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(bio_uncopy_user);

/**
 * bio_copy_user_iov - copy user data to bio
 * @q:		destination block queue
 * @map_data:	pointer to the rq_map_data holding pages (if necessary)
 * @iov:	the iovec.
 * @iov_count:	number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask:	memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user_iov(struct request_queue *q,
			      struct rq_map_data *map_data,
			      const struct sg_iovec *iov, int iov_count,
			      int write_to_vm, gfp_t gfp_mask)
{
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;
	int nr_pages = 0;
	unsigned int len = 0;
	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr;
		unsigned long end;
		unsigned long start;

		uaddr = (unsigned long)iov[i].iov_base;
		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = uaddr >> PAGE_SHIFT;

		/*
		 * Overflow, abort
		 */
		if (end < start)
			return ERR_PTR(-EINVAL);

		nr_pages += end - start;
		len += iov[i].iov_len;
	}

	if (offset)
		nr_pages++;

	bmd = bio_alloc_map_data(iov_count, gfp_mask);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		goto out_bmd;

	if (!write_to_vm)
		bio->bi_rw |= REQ_WRITE;

	ret = 0;

	if (map_data) {
		nr_pages = 1 << map_data->page_order;
		i = map_data->offset / PAGE_SIZE;
	}
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		bytes -= offset;

		if (bytes > len)
			bytes = len;

		if (map_data) {
			if (i == map_data->nr_entries * nr_pages) {
				ret = -ENOMEM;
				break;
			}

			page = map_data->pages[i / nr_pages];
			page += (i % nr_pages);

			i++;
		} else {
			page = alloc_page(q->bounce_gfp | gfp_mask);
			if (!page) {
				ret = -ENOMEM;
				break;
			}
		}

		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
			break;

		len -= bytes;
		offset = 0;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
	    (map_data && map_data->from_user)) {
		ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
		if (ret)
			goto cleanup;
	}

	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
	return bio;
cleanup:
	if (!map_data)
		bio_for_each_segment_all(bvec, bio, i)
			__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	kfree(bmd);
	return ERR_PTR(ret);
}

/**
 * bio_copy_user - copy user data to bio
 * @q: destination block queue
 * @map_data: pointer to the rq_map_data holding pages (if necessary)
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Prepares and returns a bio for indirect user io, bouncing data
 * to/from kernel pages as necessary. Must be paired with a call to
 * bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
			  unsigned long uaddr, unsigned int len,
			  int write_to_vm, gfp_t gfp_mask)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_copy_user);

static struct bio *__bio_map_user_iov(struct request_queue *q,
				      struct block_device *bdev,
				      const struct sg_iovec *iov, int iov_count,
				      int write_to_vm, gfp_t gfp_mask)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		/*
		 * Overflow, abort
		 */
		if (end < start)
			return ERR_PTR(-EINVAL);

		nr_pages += end - start;
		/*
		 * buffer must be aligned to at least hardsector size for now
		 */
		if (uaddr & queue_dma_alignment(q))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
	if (!pages)
		goto out;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		ret = get_user_pages_fast(uaddr, local_nr_pages,
					  write_to_vm, &pages[cur_page]);
		if (ret < local_nr_pages) {
			ret = -EFAULT;
			goto out_unmap;
		}

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
					    bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= REQ_WRITE;

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if (!pages[i])
			break;
		page_cache_release(pages[i]);
	}
out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 * bio_map_user - map user address into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @uaddr: start of user address
 * @len: length in bytes
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm,
			 gfp_t gfp_mask)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
EXPORT_SYMBOL(bio_map_user);

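/*
 * Example: zero-copy mapping of a user buffer for a pass-through style
 * command, paired with bio_unmap_user() once the I/O has completed.  A
 * sketch only; the caller is expected to run in process context:
 *
 *	struct bio *bio;
 *
 *	bio = bio_map_user(q, bdev, uaddr, len, 1, GFP_KERNEL);
 *	// write_to_vm == 1: the device writes into the user pages (a read)
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	...attach to a request, submit and wait for completion...
 *	bio_unmap_user(bio);		// dirties and releases the user pages
 */
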
/**
 * bio_map_user_iov - map user sg_iovec table into bio
 * @q: the struct request_queue for the bio
 * @bdev: destination block device
 * @iov: the iovec.
 * @iov_count: number of elements in the iovec
 * @write_to_vm: bool indicating writing to pages or not
 * @gfp_mask: memory allocation flags
 *
 * Map the user space address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
			     const struct sg_iovec *iov, int iov_count,
			     int write_to_vm, gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
				 gfp_mask);
	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	return bio;
}

static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	bio_for_each_segment_all(bvec, bio, i) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 * bio_unmap_user - unmap a bio
 * @bio: the bio being unmapped
 *
 * Unmap a bio previously mapped by bio_map_user(). Must be called with
 * a process context.
 *
 * bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}
EXPORT_SYMBOL(bio_unmap_user);

static void bio_map_kern_endio(struct bio *bio, int err)
{
	bio_put(bio);
}

static struct bio *__bio_map_kern(struct request_queue *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 * bio_map_kern - map kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to map
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio allocation
 *
 * Map the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_iter.bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL(bio_map_kern);

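/*
 * Example: mapping a kernel buffer for a driver-internal command.  Partial
 * mappings are rejected, so the caller only has to check for an error
 * pointer.  Illustrative sketch:
 *
 *	struct bio *bio = bio_map_kern(q, buf, buf_len, GFP_KERNEL);
 *
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	// buf must come from the kernel linear mapping (kmalloc(), not
 *	// vmalloc()), since __bio_map_kern() uses virt_to_page() on it.
 *	...attach the bio to a pass-through request and execute it...
 */
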
static void bio_copy_kern_endio(struct bio *bio, int err)
{
	struct bio_vec *bvec;
	const int read = bio_data_dir(bio) == READ;
	struct bio_map_data *bmd = bio->bi_private;
	int i;
	char *p = bmd->sgvecs[0].iov_base;

	bio_for_each_segment_all(bvec, bio, i) {
		char *addr = page_address(bvec->bv_page);

		if (read)
			memcpy(p, addr, bvec->bv_len);

		__free_page(bvec->bv_page);
		p += bvec->bv_len;
	}

	kfree(bmd);
	bio_put(bio);
}

/**
 * bio_copy_kern - copy kernel address into bio
 * @q: the struct request_queue for the bio
 * @data: pointer to buffer to copy
 * @len: length in bytes
 * @gfp_mask: allocation flags for bio and page allocation
 * @reading: data direction is READ
 *
 * copy the kernel address into a bio suitable for io to a block
 * device. Returns an error pointer in case of error.
 */
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
			  gfp_t gfp_mask, int reading)
{
	struct bio *bio;
	struct bio_vec *bvec;
	int i;

	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (!reading) {
		void *p = data;

		bio_for_each_segment_all(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			memcpy(addr, p, bvec->bv_len);
			p += bvec->bv_len;
		}
	}

	bio->bi_end_io = bio_copy_kern_endio;

	return bio;
}
EXPORT_SYMBOL(bio_copy_kern);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.  If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

static void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec;
	int nr_clean_pages = 0;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec->bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}

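/*
 * Example: the direct-IO pattern the two helpers above are meant for, shown
 * as a sketch.  The pages are dirtied before submission, and the completion
 * handler of a READ re-checks them, deferring to process context only for
 * pages the VM cleaned in the meantime:
 *
 *	// before submitting a direct-IO READ into pinned user pages
 *	bio_set_pages_dirty(bio);
 *	submit_bio(READ, bio);
 *
 *	// in the bi_end_io handler of that READ
 *	bio_check_pages_dirty(bio);	// takes over the bio and its pages
 */
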
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi)
{
	struct bio_vec bvec;
	struct bvec_iter iter;

	bio_for_each_segment(bvec, bi, iter)
		flush_dcache_page(bvec.bv_page);
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif

/**
 * bio_endio - end I/O on a bio
 * @bio:	bio
 * @error:	error, if any
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the
 *   preferred way to end I/O on a bio, it takes care of clearing
 *   BIO_UPTODATE on error. @error is 0 on success, and one of the
 *   established -Exxxx (-EIO, for instance) error values in case
 *   something went wrong. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io
 *   function.
 **/
void bio_endio(struct bio *bio, int error)
{
	while (bio) {
		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);

		if (error)
			clear_bit(BIO_UPTODATE, &bio->bi_flags);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			error = -EIO;

		if (!atomic_dec_and_test(&bio->bi_remaining))
			return;

		/*
		 * Need to have a real endio function for chained bios,
		 * otherwise various corner cases will break (like stacking
		 * block devices that save/restore bi_end_io) - however, we want
		 * to avoid unbounded recursion and blowing the stack. Tail call
		 * optimization would handle this, but compiling with frame
		 * pointers also disables gcc's sibling call optimization.
		 */
		if (bio->bi_end_io == bio_chain_endio) {
			struct bio *parent = bio->bi_private;
			bio_put(bio);
			bio = parent;
		} else {
			if (bio->bi_end_io)
				bio->bi_end_io(bio, error);
			bio = NULL;
		}
	}
}
EXPORT_SYMBOL(bio_endio);

/**
 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
 * @bio:	bio
 * @error:	error, if any
 *
 * For code that has saved and restored bi_end_io; think hard before using this
 * function, probably you should've cloned the entire bio.
 **/
void bio_endio_nodec(struct bio *bio, int error)
{
	atomic_inc(&bio->bi_remaining);
	bio_endio(bio, error);
}
EXPORT_SYMBOL(bio_endio_nodec);

/**
 * bio_split - split a bio
 * @bio:	bio to split
 * @sectors:	number of sectors to split from the front of @bio
 * @gfp:	gfp mask
 * @bs:		bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors.
 *
 * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
 * responsibility to ensure that @bio is not freed before the split.
 */
struct bio *bio_split(struct bio *bio, int sectors,
		      gfp_t gfp, struct bio_set *bs)
{
	struct bio *split = NULL;

	BUG_ON(sectors <= 0);
	BUG_ON(sectors >= bio_sectors(bio));

	split = bio_clone_fast(bio, gfp, bs);
	if (!split)
		return NULL;

	split->bi_iter.bi_size = sectors << 9;

	if (bio_integrity(split))
		bio_integrity_trim(split, 0, sectors);

	bio_advance(bio, split->bi_iter.bi_size);

	return split;
}
EXPORT_SYMBOL(bio_split);

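/*
 * Example: a stacking driver splitting a bio that crosses one of its
 * internal boundaries, then chaining the pieces so the original completes
 * only once both halves are done.  A sketch, assuming the driver knows how
 * many sectors fit before the boundary and has its own bio_set:
 *
 *	if (sectors < bio_sectors(bio)) {
 *		struct bio *split = bio_split(bio, sectors, GFP_NOIO, bs);
 *
 *		bio_chain(split, bio);		// bio now holds the remainder
 *		generic_make_request(split);
 *	}
 *	generic_make_request(bio);
 */
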
/**
 * bio_trim - trim a bio
 * @bio:	bio to trim
 * @offset:	number of sectors to trim from the front of @bio
 * @size:	size we want to trim @bio to, in sectors
 */
void bio_trim(struct bio *bio, int offset, int size)
{
	/* 'bio' is a cloned bio which we need to trim to match
	 * the given offset and size.
	 */

	size <<= 9;
	if (offset == 0 && size == bio->bi_iter.bi_size)
		return;

	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

	bio_advance(bio, offset << 9);

	bio->bi_iter.bi_size = size;
}
EXPORT_SYMBOL_GPL(bio_trim);

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
mempool_t *biovec_create_pool(int pool_entries)
{
	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;

	return mempool_create_slab_pool(pool_entries, bp->slab);
}

void bioset_free(struct bio_set *bs)
{
	if (bs->rescue_workqueue)
		destroy_workqueue(bs->rescue_workqueue);

	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	if (bs->bvec_pool)
		mempool_destroy(bs->bvec_pool);

	bioset_integrity_free(bs);
	bio_put_slab(bs);

	kfree(bs);
}
EXPORT_SYMBOL(bioset_free);

static struct bio_set *__bioset_create(unsigned int pool_size,
				       unsigned int front_pad,
				       bool create_bvec_pool)
{
	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
	struct bio_set *bs;

	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
	if (!bs)
		return NULL;

	bs->front_pad = front_pad;

	spin_lock_init(&bs->rescue_lock);
	bio_list_init(&bs->rescue_list);
	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);

	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
	if (!bs->bio_slab) {
		kfree(bs);
		return NULL;
	}

	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
	if (!bs->bio_pool)
		goto bad;

	if (create_bvec_pool) {
		bs->bvec_pool = biovec_create_pool(pool_size);
		if (!bs->bvec_pool)
			goto bad;
	}

	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
	if (!bs->rescue_workqueue)
		goto bad;

	return bs;
bad:
	bioset_free(bs);
	return NULL;
}

/**
 * bioset_create  - Create a bio_set
 * @pool_size:	Number of bio and bio_vecs to cache in the mempool
 * @front_pad:	Number of bytes to allocate in front of the returned bio
 *
 * Description:
 *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
 *    to ask for a number of bytes to be allocated in front of the bio.
 *    Front pad allocation is useful for embedding the bio inside
 *    another structure, to avoid allocating extra data to go with the bio.
 *    Note that the bio must be embedded at the END of that structure always,
 *    or things will break badly.
 */
struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
{
	return __bioset_create(pool_size, front_pad, true);
}
EXPORT_SYMBOL(bioset_create);

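/*
 * Example: using front_pad to embed a bio at the end of a per-I/O structure,
 * so a driver gets its private context allocated alongside every bio.  A
 * sketch; struct my_io and my_bio_set are illustrative:
 *
 *	struct my_io {
 *		void		*private_data;
 *		struct bio	bio;		// must be the last member
 *	};
 *
 *	my_bio_set = bioset_create(BIO_POOL_SIZE,
 *				   offsetof(struct my_io, bio));
 *
 *	// later, per I/O:
 *	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bio_set);
 *	struct my_io *io = container_of(bio, struct my_io, bio);
 */
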
/**
 * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
 * @pool_size:	Number of bio to cache in the mempool
 * @front_pad:	Number of bytes to allocate in front of the returned bio
 *
 * Description:
 *    Same functionality as bioset_create() except that mempool is not
 *    created for bio_vecs. Saving some memory for bio_clone_fast() users.
 */
struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad)
{
	return __bioset_create(pool_size, front_pad, false);
}
EXPORT_SYMBOL(bioset_create_nobvec);

#ifdef CONFIG_BLK_CGROUP
/**
 * bio_associate_current - associate a bio with %current
 * @bio: target bio
 *
 * Associate @bio with %current if it hasn't been associated yet.  Block
 * layer will treat @bio as if it were issued by %current no matter which
 * task actually issues it.
 *
 * This function takes an extra reference of @task's io_context and blkcg
 * which will be put when @bio is released.  The caller must own @bio,
 * ensure %current->io_context exists, and is responsible for synchronizing
 * calls to this function.
 */
int bio_associate_current(struct bio *bio)
{
	struct io_context *ioc;
	struct cgroup_subsys_state *css;

	if (bio->bi_ioc)
		return -EBUSY;

	ioc = current->io_context;
	if (!ioc)
		return -ENOENT;

	/* acquire active ref on @ioc and associate */
	get_io_context_active(ioc);
	bio->bi_ioc = ioc;

	/* associate blkcg if exists */
	rcu_read_lock();
	css = task_css(current, blkio_cgrp_id);
	if (css && css_tryget_online(css))
		bio->bi_css = css;
	rcu_read_unlock();

	return 0;
}

/**
 * bio_disassociate_task - undo bio_associate_current()
 * @bio: target bio
 */
void bio_disassociate_task(struct bio *bio)
{
	if (bio->bi_ioc) {
		put_io_context(bio->bi_ioc);
		bio->bi_ioc = NULL;
	}
	if (bio->bi_css) {
		css_put(bio->bi_css);
		bio->bi_css = NULL;
	}
}

#endif /* CONFIG_BLK_CGROUP */

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
			bvs->slab = NULL;
			continue;
		}

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
	}
}

static int __init init_bio(void)
{
	bio_slab_max = 2;
	bio_slab_nr = 0;
	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
	if (!bio_slabs)
		panic("bio: can't allocate bios\n");

	bio_integrity_init();
	biovec_init_slabs();

	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
		panic("bio: can't create integrity pool\n");

	return 0;
}
subsys_initcall(init_bio);