/*
 * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * O_DIRECT
 *
 * 04Jul2002	Andrew Morton
 *		Initial version
 * 11Sep2002	janetinc@us.ibm.com
 *		added readv/writev support.
 * 29Oct2002	Andrew Morton
 *		rewrote bio_add_page() support.
 * 30Oct2002	pbadari@us.ibm.com
 *		added support for non-aligned IO.
 * 06Nov2002	pbadari@us.ibm.com
 *		added asynchronous IO support.
 * 21Jul2003	nathans@sgi.com
 *		added IO completion notifier.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>

/*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of a structure on the stack.
 */
#define DIO_PAGES	64

/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
 * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
 * to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 */

struct dio {
	/* BIO submission state */
	struct bio *bio;		/* bio under assembly */
	struct inode *inode;
	int rw;
	loff_t i_size;			/* i_size when submitted */
	int flags;			/* doesn't change */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	size_t size;			/* total request size (doesn't change)*/
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	sector_t final_block_in_request;/* doesn't change */
	unsigned first_block_in_page;	/* doesn't change, Used only once */
	int boundary;			/* prev block is at a boundary */
	int reap_counter;		/* rate limit reaping */
	get_block_t *get_block;		/* block mapping function */
	dio_iodone_t *end_io;		/* IO completion function */
	dio_submit_t *submit_io;	/* IO submission function */
	loff_t logical_offset_in_bio;	/* current first logical block in bio */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */
	struct buffer_head map_bh;	/* last get_block() result */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
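	 *
	 * submit_page_section() first tries to extend cur_page with each new
	 * chunk; only when it cannot is the old cur_page sent to a bio via
	 * dio_send_cur_page() and replaced by the new page.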
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */
	loff_t cur_page_fs_offset;	/* Offset in file */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	int is_async;			/* is IO async ? */
	int io_error;			/* IO error in completion path */
	ssize_t result;			/* IO result */

	/*
	 * Page fetching state.  These variables belong to dio_refill_pages().
	 */
	int curr_page;			/* changes */
	int total_pages;		/* doesn't change */
	unsigned long curr_user_address;/* changes */

	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	int page_errors;		/* errno from get_user_pages() */

	/*
	 * pages[] (and any fields placed after it) are not zeroed out at
	 * allocation time.  Don't add new fields after pages[] unless you
	 * wish that they not be zeroed.
	 */
	struct page *pages[DIO_PAGES];	/* page buffer */
};

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio *dio)
{
	return dio->tail - dio->head;
}

/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static int dio_refill_pages(struct dio *dio)
{
	int ret;
	int nr_pages;

	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
	ret = get_user_pages_fast(
		dio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		&dio->pages[0]);		/* Put results here */

	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
		struct page *page = ZERO_PAGE(0);
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		page_cache_get(page);
		dio->pages[0] = page;
		dio->head = 0;
		dio->tail = 1;
		ret = 0;
		goto out;
	}

	if (ret >= 0) {
		dio->curr_user_address += ret * PAGE_SIZE;
		dio->curr_page += ret;
		dio->head = 0;
		dio->tail = ret;
		ret = 0;
	}
out:
	return ret;
}

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently, to make better use of the
 * L1 cache.
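 *
 * For example, on 4kB pages a 256kB request needs DIO_PAGES (64) pages,
 * which dio_refill_pages() pins with a single get_user_pages_fast() call.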
 */
static struct page *dio_get_page(struct dio *dio)
{
	if (dio_pages_present(dio) == 0) {
		int ret;

		ret = dio_refill_pages(dio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(dio) == 0);
	}
	return dio->pages[dio->head++];
}

/**
 * dio_complete() - called when all DIO BIO I/O has been completed
 * @offset: the byte offset in the file of the completed operation
 *
 * This releases locks as dictated by the locking type, lets interested parties
 * know that a DIO operation has completed, and calculates the resulting return
 * code for the operation.
 *
 * It lets the filesystem know if it registered an interest earlier via
 * get_block.  Pass the private field of the map buffer_head so that
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
{
	ssize_t transferred = 0;

	/*
	 * AIO submission can race with bio completion to get here while
	 * expecting to have the last io completed by bio completion.
	 * In that case -EIOCBQUEUED is in fact not an error we want
	 * to preserve through this call.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* Check for short read case */
		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;
	}

	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	if (dio->end_io && dio->result) {
		dio->end_io(dio->iocb, offset, transferred,
			    dio->map_bh.b_private, ret, is_async);
	} else if (is_async) {
		aio_complete(dio->iocb, ret, 0);
	}

	if (dio->flags & DIO_LOCKING)
		/* lockdep: non-owner release */
		up_read_non_owner(&dio->inode->i_alloc_sem);

	return ret;
}

static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
 * Asynchronous IO callback.
 */
static void dio_bio_end_aio(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long remaining;
	unsigned long flags;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		dio_complete(dio, dio->iocb->ki_pos, 0, true);
		kfree(dio);
	}
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
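 * Completed BIOs are later popped off this list in process context by
 * dio_await_one() and dio_bio_reap() and passed to dio_bio_complete().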
 */
static void dio_bio_end_io(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}

/**
 * dio_end_io - handle the end io action for the given bio
 * @bio: The direct io bio that's being completed
 * @error: Error if there was one
 *
 * This is meant to be called by any filesystem that uses its own dio_submit_t
 * so that the DIO specific endio actions are dealt with after the filesystem
 * has done its completion work.
 */
void dio_end_io(struct bio *bio, int error)
{
	struct dio *dio = bio->bi_private;

	if (dio->is_async)
		dio_bio_end_aio(bio, error);
	else
		dio_bio_end_io(bio, error);
}
EXPORT_SYMBOL_GPL(dio_end_io);

static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
	      sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, nr_vecs);

	bio->bi_bdev = bdev;
	bio->bi_sector = first_sector;
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;

	dio->bio = bio;
	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
	return 0;
}

/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 *
 * bios hold a dio reference between submit_bio and ->end_io.
 */
static void dio_bio_submit(struct dio *dio)
{
	struct bio *bio = dio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (dio->is_async && dio->rw == READ)
		bio_set_pages_dirty(bio);

	if (dio->submit_io)
		dio->submit_io(dio->rw, bio, dio->inode,
			       dio->logical_offset_in_bio);
	else
		submit_bio(dio->rw, bio);

	dio->bio = NULL;
	dio->boundary = 0;
	dio->logical_offset_in_bio = 0;
}

/*
 * Release any resources in case of a failure
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio))
		page_cache_release(dio_get_page(dio));
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.  NULL is
 * returned once all BIOs have been completed.  This must only be called once
 * all bios have been issued so that dio->refcount can only decrease.  This
 * requires that the caller hold a reference on the dio.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	/*
	 * Wait as long as the list is empty and there are bios in flight.  bio
	 * completion drops the count, maybe adds to the list, and wakes while
	 * holding the bio_lock so we don't need set_current_state()'s barrier
	 * and can call it after testing our condition.
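	 *
	 * If the wakeup arrives after we drop the lock but before
	 * io_schedule(), the task is already back in TASK_RUNNING state, so
	 * io_schedule() returns promptly rather than sleeping forever.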
	 */
	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		io_schedule();
		/* wake up sets us TASK_RUNNING */
		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec;
	int page_no;

	if (!uptodate)
		dio->io_error = -EIO;

	if (dio->is_async && dio->rw == READ) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
			struct page *page = bvec[page_no].bv_page;

			if (dio->rw == READ && !PageCompound(page))
				set_page_dirty_lock(page);
			page_cache_release(page);
		}
		bio_put(bio);
	}
	return uptodate ? 0 : -EIO;
}

/*
 * Wait on and process all in-flight BIOs.  This must only be called once
 * all bios have been issued so that the refcount can only decrease.
 * This just waits for all bios to make it through dio_bio_complete.  IO
 * errors are propagated through dio->io_error and should be propagated via
 * dio_complete().
 */
static void dio_await_completion(struct dio *dio)
{
	struct bio *bio;
	do {
		bio = dio_await_one(dio);
		if (bio)
			dio_bio_complete(dio, bio);
	} while (bio);
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static int dio_bio_reap(struct dio *dio)
{
	int ret = 0;

	if (dio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = dio_bio_complete(dio, bio);
			if (ret == 0)
				ret = ret2;
		}
		dio->reap_counter = 0;
	}
	return ret;
}

/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at dio->blocks_available.  These are in units of the
 * fs blocksize, (1 << inode->i_blkbits).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
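 * (buffer_new() is what tells this code to zero out sub-block regions in
 * dio_zero_block() and to run clean_blockdev_aliases() over the
 * newly-allocated blocks.)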
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
static int get_more_blocks(struct dio *dio)
{
	int ret;
	struct buffer_head *map_bh = &dio->map_bh;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	unsigned long dio_count;/* Number of dio_block-sized blocks */
	unsigned long blkmask;
	int create;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
		fs_startblk = dio->block_in_file >> dio->blkfactor;
		dio_count = dio->final_block_in_request - dio->block_in_file;
		fs_count = dio_count >> dio->blkfactor;
		blkmask = (1 << dio->blkfactor) - 1;
		if (dio_count & blkmask)
			fs_count++;

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << dio->inode->i_blkbits;

		/*
		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
		 * forbid block creations: only overwrites are permitted.
		 * We will return early to the caller once we see an
		 * unmapped buffer head returned, and the caller will fall
		 * back to buffered I/O.
		 *
		 * Otherwise the decision is left to the get_blocks method,
		 * which may decide to handle it or also return an unmapped
		 * buffer head.
		 */
		create = dio->rw & WRITE;
		if (dio->flags & DIO_SKIP_HOLES) {
			if (dio->block_in_file < (i_size_read(dio->inode) >>
							dio->blkbits))
				create = 0;
		}

		ret = (*dio->get_block)(dio->inode, fs_startblk,
						map_bh, create);
	}
	return ret;
}

/*
 * There is no bio.  Make one now.
 */
static int dio_new_bio(struct dio *dio, sector_t start_sector)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio);
	if (ret)
		goto out;
	sector = start_sector << (dio->blkbits - 9);
	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
	BUG_ON(nr_pages <= 0);
	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
	dio->boundary = 0;
out:
	return ret;
}

/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static int dio_bio_add_page(struct dio *dio)
{
	int ret;

	ret = bio_add_page(dio->bio, dio->cur_page,
			dio->cur_page_len, dio->cur_page_offset);
	if (ret == dio->cur_page_len) {
		/*
		 * Decrement count only if we are done with this page
		 */
		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
			dio->pages_in_io--;
		page_cache_get(dio->cur_page);
		dio->final_block_in_bio = dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}

/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
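 *
 * If the chunk is not physically and logically contiguous with the bio under
 * assembly, or the previous block was flagged as a boundary block, that bio
 * is submitted first and a new one is started.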
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static int dio_send_cur_page(struct dio *dio)
{
	int ret = 0;

	if (dio->bio) {
		loff_t cur_offset = dio->cur_page_fs_offset;
		loff_t bio_next_offset = dio->logical_offset_in_bio +
			dio->bio->bi_size;

		/*
		 * See whether this new request is contiguous with the old.
		 *
		 * Btrfs cannot handle having logically non-contiguous requests
		 * submitted.  For example if you have
		 *
		 * Logical:  [0-4095][HOLE][8192-12287]
		 * Physical: [0-4095]      [4096-8191]
		 *
		 * We cannot submit those pages together as one BIO.  So if our
		 * current logical offset in the file does not equal what would
		 * be the next logical offset in the bio, submit the bio we
		 * have.
		 */
		if (dio->final_block_in_bio != dio->cur_page_block ||
		    cur_offset != bio_next_offset)
			dio_bio_submit(dio);
		/*
		 * Submit now if the underlying fs is about to perform a
		 * metadata read
		 */
		else if (dio->boundary)
			dio_bio_submit(dio);
	}

	if (dio->bio == NULL) {
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(dio) != 0) {
		dio_bio_submit(dio);
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret == 0) {
			ret = dio_bio_add_page(dio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}

/*
 * An autonomous function to put a chunk of a page under deferred IO.
 *
 * The caller doesn't actually know (or care) whether this piece of page is in
 * a BIO, or is under IO or whatever.  We just take care of all possible
 * situations here.  The separation between the logic of do_direct_IO() and
 * that of submit_page_section() is important for clarity.  Please don't
 * break it.
 *
 * The chunk of page starts on-disk at blocknr.
 *
 * We perform deferred IO, by recording the last-submitted page inside our
 * private part of the dio structure.  If possible, we just expand the IO
 * across that page here.
 *
 * If that doesn't work out then we put the old page into the bio and add this
 * page to the dio instead.
 */
static int
submit_page_section(struct dio *dio, struct page *page,
		unsigned offset, unsigned len, sector_t blocknr)
{
	int ret = 0;

	if (dio->rw & WRITE) {
		/*
		 * Read accounting is performed in submit_bio()
		 */
		task_io_account_write(len);
	}

	/*
	 * Can we just grow the current page's presence in the dio?
	 */
	if ((dio->cur_page == page) &&
	    (dio->cur_page_offset + dio->cur_page_len == offset) &&
	    (dio->cur_page_block +
	     (dio->cur_page_len >> dio->blkbits) == blocknr)) {
		dio->cur_page_len += len;

		/*
		 * If dio->boundary then we want to schedule the IO now to
		 * avoid metadata seeks.
		 */
		if (dio->boundary) {
			ret = dio_send_cur_page(dio);
			page_cache_release(dio->cur_page);
			dio->cur_page = NULL;
		}
		goto out;
	}

	/*
	 * If there's a deferred page already there then send it.
	 */
	if (dio->cur_page) {
		ret = dio_send_cur_page(dio);
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
		if (ret)
			goto out;
	}

	page_cache_get(page);		/* It is in dio */
	dio->cur_page = page;
	dio->cur_page_offset = offset;
	dio->cur_page_len = len;
	dio->cur_page_block = blocknr;
	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
out:
	return ret;
}

/*
 * Clean any dirty buffers in the blockdev mapping which alias newly-created
 * file blocks.  Only called for S_ISREG files - blockdevs do not set
 * buffer_new.
 */
static void clean_blockdev_aliases(struct dio *dio)
{
	unsigned i;
	unsigned nblocks;

	nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;

	for (i = 0; i < nblocks; i++) {
		unmap_underlying_metadata(dio->map_bh.b_bdev,
					dio->map_bh.b_blocknr + i);
	}
}

/*
 * If we are not writing the entire block and get_block() allocated
 * the block for us, we need to fill in the unused portion of the
 * block with zeros.  This happens only if the user buffer, file offset or
 * IO length is not a filesystem block-size multiple.
 *
 * `end' is zero if we're doing the start of the IO, 1 at the end of the
 * IO.
 */
static void dio_zero_block(struct dio *dio, int end)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* In dio_blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	dio->start_zero_done = 1;
	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
		return;

	dio_blocks_per_fs_block = 1 << dio->blkfactor;
	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);

	if (!this_chunk_blocks)
		return;

	/*
	 * We need to zero out part of an fs block.  It is either at the
	 * beginning or the end of the fs block.
	 */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << dio->blkbits;

	page = ZERO_PAGE(0);
	if (submit_page_section(dio, page, 0, this_chunk_bytes,
				dio->next_block_for_io))
		return;

	dio->next_block_for_io += this_chunk_blocks;
}

/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission.
 *
 * Direct IO against a blockdev is different from a file, because we can
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
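 *
 * For example, a 512-byte aligned request against a filesystem with 4kB
 * blocks runs with blkbits=9 and blkfactor=3: each fs block covers eight
 * dio_blocks, and get_block() results are shifted left by blkfactor before
 * being used here.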
 */
static int do_direct_IO(struct dio *dio)
{
	const unsigned blkbits = dio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	struct buffer_head *map_bh = &dio->map_bh;
	int ret = 0;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) {
			unsigned offset_in_page = block_in_page << blkbits;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (dio->blocks_available == 0) {
				/*
				 * Need to go and map some more disk
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio);
				if (ret) {
					page_cache_release(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				dio->blocks_available =
						map_bh->b_size >> dio->blkbits;
				dio->next_block_for_io =
					map_bh->b_blocknr << dio->blkfactor;
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio);

				if (!dio->blkfactor)
					goto do_holes;

				blkmask = (1 << dio->blkfactor) - 1;
				dio_remainder = (dio->block_in_file & blkmask);

				/*
				 * If we are at the start of IO and that IO
				 * starts partway into a fs-block,
				 * dio_remainder will be non-zero.  If the IO
				 * is a read then we can simply advance the IO
				 * cursor to the first block which is to be
				 * read.  But if the IO is a write and the
				 * block was newly allocated we cannot do that;
				 * the start of the fs block must be zeroed out
				 * on-disk
				 */
				if (!buffer_new(map_bh))
					dio->next_block_for_io += dio_remainder;
				dio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* AKPM: eargh, -ENOTBLK is a hack */
				if (dio->rw & WRITE) {
					page_cache_release(page);
					return -ENOTBLK;
				}

				/*
				 * Be sure to account for a partial block as the
				 * last block in the file
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (dio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				zero_user(page, block_in_page << blkbits,
						1 << blkbits);
				dio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

			/*
			 * If we're performing IO which has an alignment which
			 * is finer than the underlying fs, go check to see if
			 * we must zero out the start of this block.
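			 *
			 * (dio_zero_block() is a no-op unless blkfactor is
			 * non-zero and get_block() reported the block as
			 * newly allocated via buffer_new().)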
			 */
			if (unlikely(dio->blkfactor && !dio->start_zero_done))
				dio_zero_block(dio, 0);

			/*
			 * Work out, in this_chunk_blocks, how much disk we
			 * can add to this page
			 */
			this_chunk_blocks = dio->blocks_available;
			u = (PAGE_SIZE - offset_in_page) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = dio->final_block_in_request - dio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			dio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, page, offset_in_page,
				this_chunk_bytes, dio->next_block_for_io);
			if (ret) {
				page_cache_release(page);
				goto out;
			}
			dio->next_block_for_io += this_chunk_blocks;

			dio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			dio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(dio->block_in_file > dio->final_block_in_request);
			if (dio->block_in_file == dio->final_block_in_request)
				break;
		}

		/* Drop the ref which was taken in get_user_pages() */
		page_cache_release(page);
		block_in_page = 0;
	}
out:
	return ret;
}

/*
 * Releases both i_mutex and i_alloc_sem
 */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io, struct dio *dio)
{
	unsigned long user_addr;
	unsigned long flags;
	int seg;
	ssize_t ret = 0;
	ssize_t ret2;
	size_t bytes;

	dio->inode = inode;
	dio->rw = rw;
	dio->blkbits = blkbits;
	dio->blkfactor = inode->i_blkbits - blkbits;
	dio->block_in_file = offset >> blkbits;

	dio->get_block = get_block;
	dio->end_io = end_io;
	dio->submit_io = submit_io;
	dio->final_block_in_bio = -1;
	dio->next_block_for_io = -1;

	dio->iocb = iocb;
	dio->i_size = i_size_read(inode);

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;

	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out the first and last blocks.
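	 * (dio_zero_block() submits chunks of ZERO_PAGE(0), which is where
	 * those extra pages come from.)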
	 */
	if (unlikely(dio->blkfactor))
		dio->pages_in_io = 2;

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->pages_in_io +=
			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / PAGE_SIZE
				- user_addr / PAGE_SIZE);
	}

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->size += bytes = iov[seg].iov_len;

		/* Index into the first page of the first block */
		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
		dio->final_block_in_request = dio->block_in_file +
						(bytes >> blkbits);
		/* Page fetching state */
		dio->head = 0;
		dio->tail = 0;
		dio->curr_page = 0;

		dio->total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			dio->total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
		dio->curr_user_address = user_addr;

		ret = do_direct_IO(dio);

		dio->result += iov[seg].iov_len -
			((dio->final_block_in_request - dio->block_in_file) <<
					blkbits);

		if (ret) {
			dio_cleanup(dio);
			break;
		}
	} /* end iovec loop */

	if (ret == -ENOTBLK) {
		/*
		 * The remaining part of the request will be
		 * handled by buffered I/O when we return
		 */
		ret = 0;
	}
	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	dio_zero_block(dio, 1);

	if (dio->cur_page) {
		ret2 = dio_send_cur_page(dio);
		if (ret == 0)
			ret = ret2;
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
	}
	if (dio->bio)
		dio_bio_submit(dio);

	/*
	 * It is possible that we return short IO due to end of file.
	 * In that case, we need to release all the pages we got hold of.
	 */
	dio_cleanup(dio);

	/*
	 * All block lookups have been performed.  For READ requests
	 * we can let i_mutex go now that it's achieved its purpose
	 * of protecting us from looking up uninitialized blocks.
	 */
	if (rw == READ && (dio->flags & DIO_LOCKING))
		mutex_unlock(&dio->inode->i_mutex);

	/*
	 * The only time we want to leave bios in flight is when a successful
	 * partial aio read or full aio write has been set up.  In that case
	 * bio completion will call aio_complete.  The only time it's safe to
	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
	 * This had *better* be the only place that raises -EIOCBQUEUED.
	 */
	BUG_ON(ret == -EIOCBQUEUED);
	if (dio->is_async && ret == 0 && dio->result &&
	    ((rw & READ) || (dio->result == dio->size)))
		ret = -EIOCBQUEUED;

	if (ret != -EIOCBQUEUED) {
		/* All IO is now issued, send it on its way */
		blk_run_address_space(inode->i_mapping);
		dio_await_completion(dio);
	}

	/*
	 * Sync will always be dropping the final ref and completing the
	 * operation.  AIO can if it was a broken operation described above or
	 * in fact if all the bios race to complete before we get here.  In
	 * that case dio_complete() translates the EIOCBQUEUED into the proper
	 * return code that the caller will hand to aio_complete().
	 *
	 * This is managed by the bio_lock instead of being an atomic_t so that
	 * completion paths can drop their ref and use the remaining count to
	 * decide to wake the submission path atomically.
	 */
	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (ret2 == 0) {
		ret = dio_complete(dio, offset, ret, false);
		kfree(dio);
	} else
		BUG_ON(ret != -EIOCBQUEUED);

	return ret;
}

/*
 * This is a library function for use by filesystem drivers.
 *
 * The locking rules are governed by the flags parameter:
 *  - if the flags value contains DIO_LOCKING we use a fancy locking
 *    scheme for dumb filesystems.
 *    For writes this function is called under i_mutex and returns with
 *    i_mutex held, for reads, i_mutex is not held on entry, but it is
 *    taken and dropped again before returning.
 *    For reads and writes i_alloc_sem is taken in shared mode and released
 *    on I/O completion (which may happen asynchronously after returning to
 *    the caller).
 *
 *  - if the flags value does NOT contain DIO_LOCKING we don't use any
 *    internal locking but rather rely on the filesystem to synchronize
 *    direct I/O reads/writes versus each other and truncate.
 *    For reads and writes both i_mutex and i_alloc_sem are not held on
 *    entry and are never taken.
 */
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
	dio_submit_t submit_io, int flags)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned blkbits = inode->i_blkbits;
	unsigned bdev_blkbits = 0;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;
	struct dio *dio;

	if (rw & WRITE)
		rw = WRITE_ODIRECT_PLUG;

	if (bdev)
		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));

	if (offset & blocksize_mask) {
		if (bdev)
			blkbits = bdev_blkbits;
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
			if (bdev)
				blkbits = bdev_blkbits;
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;
	/*
	 * Believe it or not, zeroing out the page array caused a .5%
	 * performance regression in a database benchmark.  So, we take
	 * care to only zero out what's needed.
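	 *
	 * (Everything up to, but not including, pages[] is cleared; pages[]
	 * itself is filled by dio_refill_pages() before it is read.)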
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {
		/* watch out for a 0 len io from a tricksy fs */
		if (rw == READ && end > offset) {
			struct address_space *mapping =
					iocb->ki_filp->f_mapping;

			/* will be released by direct_io_worker */
			mutex_lock(&inode->i_mutex);

			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				mutex_unlock(&inode->i_mutex);
				kfree(dio);
				goto out;
			}
		}

		/*
		 * Will be released at I/O completion, possibly in a
		 * different thread.
		 */
		down_read_non_owner(&inode->i_alloc_sem);
	}

	/*
	 * For file-extending writes, updating i_size before data
	 * writeouts complete can expose uninitialized blocks.  So
	 * even for AIO, we need to wait for I/O to complete before
	 * returning in this case.
	 */
	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
		(end > i_size_read(inode)));

	retval = direct_io_worker(rw, iocb, inode, iov, offset,
				nr_segs, blkbits, get_block, end_io,
				submit_io, dio);

out:
	return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);