/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

/* put a record for an inode into the defrag tree.  The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
static int __btrfs_add_inode_defrag(struct inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	p = &root->fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (defrag->ino < entry->ino)
			p = &parent->rb_left;
		else if (defrag->ino > entry->ino)
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			goto exists;
		}
	}
	BTRFS_I(inode)->in_defrag = 1;
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
	return 0;

exists:
	kfree(defrag);
	return 0;

}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *defrag;
	int ret = 0;
	u64 transid;

	if (!btrfs_test_opt(root, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(root->fs_info))
		return 0;

	if (BTRFS_I(inode)->in_defrag)
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = BTRFS_I(inode)->root->last_trans;

	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&root->fs_info->defrag_inodes_lock);
	if (!BTRFS_I(inode)->in_defrag)
		ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&root->fs_info->defrag_inodes_lock);
	return ret;
}

/*
 * must be called with the defrag_inodes lock held
 */
struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
					     struct rb_node **next)
{
	struct inode_defrag *entry = NULL;
	struct rb_node *p;
	struct rb_node *parent = NULL;

	p = info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (ino < entry->ino)
			p = parent->rb_left;
		else if (ino > entry->ino)
			p = parent->rb_right;
		else
			return entry;
	}

	if (next) {
		while (parent && ino > entry->ino) {
			parent = rb_next(parent);
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		}
		*next = parent;
	}
	return NULL;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct rb_node *n;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	u64 first_ino = 0;
	int num_defrag;
	int defrag_batch = 1024;

	memset(&range, 0, sizeof(range));
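	/* a length of (u64)-1 means defrag all the way to the end of the file */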
	range.len = (u64)-1;

	atomic_inc(&fs_info->defrag_running);
	spin_lock(&fs_info->defrag_inodes_lock);
	while (1) {
		n = NULL;

		/* find an inode to defrag */
		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
		if (!defrag) {
			if (n)
				defrag = rb_entry(n, struct inode_defrag, rb_node);
			else if (first_ino) {
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		/* remove it from the rbtree */
		first_ino = defrag->ino + 1;
		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);

		if (btrfs_fs_closing(fs_info))
			goto next_free;

		spin_unlock(&fs_info->defrag_inodes_lock);

		/* get the inode */
		key.objectid = defrag->root;
		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
		key.offset = (u64)-1;
		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
		if (IS_ERR(inode_root))
			goto next;

		key.objectid = defrag->ino;
		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
		key.offset = 0;

		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
		if (IS_ERR(inode))
			goto next;

		/* do a chunk of defrag */
		BTRFS_I(inode)->in_defrag = 0;
		range.start = defrag->last_offset;
		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
					       defrag_batch);
		/*
		 * if we filled the whole defrag batch, there
		 * must be more work to do.  Queue this defrag
		 * again
		 */
		if (num_defrag == defrag_batch) {
			defrag->last_offset = range.start;
			__btrfs_add_inode_defrag(inode, defrag);
			/*
			 * we don't want to kfree defrag, we added it back to
			 * the rbtree
			 */
			defrag = NULL;
		} else if (defrag->last_offset && !defrag->cycled) {
			/*
			 * we didn't fill our defrag batch, but
			 * we didn't start at zero.  Make sure we loop
			 * around to the start of the file.
			 */
			defrag->last_offset = 0;
			defrag->cycled = 1;
			__btrfs_add_inode_defrag(inode, defrag);
			defrag = NULL;
		}

		iput(inode);
next:
		spin_lock(&fs_info->defrag_inodes_lock);
next_free:
		kfree(defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);

	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 *
		 * Disable pagefault to avoid recursive lock since
		 * the pages are already locked
		 */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
		pagefault_enable();

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
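		 * (a short atomic copy would leave part of a not-uptodate
		 * page with stale contents)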
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_aio_write to fault page */
		if (unlikely(copied == 0))
			break;

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
		 * clear it here
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
		      struct page **pages, size_t num_pages,
		      loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
{
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(inode, end_pos);
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
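 * The end offset is inclusive; the extent map tree lock is taken and
 * released internally.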
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		BUG_ON(!split || !split2);

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
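 *
 * The end offset here is exclusive and the caller must already hold a
 * running transaction handle (trans).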
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, -1);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid > ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			WARN_ON(1);
			extent_end = search_start;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		search_start = max(key.offset, start);
		if (recow) {
			btrfs_release_path(path);
			continue;
		}

		/*
		 * | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (disk_bytenr > 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
			key.offset = start;
		}
		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, end - key.offset);
				*hint_byte = disk_bytenr;
			}
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, extent_end - start);
				*hint_byte = disk_bytenr;
			}
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   root->sectorsize);
			} else if (disk_bytenr > 0) {
				ret = btrfs_free_extent(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						key.objectid, key.offset -
						extent_offset);
				BUG_ON(ret);
				inode_sub_bytes(inode,
						extent_end - key.offset);
				*hint_byte = disk_bytenr;
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			BUG_ON(ret);

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG_ON(1);
	}

	if (del_nr > 0) {
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}

	btrfs_free_path(path);
	return ret;
}

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
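	/* a zero *start or *end means the caller does not constrain that side */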
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		BUG_ON(ret < 0);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   ino, orig_offset);
		BUG_ON(ret);

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset);
		BUG_ON(ret);
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset);
		BUG_ON(ret);
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value;
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos)
{
	int ret = 0;

	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}

/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
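 *
 * The pages are returned locked; the caller is responsible for unlocking
 * and releasing them later (normally via btrfs_drop_pages).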
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
{
	struct extent_state *cached_state = NULL;
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	int faili = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
		if (err)
			return err;
	}

again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}
	err = 0;
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos - 1, 0, &cached_state,
				 GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							     last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
				  GFP_NOFS);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     start_pos, last_pos - 1, &cached_state,
				     GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;

}

static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	unsigned long first_index;
	unsigned long last_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		ret = btrfs_delalloc_reserve_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
		if (ret)
			break;

		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_delalloc_release_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
			break;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0)
			dirty_pages = 0;
		else
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;

		/*
		 * If we had a short copy we need to release the excess
		 * delalloc bytes we reserved.  We need to increment
		 * outstanding_extents because btrfs_delalloc_release_space
		 * will decrement it, but we still have an outstanding extent
		 * for the chunk we actually managed to copy.
		 */
		if (num_pages > dirty_pages) {
			if (copied > 0)
				atomic_inc(
					&BTRFS_I(inode)->outstanding_extents);
			btrfs_delalloc_release_space(inode,
					(num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT);
		}

		if (copied > 0) {
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
			if (ret) {
				btrfs_delalloc_release_space(inode,
					dirty_pages << PAGE_CACHE_SHIFT);
				btrfs_drop_pages(pages, num_pages);
				break;
			}
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
						   dirty_pages);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root, 1);
		btrfs_throttle(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	return num_written ? num_written : ret;
}

static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    loff_t *ppos, size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct iov_iter i;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
					    count, ocount);

	/*
	 * the generic O_DIRECT will update in-memory i_size after the
	 * DIOs are done.  But our endio handlers that update the on
	 * disk i_size never update past the in memory i_size.  So we
	 * need one more update here to catch any additions to the
	 * file
	 */
	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
		mark_inode_dirty(inode);
	}

	if (written < 0 || written == count)
		return written;

	pos += written;
	count -= written;
	iov_iter_init(&i, iov, nr_segs, count, written);
	written_buffered = __btrfs_buffered_write(file, &i, pos);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	endbyte = pos + written_buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	*ppos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 endbyte >> PAGE_CACHE_SHIFT);
out:
	return written ? written : err;
}

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	loff_t *ppos = &iocb->ki_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	if (count == 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	err = file_remove_suid(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		mutex_unlock(&inode->i_mutex);
		err = -EROFS;
		goto out;
	}

	file_update_time(file);
	BTRFS_I(inode)->sequence++;

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, ppos, count, ocount);
	} else {
		struct iov_iter i;

		iov_iter_init(&i, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &i, pos);
		if (num_written > 0)
			*ppos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
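	 *
	 * (btrfs_sync_file compares last_trans with last_trans_committed
	 * to decide whether the inode needs to be logged at all)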
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	if (num_written > 0 || num_written == -EIOCBQUEUED) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0 && num_written > 0)
			num_written = err;
	}
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	trace_btrfs_sync_file(file, datasync);

	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret)
		return ret;
	mutex_lock(&inode->i_mutex);

	/* we wait first, since the writeback may change the inode */
	root->log_batch++;
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	smp_mb();
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&inode->i_mutex);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		ret = btrfs_end_transaction(trans, root);
	}
out:
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	return 0;
}

static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct extent_state *cached_state = NULL;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
	struct extent_map *em;
	int ret;

	alloc_start = offset & ~mask;
	alloc_end = (offset + len + mask) & ~mask;

	/* We only support the FALLOC_FL_KEEP_SIZE mode */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
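	 *
	 * (btrfs_wait_ordered_range waits for any pending ordered
	 * extents in the range to complete)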
	 */
	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	}

	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
	if (ret)
		goto out;

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			btrfs_wait_ordered_range(inode, alloc_start,
						 alloc_end - alloc_start);
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	cur_offset = alloc_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		BUG_ON(IS_ERR_OR_NULL(em));
		last_byte = min(extent_map_end(em), alloc_end);
		last_byte = (last_byte + mask) & ~mask;
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
							last_byte - cur_offset,
							1 << inode->i_blkbits,
							offset + len,
							&alloc_hint);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		}
		free_extent_map(em);

		cur_offset = last_byte;
		if (cur_offset >= alloc_end) {
			ret = 0;
			break;
		}
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);

	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}

static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	u64 lockstart = *offset;
	u64 lockend = i_size_read(inode);
	u64 start = *offset;
	u64 orig_start = *offset;
	u64 len = i_size_read(inode);
	u64 last_end = 0;
	int ret = 0;

	lockend = max_t(u64, root->sectorsize, lockend);
	if (lockend <= lockstart)
		lockend = lockstart + root->sectorsize;

	len = lockend - lockstart + 1;

	len = max_t(u64, len, root->sectorsize);
	if (inode->i_size == 0)
		return -ENXIO;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
			 &cached_state, GFP_NOFS);

	/*
	 * Delalloc is such a pain.  If we have a hole and we have pending
	 * delalloc for a portion of the hole we will get back a hole that
	 * exists for the entire range since it hasn't been actually written
	 * yet.  So to take care of this case we need to look for an extent just
	 * before the position we want in case there is outstanding delalloc
	 * going on here.
	 */
	if (origin == SEEK_HOLE && start != 0) {
		if (start <= root->sectorsize)
			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
						     root->sectorsize, 0);
		else
			em = btrfs_get_extent_fiemap(inode, NULL, 0,
						     start - root->sectorsize,
						     root->sectorsize, 0);
		if (IS_ERR(em)) {
			ret = -ENXIO;
			goto out;
		}
		last_end = em->start + em->len;
		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);
		free_extent_map(em);
	}

	while (1) {
		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
		if (IS_ERR(em)) {
			ret = -ENXIO;
			break;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
				if (last_end <= orig_start) {
					free_extent_map(em);
					ret = -ENXIO;
					break;
				}
			}

			if (origin == SEEK_HOLE) {
				*offset = start;
				free_extent_map(em);
				break;
			}
		} else {
			if (origin == SEEK_DATA) {
				if (em->block_start == EXTENT_MAP_DELALLOC) {
					if (start >= inode->i_size) {
						free_extent_map(em);
						ret = -ENXIO;
						break;
					}
				}

				*offset = start;
				free_extent_map(em);
				break;
			}
		}

		start = em->start + em->len;
		last_end = em->start + em->len;

		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);

		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
			free_extent_map(em);
			ret = -ENXIO;
			break;
		}
		free_extent_map(em);
		cond_resched();
	}
	if (!ret)
		*offset = min(*offset, inode->i_size);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
	return ret;
}

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	mutex_lock(&inode->i_mutex);
	switch (origin) {
	case SEEK_END:
	case SEEK_CUR:
		offset = generic_file_llseek_unlocked(file, offset, origin);
		goto out;
	case SEEK_DATA:
	case SEEK_HOLE:
		ret = find_desired_extent(inode, &offset, origin);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
		offset = -EINVAL;
		goto out;
	}
	if (offset > inode->i_sb->s_maxbytes) {
		offset = -EINVAL;
		goto out;
	}

	/* Special lock needed here? */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
out:
	mutex_unlock(&inode->i_mutex);
	return offset;
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};