1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/fs.h> 20 #include <linux/pagemap.h> 21 #include <linux/highmem.h> 22 #include <linux/time.h> 23 #include <linux/init.h> 24 #include <linux/string.h> 25 #include <linux/backing-dev.h> 26 #include <linux/mpage.h> 27 #include <linux/aio.h> 28 #include <linux/falloc.h> 29 #include <linux/swap.h> 30 #include <linux/writeback.h> 31 #include <linux/statfs.h> 32 #include <linux/compat.h> 33 #include <linux/slab.h> 34 #include <linux/btrfs.h> 35 #include "ctree.h" 36 #include "disk-io.h" 37 #include "transaction.h" 38 #include "btrfs_inode.h" 39 #include "print-tree.h" 40 #include "tree-log.h" 41 #include "locking.h" 42 #include "compat.h" 43 #include "volumes.h" 44 45 static struct kmem_cache *btrfs_inode_defrag_cachep; 46 /* 47 * when auto defrag is enabled we 48 * queue up these defrag structs to remember which 49 * inodes need defragging passes 50 */ 51 struct inode_defrag { 52 struct rb_node rb_node; 53 /* objectid */ 54 u64 ino; 55 /* 56 * transid where the defrag was added, we search for 57 * extents newer than this 58 */ 59 u64 transid; 60 61 /* root objectid */ 62 u64 root; 63 64 /* last offset we were able to defrag */ 65 u64 last_offset; 66 67 /* if we've wrapped around back to zero once already */ 68 int cycled; 69 }; 70 71 static int __compare_inode_defrag(struct inode_defrag *defrag1, 72 struct inode_defrag *defrag2) 73 { 74 if (defrag1->root > defrag2->root) 75 return 1; 76 else if (defrag1->root < defrag2->root) 77 return -1; 78 else if (defrag1->ino > defrag2->ino) 79 return 1; 80 else if (defrag1->ino < defrag2->ino) 81 return -1; 82 else 83 return 0; 84 } 85 86 /* pop a record for an inode into the defrag tree. 
The lock 87 * must be held already 88 * 89 * If you're inserting a record for an older transid than an 90 * existing record, the transid already in the tree is lowered 91 * 92 * If an existing record is found, the defrag item you 93 * pass in is freed 94 */ 95 static int __btrfs_add_inode_defrag(struct inode *inode, 96 struct inode_defrag *defrag) 97 { 98 struct btrfs_root *root = BTRFS_I(inode)->root; 99 struct inode_defrag *entry; 100 struct rb_node **p; 101 struct rb_node *parent = NULL; 102 int ret; 103 104 p = &root->fs_info->defrag_inodes.rb_node; 105 while (*p) { 106 parent = *p; 107 entry = rb_entry(parent, struct inode_defrag, rb_node); 108 109 ret = __compare_inode_defrag(defrag, entry); 110 if (ret < 0) 111 p = &parent->rb_left; 112 else if (ret > 0) 113 p = &parent->rb_right; 114 else { 115 /* if we're reinserting an entry for 116 * an old defrag run, make sure to 117 * lower the transid of our existing record 118 */ 119 if (defrag->transid < entry->transid) 120 entry->transid = defrag->transid; 121 if (defrag->last_offset > entry->last_offset) 122 entry->last_offset = defrag->last_offset; 123 return -EEXIST; 124 } 125 } 126 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 127 rb_link_node(&defrag->rb_node, parent, p); 128 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 129 return 0; 130 } 131 132 static inline int __need_auto_defrag(struct btrfs_root *root) 133 { 134 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 135 return 0; 136 137 if (btrfs_fs_closing(root->fs_info)) 138 return 0; 139 140 return 1; 141 } 142 143 /* 144 * insert a defrag record for this inode if auto defrag is 145 * enabled 146 */ 147 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 148 struct inode *inode) 149 { 150 struct btrfs_root *root = BTRFS_I(inode)->root; 151 struct inode_defrag *defrag; 152 u64 transid; 153 int ret; 154 155 if (!__need_auto_defrag(root)) 156 return 0; 157 158 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 159 return 0; 160 161 if (trans) 162 transid = trans->transid; 163 else 164 transid = BTRFS_I(inode)->root->last_trans; 165 166 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 167 if (!defrag) 168 return -ENOMEM; 169 170 defrag->ino = btrfs_ino(inode); 171 defrag->transid = transid; 172 defrag->root = root->root_key.objectid; 173 174 spin_lock(&root->fs_info->defrag_inodes_lock); 175 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { 176 /* 177 * If we set the IN_DEFRAG flag and evict the inode from memory, 178 * and then re-read this inode, the new in-memory inode doesn't 179 * have the IN_DEFRAG flag set. In that case, we may find an existing defrag record. 180 */ 181 ret = __btrfs_add_inode_defrag(inode, defrag); 182 if (ret) 183 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 184 } else { 185 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 186 } 187 spin_unlock(&root->fs_info->defrag_inodes_lock); 188 return 0; 189 } 190 191 /* 192 * Requeue the defrag object. If there is a defrag object that points to 193 * the same inode in the tree, we will merge them together (by 194 * __btrfs_add_inode_defrag()) and free the one that we want to requeue. 195 */ 196 static void btrfs_requeue_inode_defrag(struct inode *inode, 197 struct inode_defrag *defrag) 198 { 199 struct btrfs_root *root = BTRFS_I(inode)->root; 200 int ret; 201 202 if (!__need_auto_defrag(root)) 203 goto out; 204 205 /* 206 * Here we don't check the IN_DEFRAG flag, because we need to merge 207 * them together.
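* The merge itself happens in __btrfs_add_inode_defrag(): the record already in the tree keeps the lower transid and the higher last_offset, and the -EEXIST return tells us to free the defrag object we tried to requeue.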
208 */ 209 spin_lock(&root->fs_info->defrag_inodes_lock); 210 ret = __btrfs_add_inode_defrag(inode, defrag); 211 spin_unlock(&root->fs_info->defrag_inodes_lock); 212 if (ret) 213 goto out; 214 return; 215 out: 216 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 217 } 218 219 /* 220 * pick the defragable inode that we want, if it doesn't exist, we will get 221 * the next one. 222 */ 223 static struct inode_defrag * 224 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) 225 { 226 struct inode_defrag *entry = NULL; 227 struct inode_defrag tmp; 228 struct rb_node *p; 229 struct rb_node *parent = NULL; 230 int ret; 231 232 tmp.ino = ino; 233 tmp.root = root; 234 235 spin_lock(&fs_info->defrag_inodes_lock); 236 p = fs_info->defrag_inodes.rb_node; 237 while (p) { 238 parent = p; 239 entry = rb_entry(parent, struct inode_defrag, rb_node); 240 241 ret = __compare_inode_defrag(&tmp, entry); 242 if (ret < 0) 243 p = parent->rb_left; 244 else if (ret > 0) 245 p = parent->rb_right; 246 else 247 goto out; 248 } 249 250 if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 251 parent = rb_next(parent); 252 if (parent) 253 entry = rb_entry(parent, struct inode_defrag, rb_node); 254 else 255 entry = NULL; 256 } 257 out: 258 if (entry) 259 rb_erase(parent, &fs_info->defrag_inodes); 260 spin_unlock(&fs_info->defrag_inodes_lock); 261 return entry; 262 } 263 264 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 265 { 266 struct inode_defrag *defrag; 267 struct rb_node *node; 268 269 spin_lock(&fs_info->defrag_inodes_lock); 270 node = rb_first(&fs_info->defrag_inodes); 271 while (node) { 272 rb_erase(node, &fs_info->defrag_inodes); 273 defrag = rb_entry(node, struct inode_defrag, rb_node); 274 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 275 276 if (need_resched()) { 277 spin_unlock(&fs_info->defrag_inodes_lock); 278 cond_resched(); 279 spin_lock(&fs_info->defrag_inodes_lock); 280 } 281 282 node = rb_first(&fs_info->defrag_inodes); 283 } 284 spin_unlock(&fs_info->defrag_inodes_lock); 285 } 286 287 #define BTRFS_DEFRAG_BATCH 1024 288 289 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, 290 struct inode_defrag *defrag) 291 { 292 struct btrfs_root *inode_root; 293 struct inode *inode; 294 struct btrfs_key key; 295 struct btrfs_ioctl_defrag_range_args range; 296 int num_defrag; 297 int index; 298 int ret; 299 300 /* get the inode */ 301 key.objectid = defrag->root; 302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 303 key.offset = (u64)-1; 304 305 index = srcu_read_lock(&fs_info->subvol_srcu); 306 307 inode_root = btrfs_read_fs_root_no_name(fs_info, &key); 308 if (IS_ERR(inode_root)) { 309 ret = PTR_ERR(inode_root); 310 goto cleanup; 311 } 312 313 key.objectid = defrag->ino; 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 315 key.offset = 0; 316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 317 if (IS_ERR(inode)) { 318 ret = PTR_ERR(inode); 319 goto cleanup; 320 } 321 srcu_read_unlock(&fs_info->subvol_srcu, index); 322 323 /* do a chunk of defrag */ 324 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 325 memset(&range, 0, sizeof(range)); 326 range.len = (u64)-1; 327 range.start = defrag->last_offset; 328 329 sb_start_write(fs_info->sb); 330 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 331 BTRFS_DEFRAG_BATCH); 332 sb_end_write(fs_info->sb); 333 /* 334 * if we filled the whole defrag batch, there 335 * must be more work to do. 
Queue this defrag 336 * again 337 */ 338 if (num_defrag == BTRFS_DEFRAG_BATCH) { 339 defrag->last_offset = range.start; 340 btrfs_requeue_inode_defrag(inode, defrag); 341 } else if (defrag->last_offset && !defrag->cycled) { 342 /* 343 * we didn't fill our defrag batch, but 344 * we didn't start at zero. Make sure we loop 345 * around to the start of the file. 346 */ 347 defrag->last_offset = 0; 348 defrag->cycled = 1; 349 btrfs_requeue_inode_defrag(inode, defrag); 350 } else { 351 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 352 } 353 354 iput(inode); 355 return 0; 356 cleanup: 357 srcu_read_unlock(&fs_info->subvol_srcu, index); 358 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 359 return ret; 360 } 361 362 /* 363 * run through the list of inodes in the FS that need 364 * defragging 365 */ 366 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 367 { 368 struct inode_defrag *defrag; 369 u64 first_ino = 0; 370 u64 root_objectid = 0; 371 372 atomic_inc(&fs_info->defrag_running); 373 while(1) { 374 /* Pause the auto defragger. */ 375 if (test_bit(BTRFS_FS_STATE_REMOUNTING, 376 &fs_info->fs_state)) 377 break; 378 379 if (!__need_auto_defrag(fs_info->tree_root)) 380 break; 381 382 /* find an inode to defrag */ 383 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, 384 first_ino); 385 if (!defrag) { 386 if (root_objectid || first_ino) { 387 root_objectid = 0; 388 first_ino = 0; 389 continue; 390 } else { 391 break; 392 } 393 } 394 395 first_ino = defrag->ino + 1; 396 root_objectid = defrag->root; 397 398 __btrfs_run_defrag_inode(fs_info, defrag); 399 } 400 atomic_dec(&fs_info->defrag_running); 401 402 /* 403 * during unmount, we use the transaction_wait queue to 404 * wait for the defragger to stop 405 */ 406 wake_up(&fs_info->transaction_wait); 407 return 0; 408 } 409 410 /* simple helper to fault in pages and copy. This should go away 411 * and be replaced with calls into generic code. 412 */ 413 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 414 size_t write_bytes, 415 struct page **prepared_pages, 416 struct iov_iter *i) 417 { 418 size_t copied = 0; 419 size_t total_copied = 0; 420 int pg = 0; 421 int offset = pos & (PAGE_CACHE_SIZE - 1); 422 423 while (write_bytes > 0) { 424 size_t count = min_t(size_t, 425 PAGE_CACHE_SIZE - offset, write_bytes); 426 struct page *page = prepared_pages[pg]; 427 /* 428 * Copy data from userspace to the current page 429 * 430 * Disable pagefault to avoid recursive lock since 431 * the pages are already locked 432 */ 433 pagefault_disable(); 434 copied = iov_iter_copy_from_user_atomic(page, i, offset, count); 435 pagefault_enable(); 436 437 /* Flush processor's dcache for this page */ 438 flush_dcache_page(page); 439 440 /* 441 * if we get a partial write, we can end up with 442 * partially up to date pages. These add 443 * a lot of complexity, so make sure they don't 444 * happen by forcing this copy to be retried. 445 * 446 * The rest of the btrfs_file_write code will fall 447 * back to page at a time copies after we return 0. 
448 */ 449 if (!PageUptodate(page) && copied < count) 450 copied = 0; 451 452 iov_iter_advance(i, copied); 453 write_bytes -= copied; 454 total_copied += copied; 455 456 /* Return to btrfs_file_aio_write to fault page */ 457 if (unlikely(copied == 0)) 458 break; 459 460 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 461 offset += copied; 462 } else { 463 pg++; 464 offset = 0; 465 } 466 } 467 return total_copied; 468 } 469 470 /* 471 * unlocks pages after btrfs_file_write is done with them 472 */ 473 static void btrfs_drop_pages(struct page **pages, size_t num_pages) 474 { 475 size_t i; 476 for (i = 0; i < num_pages; i++) { 477 /* page checked is some magic around finding pages that 478 * have been modified without going through btrfs_set_page_dirty 479 * clear it here 480 */ 481 ClearPageChecked(pages[i]); 482 unlock_page(pages[i]); 483 mark_page_accessed(pages[i]); 484 page_cache_release(pages[i]); 485 } 486 } 487 488 /* 489 * after copy_from_user, pages need to be dirtied and we need to make 490 * sure holes are created between the current EOF and the start of 491 * any next extents (if required). 492 * 493 * this also makes the decision about creating an inline extent vs 494 * doing real data extents, marking pages dirty and delalloc as required. 495 */ 496 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, 497 struct page **pages, size_t num_pages, 498 loff_t pos, size_t write_bytes, 499 struct extent_state **cached) 500 { 501 int err = 0; 502 int i; 503 u64 num_bytes; 504 u64 start_pos; 505 u64 end_of_last_block; 506 u64 end_pos = pos + write_bytes; 507 loff_t isize = i_size_read(inode); 508 509 start_pos = pos & ~((u64)root->sectorsize - 1); 510 num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); 511 512 end_of_last_block = start_pos + num_bytes - 1; 513 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 514 cached); 515 if (err) 516 return err; 517 518 for (i = 0; i < num_pages; i++) { 519 struct page *p = pages[i]; 520 SetPageUptodate(p); 521 ClearPageChecked(p); 522 set_page_dirty(p); 523 } 524 525 /* 526 * we've only changed i_size in ram, and we haven't updated 527 * the disk i_size. There is no need to log the inode 528 * at this time. 529 */ 530 if (end_pos > isize) 531 i_size_write(inode, end_pos); 532 return 0; 533 } 534 535 /* 536 * this drops all the extents in the cache that intersect the range 537 * [start, end]. Existing extents are split as required. 
538 */ 539 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 540 int skip_pinned) 541 { 542 struct extent_map *em; 543 struct extent_map *split = NULL; 544 struct extent_map *split2 = NULL; 545 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 546 u64 len = end - start + 1; 547 u64 gen; 548 int ret; 549 int testend = 1; 550 unsigned long flags; 551 int compressed = 0; 552 bool modified; 553 554 WARN_ON(end < start); 555 if (end == (u64)-1) { 556 len = (u64)-1; 557 testend = 0; 558 } 559 while (1) { 560 int no_splits = 0; 561 562 modified = false; 563 if (!split) 564 split = alloc_extent_map(); 565 if (!split2) 566 split2 = alloc_extent_map(); 567 if (!split || !split2) 568 no_splits = 1; 569 570 write_lock(&em_tree->lock); 571 em = lookup_extent_mapping(em_tree, start, len); 572 if (!em) { 573 write_unlock(&em_tree->lock); 574 break; 575 } 576 flags = em->flags; 577 gen = em->generation; 578 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 579 if (testend && em->start + em->len >= start + len) { 580 free_extent_map(em); 581 write_unlock(&em_tree->lock); 582 break; 583 } 584 start = em->start + em->len; 585 if (testend) 586 len = start + len - (em->start + em->len); 587 free_extent_map(em); 588 write_unlock(&em_tree->lock); 589 continue; 590 } 591 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 592 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 593 clear_bit(EXTENT_FLAG_LOGGING, &flags); 594 modified = !list_empty(&em->list); 595 remove_extent_mapping(em_tree, em); 596 if (no_splits) 597 goto next; 598 599 if (em->block_start < EXTENT_MAP_LAST_BYTE && 600 em->start < start) { 601 split->start = em->start; 602 split->len = start - em->start; 603 split->orig_start = em->orig_start; 604 split->block_start = em->block_start; 605 606 if (compressed) 607 split->block_len = em->block_len; 608 else 609 split->block_len = split->len; 610 split->ram_bytes = em->ram_bytes; 611 split->orig_block_len = max(split->block_len, 612 em->orig_block_len); 613 split->generation = gen; 614 split->bdev = em->bdev; 615 split->flags = flags; 616 split->compress_type = em->compress_type; 617 ret = add_extent_mapping(em_tree, split, modified); 618 BUG_ON(ret); /* Logic error */ 619 free_extent_map(split); 620 split = split2; 621 split2 = NULL; 622 } 623 if (em->block_start < EXTENT_MAP_LAST_BYTE && 624 testend && em->start + em->len > start + len) { 625 u64 diff = start + len - em->start; 626 627 split->start = start + len; 628 split->len = em->start + em->len - (start + len); 629 split->bdev = em->bdev; 630 split->flags = flags; 631 split->compress_type = em->compress_type; 632 split->generation = gen; 633 split->orig_block_len = max(em->block_len, 634 em->orig_block_len); 635 split->ram_bytes = em->ram_bytes; 636 637 if (compressed) { 638 split->block_len = em->block_len; 639 split->block_start = em->block_start; 640 split->orig_start = em->orig_start; 641 } else { 642 split->block_len = split->len; 643 split->block_start = em->block_start + diff; 644 split->orig_start = em->orig_start; 645 } 646 647 ret = add_extent_mapping(em_tree, split, modified); 648 BUG_ON(ret); /* Logic error */ 649 free_extent_map(split); 650 split = NULL; 651 } 652 next: 653 write_unlock(&em_tree->lock); 654 655 /* once for us */ 656 free_extent_map(em); 657 /* once for the tree*/ 658 free_extent_map(em); 659 } 660 if (split) 661 free_extent_map(split); 662 if (split2) 663 free_extent_map(split2); 664 } 665 666 /* 667 * this is very complex, but the basic idea is to drop all extents 
668 * in the range start - end. hint_block is filled in with a block number 669 * that would be a good hint to the block allocator for this file. 670 * 671 * If an extent intersects the range but is not entirely inside the range 672 * it is either truncated or split. Anything entirely inside the range 673 * is deleted from the tree. 674 */ 675 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 676 struct btrfs_root *root, struct inode *inode, 677 struct btrfs_path *path, u64 start, u64 end, 678 u64 *drop_end, int drop_cache) 679 { 680 struct extent_buffer *leaf; 681 struct btrfs_file_extent_item *fi; 682 struct btrfs_key key; 683 struct btrfs_key new_key; 684 u64 ino = btrfs_ino(inode); 685 u64 search_start = start; 686 u64 disk_bytenr = 0; 687 u64 num_bytes = 0; 688 u64 extent_offset = 0; 689 u64 extent_end = 0; 690 int del_nr = 0; 691 int del_slot = 0; 692 int extent_type; 693 int recow; 694 int ret; 695 int modify_tree = -1; 696 int update_refs = (root->ref_cows || root == root->fs_info->tree_root); 697 int found = 0; 698 699 if (drop_cache) 700 btrfs_drop_extent_cache(inode, start, end - 1, 0); 701 702 if (start >= BTRFS_I(inode)->disk_i_size) 703 modify_tree = 0; 704 705 while (1) { 706 recow = 0; 707 ret = btrfs_lookup_file_extent(trans, root, path, ino, 708 search_start, modify_tree); 709 if (ret < 0) 710 break; 711 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 712 leaf = path->nodes[0]; 713 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 714 if (key.objectid == ino && 715 key.type == BTRFS_EXTENT_DATA_KEY) 716 path->slots[0]--; 717 } 718 ret = 0; 719 next_slot: 720 leaf = path->nodes[0]; 721 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 722 BUG_ON(del_nr > 0); 723 ret = btrfs_next_leaf(root, path); 724 if (ret < 0) 725 break; 726 if (ret > 0) { 727 ret = 0; 728 break; 729 } 730 leaf = path->nodes[0]; 731 recow = 1; 732 } 733 734 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 735 if (key.objectid > ino || 736 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 737 break; 738 739 fi = btrfs_item_ptr(leaf, path->slots[0], 740 struct btrfs_file_extent_item); 741 extent_type = btrfs_file_extent_type(leaf, fi); 742 743 if (extent_type == BTRFS_FILE_EXTENT_REG || 744 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 745 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 746 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 747 extent_offset = btrfs_file_extent_offset(leaf, fi); 748 extent_end = key.offset + 749 btrfs_file_extent_num_bytes(leaf, fi); 750 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 751 extent_end = key.offset + 752 btrfs_file_extent_inline_len(leaf, fi); 753 } else { 754 WARN_ON(1); 755 extent_end = search_start; 756 } 757 758 if (extent_end <= search_start) { 759 path->slots[0]++; 760 goto next_slot; 761 } 762 763 found = 1; 764 search_start = max(key.offset, start); 765 if (recow || !modify_tree) { 766 modify_tree = -1; 767 btrfs_release_path(path); 768 continue; 769 } 770 771 /* 772 * | - range to drop - | 773 * | -------- extent -------- | 774 */ 775 if (start > key.offset && end < extent_end) { 776 BUG_ON(del_nr > 0); 777 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 778 779 memcpy(&new_key, &key, sizeof(new_key)); 780 new_key.offset = start; 781 ret = btrfs_duplicate_item(trans, root, path, 782 &new_key); 783 if (ret == -EAGAIN) { 784 btrfs_release_path(path); 785 continue; 786 } 787 if (ret < 0) 788 break; 789 790 leaf = path->nodes[0]; 791 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 792 struct 
btrfs_file_extent_item); 793 btrfs_set_file_extent_num_bytes(leaf, fi, 794 start - key.offset); 795 796 fi = btrfs_item_ptr(leaf, path->slots[0], 797 struct btrfs_file_extent_item); 798 799 extent_offset += start - key.offset; 800 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 801 btrfs_set_file_extent_num_bytes(leaf, fi, 802 extent_end - start); 803 btrfs_mark_buffer_dirty(leaf); 804 805 if (update_refs && disk_bytenr > 0) { 806 ret = btrfs_inc_extent_ref(trans, root, 807 disk_bytenr, num_bytes, 0, 808 root->root_key.objectid, 809 new_key.objectid, 810 start - extent_offset, 0); 811 BUG_ON(ret); /* -ENOMEM */ 812 } 813 key.offset = start; 814 } 815 /* 816 * | ---- range to drop ----- | 817 * | -------- extent -------- | 818 */ 819 if (start <= key.offset && end < extent_end) { 820 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 821 822 memcpy(&new_key, &key, sizeof(new_key)); 823 new_key.offset = end; 824 btrfs_set_item_key_safe(root, path, &new_key); 825 826 extent_offset += end - key.offset; 827 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 828 btrfs_set_file_extent_num_bytes(leaf, fi, 829 extent_end - end); 830 btrfs_mark_buffer_dirty(leaf); 831 if (update_refs && disk_bytenr > 0) 832 inode_sub_bytes(inode, end - key.offset); 833 break; 834 } 835 836 search_start = extent_end; 837 /* 838 * | ---- range to drop ----- | 839 * | -------- extent -------- | 840 */ 841 if (start > key.offset && end >= extent_end) { 842 BUG_ON(del_nr > 0); 843 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 844 845 btrfs_set_file_extent_num_bytes(leaf, fi, 846 start - key.offset); 847 btrfs_mark_buffer_dirty(leaf); 848 if (update_refs && disk_bytenr > 0) 849 inode_sub_bytes(inode, extent_end - start); 850 if (end == extent_end) 851 break; 852 853 path->slots[0]++; 854 goto next_slot; 855 } 856 857 /* 858 * | ---- range to drop ----- | 859 * | ------ extent ------ | 860 */ 861 if (start <= key.offset && end >= extent_end) { 862 if (del_nr == 0) { 863 del_slot = path->slots[0]; 864 del_nr = 1; 865 } else { 866 BUG_ON(del_slot + del_nr != path->slots[0]); 867 del_nr++; 868 } 869 870 if (update_refs && 871 extent_type == BTRFS_FILE_EXTENT_INLINE) { 872 inode_sub_bytes(inode, 873 extent_end - key.offset); 874 extent_end = ALIGN(extent_end, 875 root->sectorsize); 876 } else if (update_refs && disk_bytenr > 0) { 877 ret = btrfs_free_extent(trans, root, 878 disk_bytenr, num_bytes, 0, 879 root->root_key.objectid, 880 key.objectid, key.offset - 881 extent_offset, 0); 882 BUG_ON(ret); /* -ENOMEM */ 883 inode_sub_bytes(inode, 884 extent_end - key.offset); 885 } 886 887 if (end == extent_end) 888 break; 889 890 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { 891 path->slots[0]++; 892 goto next_slot; 893 } 894 895 ret = btrfs_del_items(trans, root, path, del_slot, 896 del_nr); 897 if (ret) { 898 btrfs_abort_transaction(trans, root, ret); 899 break; 900 } 901 902 del_nr = 0; 903 del_slot = 0; 904 905 btrfs_release_path(path); 906 continue; 907 } 908 909 BUG_ON(1); 910 } 911 912 if (!ret && del_nr > 0) { 913 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 914 if (ret) 915 btrfs_abort_transaction(trans, root, ret); 916 } 917 918 if (drop_end) 919 *drop_end = found ? 
min(end, extent_end) : end; 920 btrfs_release_path(path); 921 return ret; 922 } 923 924 int btrfs_drop_extents(struct btrfs_trans_handle *trans, 925 struct btrfs_root *root, struct inode *inode, u64 start, 926 u64 end, int drop_cache) 927 { 928 struct btrfs_path *path; 929 int ret; 930 931 path = btrfs_alloc_path(); 932 if (!path) 933 return -ENOMEM; 934 ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, 935 drop_cache); 936 btrfs_free_path(path); 937 return ret; 938 } 939 940 static int extent_mergeable(struct extent_buffer *leaf, int slot, 941 u64 objectid, u64 bytenr, u64 orig_offset, 942 u64 *start, u64 *end) 943 { 944 struct btrfs_file_extent_item *fi; 945 struct btrfs_key key; 946 u64 extent_end; 947 948 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 949 return 0; 950 951 btrfs_item_key_to_cpu(leaf, &key, slot); 952 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) 953 return 0; 954 955 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 956 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || 957 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || 958 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || 959 btrfs_file_extent_compression(leaf, fi) || 960 btrfs_file_extent_encryption(leaf, fi) || 961 btrfs_file_extent_other_encoding(leaf, fi)) 962 return 0; 963 964 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 965 if ((*start && *start != key.offset) || (*end && *end != extent_end)) 966 return 0; 967 968 *start = key.offset; 969 *end = extent_end; 970 return 1; 971 } 972 973 /* 974 * Mark extent in the range start - end as written. 975 * 976 * This changes extent type from 'pre-allocated' to 'regular'. If only 977 * part of extent is marked as written, the extent will be split into 978 * two or three. 
979 */ 980 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 981 struct inode *inode, u64 start, u64 end) 982 { 983 struct btrfs_root *root = BTRFS_I(inode)->root; 984 struct extent_buffer *leaf; 985 struct btrfs_path *path; 986 struct btrfs_file_extent_item *fi; 987 struct btrfs_key key; 988 struct btrfs_key new_key; 989 u64 bytenr; 990 u64 num_bytes; 991 u64 extent_end; 992 u64 orig_offset; 993 u64 other_start; 994 u64 other_end; 995 u64 split; 996 int del_nr = 0; 997 int del_slot = 0; 998 int recow; 999 int ret; 1000 u64 ino = btrfs_ino(inode); 1001 1002 path = btrfs_alloc_path(); 1003 if (!path) 1004 return -ENOMEM; 1005 again: 1006 recow = 0; 1007 split = start; 1008 key.objectid = ino; 1009 key.type = BTRFS_EXTENT_DATA_KEY; 1010 key.offset = split; 1011 1012 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1013 if (ret < 0) 1014 goto out; 1015 if (ret > 0 && path->slots[0] > 0) 1016 path->slots[0]--; 1017 1018 leaf = path->nodes[0]; 1019 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1020 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); 1021 fi = btrfs_item_ptr(leaf, path->slots[0], 1022 struct btrfs_file_extent_item); 1023 BUG_ON(btrfs_file_extent_type(leaf, fi) != 1024 BTRFS_FILE_EXTENT_PREALLOC); 1025 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 1026 BUG_ON(key.offset > start || extent_end < end); 1027 1028 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1029 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1030 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); 1031 memcpy(&new_key, &key, sizeof(new_key)); 1032 1033 if (start == key.offset && end < extent_end) { 1034 other_start = 0; 1035 other_end = start; 1036 if (extent_mergeable(leaf, path->slots[0] - 1, 1037 ino, bytenr, orig_offset, 1038 &other_start, &other_end)) { 1039 new_key.offset = end; 1040 btrfs_set_item_key_safe(root, path, &new_key); 1041 fi = btrfs_item_ptr(leaf, path->slots[0], 1042 struct btrfs_file_extent_item); 1043 btrfs_set_file_extent_generation(leaf, fi, 1044 trans->transid); 1045 btrfs_set_file_extent_num_bytes(leaf, fi, 1046 extent_end - end); 1047 btrfs_set_file_extent_offset(leaf, fi, 1048 end - orig_offset); 1049 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1050 struct btrfs_file_extent_item); 1051 btrfs_set_file_extent_generation(leaf, fi, 1052 trans->transid); 1053 btrfs_set_file_extent_num_bytes(leaf, fi, 1054 end - other_start); 1055 btrfs_mark_buffer_dirty(leaf); 1056 goto out; 1057 } 1058 } 1059 1060 if (start > key.offset && end == extent_end) { 1061 other_start = end; 1062 other_end = 0; 1063 if (extent_mergeable(leaf, path->slots[0] + 1, 1064 ino, bytenr, orig_offset, 1065 &other_start, &other_end)) { 1066 fi = btrfs_item_ptr(leaf, path->slots[0], 1067 struct btrfs_file_extent_item); 1068 btrfs_set_file_extent_num_bytes(leaf, fi, 1069 start - key.offset); 1070 btrfs_set_file_extent_generation(leaf, fi, 1071 trans->transid); 1072 path->slots[0]++; 1073 new_key.offset = start; 1074 btrfs_set_item_key_safe(root, path, &new_key); 1075 1076 fi = btrfs_item_ptr(leaf, path->slots[0], 1077 struct btrfs_file_extent_item); 1078 btrfs_set_file_extent_generation(leaf, fi, 1079 trans->transid); 1080 btrfs_set_file_extent_num_bytes(leaf, fi, 1081 other_end - start); 1082 btrfs_set_file_extent_offset(leaf, fi, 1083 start - orig_offset); 1084 btrfs_mark_buffer_dirty(leaf); 1085 goto out; 1086 } 1087 } 1088 1089 while (start > key.offset || end < extent_end) { 1090 if (key.offset == start) 1091 split = end; 1092 1093 
new_key.offset = split; 1094 ret = btrfs_duplicate_item(trans, root, path, &new_key); 1095 if (ret == -EAGAIN) { 1096 btrfs_release_path(path); 1097 goto again; 1098 } 1099 if (ret < 0) { 1100 btrfs_abort_transaction(trans, root, ret); 1101 goto out; 1102 } 1103 1104 leaf = path->nodes[0]; 1105 fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 1106 struct btrfs_file_extent_item); 1107 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1108 btrfs_set_file_extent_num_bytes(leaf, fi, 1109 split - key.offset); 1110 1111 fi = btrfs_item_ptr(leaf, path->slots[0], 1112 struct btrfs_file_extent_item); 1113 1114 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1115 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 1116 btrfs_set_file_extent_num_bytes(leaf, fi, 1117 extent_end - split); 1118 btrfs_mark_buffer_dirty(leaf); 1119 1120 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1121 root->root_key.objectid, 1122 ino, orig_offset, 0); 1123 BUG_ON(ret); /* -ENOMEM */ 1124 1125 if (split == start) { 1126 key.offset = start; 1127 } else { 1128 BUG_ON(start != key.offset); 1129 path->slots[0]--; 1130 extent_end = end; 1131 } 1132 recow = 1; 1133 } 1134 1135 other_start = end; 1136 other_end = 0; 1137 if (extent_mergeable(leaf, path->slots[0] + 1, 1138 ino, bytenr, orig_offset, 1139 &other_start, &other_end)) { 1140 if (recow) { 1141 btrfs_release_path(path); 1142 goto again; 1143 } 1144 extent_end = other_end; 1145 del_slot = path->slots[0] + 1; 1146 del_nr++; 1147 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1148 0, root->root_key.objectid, 1149 ino, orig_offset, 0); 1150 BUG_ON(ret); /* -ENOMEM */ 1151 } 1152 other_start = 0; 1153 other_end = start; 1154 if (extent_mergeable(leaf, path->slots[0] - 1, 1155 ino, bytenr, orig_offset, 1156 &other_start, &other_end)) { 1157 if (recow) { 1158 btrfs_release_path(path); 1159 goto again; 1160 } 1161 key.offset = other_start; 1162 del_slot = path->slots[0]; 1163 del_nr++; 1164 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1165 0, root->root_key.objectid, 1166 ino, orig_offset, 0); 1167 BUG_ON(ret); /* -ENOMEM */ 1168 } 1169 if (del_nr == 0) { 1170 fi = btrfs_item_ptr(leaf, path->slots[0], 1171 struct btrfs_file_extent_item); 1172 btrfs_set_file_extent_type(leaf, fi, 1173 BTRFS_FILE_EXTENT_REG); 1174 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1175 btrfs_mark_buffer_dirty(leaf); 1176 } else { 1177 fi = btrfs_item_ptr(leaf, del_slot - 1, 1178 struct btrfs_file_extent_item); 1179 btrfs_set_file_extent_type(leaf, fi, 1180 BTRFS_FILE_EXTENT_REG); 1181 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1182 btrfs_set_file_extent_num_bytes(leaf, fi, 1183 extent_end - key.offset); 1184 btrfs_mark_buffer_dirty(leaf); 1185 1186 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 1187 if (ret < 0) { 1188 btrfs_abort_transaction(trans, root, ret); 1189 goto out; 1190 } 1191 } 1192 out: 1193 btrfs_free_path(path); 1194 return 0; 1195 } 1196 1197 /* 1198 * on error we return an unlocked page and the error value 1199 * on success we return a locked page and 0 1200 */ 1201 static int prepare_uptodate_page(struct page *page, u64 pos, 1202 bool force_uptodate) 1203 { 1204 int ret = 0; 1205 1206 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && 1207 !PageUptodate(page)) { 1208 ret = btrfs_readpage(NULL, page); 1209 if (ret) 1210 return ret; 1211 lock_page(page); 1212 if (!PageUptodate(page)) { 1213 unlock_page(page); 1214 return -EIO; 1215 } 1216 } 1217 return 0; 1218 } 1219 1220 
/* 1221 * this gets pages into the page cache and locks them down, it also properly 1222 * waits for data=ordered extents to finish before allowing the pages to be 1223 * modified. 1224 */ 1225 static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1226 struct page **pages, size_t num_pages, 1227 loff_t pos, unsigned long first_index, 1228 size_t write_bytes, bool force_uptodate) 1229 { 1230 struct extent_state *cached_state = NULL; 1231 int i; 1232 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1233 struct inode *inode = file_inode(file); 1234 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1235 int err = 0; 1236 int faili = 0; 1237 u64 start_pos; 1238 u64 last_pos; 1239 1240 start_pos = pos & ~((u64)root->sectorsize - 1); 1241 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1242 1243 again: 1244 for (i = 0; i < num_pages; i++) { 1245 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1246 mask | __GFP_WRITE); 1247 if (!pages[i]) { 1248 faili = i - 1; 1249 err = -ENOMEM; 1250 goto fail; 1251 } 1252 1253 if (i == 0) 1254 err = prepare_uptodate_page(pages[i], pos, 1255 force_uptodate); 1256 if (i == num_pages - 1) 1257 err = prepare_uptodate_page(pages[i], 1258 pos + write_bytes, false); 1259 if (err) { 1260 page_cache_release(pages[i]); 1261 faili = i - 1; 1262 goto fail; 1263 } 1264 wait_on_page_writeback(pages[i]); 1265 } 1266 err = 0; 1267 if (start_pos < inode->i_size) { 1268 struct btrfs_ordered_extent *ordered; 1269 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1270 start_pos, last_pos - 1, 0, &cached_state); 1271 ordered = btrfs_lookup_first_ordered_extent(inode, 1272 last_pos - 1); 1273 if (ordered && 1274 ordered->file_offset + ordered->len > start_pos && 1275 ordered->file_offset < last_pos) { 1276 btrfs_put_ordered_extent(ordered); 1277 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1278 start_pos, last_pos - 1, 1279 &cached_state, GFP_NOFS); 1280 for (i = 0; i < num_pages; i++) { 1281 unlock_page(pages[i]); 1282 page_cache_release(pages[i]); 1283 } 1284 btrfs_wait_ordered_range(inode, start_pos, 1285 last_pos - start_pos); 1286 goto again; 1287 } 1288 if (ordered) 1289 btrfs_put_ordered_extent(ordered); 1290 1291 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, 1292 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1293 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1294 0, 0, &cached_state, GFP_NOFS); 1295 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1296 start_pos, last_pos - 1, &cached_state, 1297 GFP_NOFS); 1298 } 1299 for (i = 0; i < num_pages; i++) { 1300 if (clear_page_dirty_for_io(pages[i])) 1301 account_page_redirty(pages[i]); 1302 set_page_extent_mapped(pages[i]); 1303 WARN_ON(!PageLocked(pages[i])); 1304 } 1305 return 0; 1306 fail: 1307 while (faili >= 0) { 1308 unlock_page(pages[faili]); 1309 page_cache_release(pages[faili]); 1310 faili--; 1311 } 1312 return err; 1313 1314 } 1315 1316 static noinline int check_can_nocow(struct inode *inode, loff_t pos, 1317 size_t *write_bytes) 1318 { 1319 struct btrfs_trans_handle *trans; 1320 struct btrfs_root *root = BTRFS_I(inode)->root; 1321 struct btrfs_ordered_extent *ordered; 1322 u64 lockstart, lockend; 1323 u64 num_bytes; 1324 int ret; 1325 1326 lockstart = round_down(pos, root->sectorsize); 1327 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; 1328 1329 while (1) { 1330 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1331 ordered = btrfs_lookup_ordered_range(inode, lockstart, 1332 lockend - lockstart + 1); 1333 if (!ordered) { 1334 break; 1335 } 1336 
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1337 btrfs_start_ordered_extent(inode, ordered, 1); 1338 btrfs_put_ordered_extent(ordered); 1339 } 1340 1341 trans = btrfs_join_transaction(root); 1342 if (IS_ERR(trans)) { 1343 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1344 return PTR_ERR(trans); 1345 } 1346 1347 num_bytes = lockend - lockstart + 1; 1348 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL, 1349 NULL); 1350 btrfs_end_transaction(trans, root); 1351 if (ret <= 0) { 1352 ret = 0; 1353 } else { 1354 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 1355 EXTENT_DIRTY | EXTENT_DELALLOC | 1356 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, 1357 NULL, GFP_NOFS); 1358 *write_bytes = min_t(size_t, *write_bytes, num_bytes); 1359 } 1360 1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); 1362 1363 return ret; 1364 } 1365 1366 static noinline ssize_t __btrfs_buffered_write(struct file *file, 1367 struct iov_iter *i, 1368 loff_t pos) 1369 { 1370 struct inode *inode = file_inode(file); 1371 struct btrfs_root *root = BTRFS_I(inode)->root; 1372 struct page **pages = NULL; 1373 u64 release_bytes = 0; 1374 unsigned long first_index; 1375 size_t num_written = 0; 1376 int nrptrs; 1377 int ret = 0; 1378 bool only_release_metadata = false; 1379 bool force_page_uptodate = false; 1380 1381 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1382 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1383 (sizeof(struct page *))); 1384 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1385 nrptrs = max(nrptrs, 8); 1386 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1387 if (!pages) 1388 return -ENOMEM; 1389 1390 first_index = pos >> PAGE_CACHE_SHIFT; 1391 1392 while (iov_iter_count(i) > 0) { 1393 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1394 size_t write_bytes = min(iov_iter_count(i), 1395 nrptrs * (size_t)PAGE_CACHE_SIZE - 1396 offset); 1397 size_t num_pages = (write_bytes + offset + 1398 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1399 size_t reserve_bytes; 1400 size_t dirty_pages; 1401 size_t copied; 1402 1403 WARN_ON(num_pages > nrptrs); 1404 1405 /* 1406 * Fault pages before locking them in prepare_pages 1407 * to avoid recursive lock 1408 */ 1409 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { 1410 ret = -EFAULT; 1411 break; 1412 } 1413 1414 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1415 ret = btrfs_check_data_free_space(inode, reserve_bytes); 1416 if (ret == -ENOSPC && 1417 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1418 BTRFS_INODE_PREALLOC))) { 1419 ret = check_can_nocow(inode, pos, &write_bytes); 1420 if (ret > 0) { 1421 only_release_metadata = true; 1422 /* 1423 * our prealloc extent may be smaller than 1424 * write_bytes, so scale down. 
1425 */ 1426 num_pages = (write_bytes + offset + 1427 PAGE_CACHE_SIZE - 1) >> 1428 PAGE_CACHE_SHIFT; 1429 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1430 ret = 0; 1431 } else { 1432 ret = -ENOSPC; 1433 } 1434 } 1435 1436 if (ret) 1437 break; 1438 1439 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); 1440 if (ret) { 1441 if (!only_release_metadata) 1442 btrfs_free_reserved_data_space(inode, 1443 reserve_bytes); 1444 break; 1445 } 1446 1447 release_bytes = reserve_bytes; 1448 1449 /* 1450 * This is going to setup the pages array with the number of 1451 * pages we want, so we don't really need to worry about the 1452 * contents of pages from loop to loop 1453 */ 1454 ret = prepare_pages(root, file, pages, num_pages, 1455 pos, first_index, write_bytes, 1456 force_page_uptodate); 1457 if (ret) 1458 break; 1459 1460 copied = btrfs_copy_from_user(pos, num_pages, 1461 write_bytes, pages, i); 1462 1463 /* 1464 * if we have trouble faulting in the pages, fall 1465 * back to one page at a time 1466 */ 1467 if (copied < write_bytes) 1468 nrptrs = 1; 1469 1470 if (copied == 0) { 1471 force_page_uptodate = true; 1472 dirty_pages = 0; 1473 } else { 1474 force_page_uptodate = false; 1475 dirty_pages = (copied + offset + 1476 PAGE_CACHE_SIZE - 1) >> 1477 PAGE_CACHE_SHIFT; 1478 } 1479 1480 /* 1481 * If we had a short copy we need to release the excess delaloc 1482 * bytes we reserved. We need to increment outstanding_extents 1483 * because btrfs_delalloc_release_space will decrement it, but 1484 * we still have an outstanding extent for the chunk we actually 1485 * managed to copy. 1486 */ 1487 if (num_pages > dirty_pages) { 1488 release_bytes = (num_pages - dirty_pages) << 1489 PAGE_CACHE_SHIFT; 1490 if (copied > 0) { 1491 spin_lock(&BTRFS_I(inode)->lock); 1492 BTRFS_I(inode)->outstanding_extents++; 1493 spin_unlock(&BTRFS_I(inode)->lock); 1494 } 1495 if (only_release_metadata) 1496 btrfs_delalloc_release_metadata(inode, 1497 release_bytes); 1498 else 1499 btrfs_delalloc_release_space(inode, 1500 release_bytes); 1501 } 1502 1503 release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1504 if (copied > 0) { 1505 ret = btrfs_dirty_pages(root, inode, pages, 1506 dirty_pages, pos, copied, 1507 NULL); 1508 if (ret) { 1509 btrfs_drop_pages(pages, num_pages); 1510 break; 1511 } 1512 } 1513 1514 release_bytes = 0; 1515 btrfs_drop_pages(pages, num_pages); 1516 1517 if (only_release_metadata && copied > 0) { 1518 u64 lockstart = round_down(pos, root->sectorsize); 1519 u64 lockend = lockstart + 1520 (dirty_pages << PAGE_CACHE_SHIFT) - 1; 1521 1522 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1523 lockend, EXTENT_NORESERVE, NULL, 1524 NULL, GFP_NOFS); 1525 only_release_metadata = false; 1526 } 1527 1528 cond_resched(); 1529 1530 balance_dirty_pages_ratelimited(inode->i_mapping); 1531 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1532 btrfs_btree_balance_dirty(root); 1533 1534 pos += copied; 1535 num_written += copied; 1536 } 1537 1538 kfree(pages); 1539 1540 if (release_bytes) { 1541 if (only_release_metadata) 1542 btrfs_delalloc_release_metadata(inode, release_bytes); 1543 else 1544 btrfs_delalloc_release_space(inode, release_bytes); 1545 } 1546 1547 return num_written ? 
num_written : ret; 1548 } 1549 1550 static ssize_t __btrfs_direct_write(struct kiocb *iocb, 1551 const struct iovec *iov, 1552 unsigned long nr_segs, loff_t pos, 1553 loff_t *ppos, size_t count, size_t ocount) 1554 { 1555 struct file *file = iocb->ki_filp; 1556 struct iov_iter i; 1557 ssize_t written; 1558 ssize_t written_buffered; 1559 loff_t endbyte; 1560 int err; 1561 1562 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, 1563 count, ocount); 1564 1565 if (written < 0 || written == count) 1566 return written; 1567 1568 pos += written; 1569 count -= written; 1570 iov_iter_init(&i, iov, nr_segs, count, written); 1571 written_buffered = __btrfs_buffered_write(file, &i, pos); 1572 if (written_buffered < 0) { 1573 err = written_buffered; 1574 goto out; 1575 } 1576 endbyte = pos + written_buffered - 1; 1577 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1578 if (err) 1579 goto out; 1580 written += written_buffered; 1581 *ppos = pos + written_buffered; 1582 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, 1583 endbyte >> PAGE_CACHE_SHIFT); 1584 out: 1585 return written ? written : err; 1586 } 1587 1588 static void update_time_for_write(struct inode *inode) 1589 { 1590 struct timespec now; 1591 1592 if (IS_NOCMTIME(inode)) 1593 return; 1594 1595 now = current_fs_time(inode->i_sb); 1596 if (!timespec_equal(&inode->i_mtime, &now)) 1597 inode->i_mtime = now; 1598 1599 if (!timespec_equal(&inode->i_ctime, &now)) 1600 inode->i_ctime = now; 1601 1602 if (IS_I_VERSION(inode)) 1603 inode_inc_iversion(inode); 1604 } 1605 1606 static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1607 const struct iovec *iov, 1608 unsigned long nr_segs, loff_t pos) 1609 { 1610 struct file *file = iocb->ki_filp; 1611 struct inode *inode = file_inode(file); 1612 struct btrfs_root *root = BTRFS_I(inode)->root; 1613 loff_t *ppos = &iocb->ki_pos; 1614 u64 start_pos; 1615 ssize_t num_written = 0; 1616 ssize_t err = 0; 1617 size_t count, ocount; 1618 bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); 1619 1620 mutex_lock(&inode->i_mutex); 1621 1622 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1623 if (err) { 1624 mutex_unlock(&inode->i_mutex); 1625 goto out; 1626 } 1627 count = ocount; 1628 1629 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1630 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1631 if (err) { 1632 mutex_unlock(&inode->i_mutex); 1633 goto out; 1634 } 1635 1636 if (count == 0) { 1637 mutex_unlock(&inode->i_mutex); 1638 goto out; 1639 } 1640 1641 err = file_remove_suid(file); 1642 if (err) { 1643 mutex_unlock(&inode->i_mutex); 1644 goto out; 1645 } 1646 1647 /* 1648 * If BTRFS flips readonly due to some impossible error 1649 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), 1650 * although we have opened a file as writable, we have 1651 * to stop this write operation to ensure FS consistency. 1652 */ 1653 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { 1654 mutex_unlock(&inode->i_mutex); 1655 err = -EROFS; 1656 goto out; 1657 } 1658 1659 /* 1660 * We reserve space for updating the inode when we reserve space for the 1661 * extent we are going to write, so we will enospc out there. We don't 1662 * need to start yet another transaction to update the inode as we will 1663 * update the inode when we finish writing whatever data we write. 
1664 */ 1665 update_time_for_write(inode); 1666 1667 start_pos = round_down(pos, root->sectorsize); 1668 if (start_pos > i_size_read(inode)) { 1669 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); 1670 if (err) { 1671 mutex_unlock(&inode->i_mutex); 1672 goto out; 1673 } 1674 } 1675 1676 if (sync) 1677 atomic_inc(&BTRFS_I(inode)->sync_writers); 1678 1679 if (unlikely(file->f_flags & O_DIRECT)) { 1680 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1681 pos, ppos, count, ocount); 1682 } else { 1683 struct iov_iter i; 1684 1685 iov_iter_init(&i, iov, nr_segs, count, num_written); 1686 1687 num_written = __btrfs_buffered_write(file, &i, pos); 1688 if (num_written > 0) 1689 *ppos = pos + num_written; 1690 } 1691 1692 mutex_unlock(&inode->i_mutex); 1693 1694 /* 1695 * we want to make sure fsync finds this change 1696 * but we haven't joined a transaction running right now. 1697 * 1698 * Later on, someone is sure to update the inode and get the 1699 * real transid recorded. 1700 * 1701 * We set last_trans now to the fs_info generation + 1, 1702 * this will either be one more than the running transaction 1703 * or the generation used for the next transaction if there isn't 1704 * one running right now. 1705 * 1706 * We also have to set last_sub_trans to the current log transid, 1707 * otherwise subsequent syncs to a file that's been synced in this 1708 * transaction will appear to have already occured. 1709 */ 1710 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1711 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1712 if (num_written > 0 || num_written == -EIOCBQUEUED) { 1713 err = generic_write_sync(file, pos, num_written); 1714 if (err < 0 && num_written > 0) 1715 num_written = err; 1716 } 1717 1718 if (sync) 1719 atomic_dec(&BTRFS_I(inode)->sync_writers); 1720 out: 1721 current->backing_dev_info = NULL; 1722 return num_written ? num_written : err; 1723 } 1724 1725 int btrfs_release_file(struct inode *inode, struct file *filp) 1726 { 1727 /* 1728 * ordered_data_close is set by settattr when we are about to truncate 1729 * a file from a non-zero size to a zero size. This tries to 1730 * flush down new bytes that may have been written if the 1731 * application were using truncate to replace a file in place. 1732 */ 1733 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1734 &BTRFS_I(inode)->runtime_flags)) { 1735 struct btrfs_trans_handle *trans; 1736 struct btrfs_root *root = BTRFS_I(inode)->root; 1737 1738 /* 1739 * We need to block on a committing transaction to keep us from 1740 * throwing a ordered operation on to the list and causing 1741 * something like sync to deadlock trying to flush out this 1742 * inode. 1743 */ 1744 trans = btrfs_start_transaction(root, 0); 1745 if (IS_ERR(trans)) 1746 return PTR_ERR(trans); 1747 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); 1748 btrfs_end_transaction(trans, root); 1749 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1750 filemap_flush(inode->i_mapping); 1751 } 1752 if (filp->private_data) 1753 btrfs_ioctl_trans_end(filp); 1754 return 0; 1755 } 1756 1757 /* 1758 * fsync call for both files and directories. This logs the inode into 1759 * the tree log instead of forcing full commits whenever possible. 1760 * 1761 * It needs to call filemap_fdatawait so that all ordered extent updates are 1762 * in the metadata btree are up to date for copying to the log. 1763 * 1764 * It drops the inode mutex before doing the tree log commit. 
This is an 1765 * important optimization for directories because holding the mutex prevents 1766 * new operations on the dir while we write to disk. 1767 */ 1768 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 1769 { 1770 struct dentry *dentry = file->f_path.dentry; 1771 struct inode *inode = dentry->d_inode; 1772 struct btrfs_root *root = BTRFS_I(inode)->root; 1773 int ret = 0; 1774 struct btrfs_trans_handle *trans; 1775 bool full_sync = 0; 1776 1777 trace_btrfs_sync_file(file, datasync); 1778 1779 /* 1780 * We write the dirty pages in the range and wait for them to complete 1781 * outside of the ->i_mutex, so the dirty pages can be flushed by 1782 * multiple tasks, which improves performance. See 1783 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 1784 */ 1785 atomic_inc(&BTRFS_I(inode)->sync_writers); 1786 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1787 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1788 &BTRFS_I(inode)->runtime_flags)) 1789 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1790 atomic_dec(&BTRFS_I(inode)->sync_writers); 1791 if (ret) 1792 return ret; 1793 1794 mutex_lock(&inode->i_mutex); 1795 1796 /* 1797 * We flush the dirty pages again to avoid any dirty pages in the 1798 * range being left behind. 1799 */ 1800 atomic_inc(&root->log_batch); 1801 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1802 &BTRFS_I(inode)->runtime_flags); 1803 if (full_sync) 1804 btrfs_wait_ordered_range(inode, start, end - start + 1); 1805 atomic_inc(&root->log_batch); 1806 1807 /* 1808 * check the transaction that last modified this inode 1809 * and see if it's already been committed 1810 */ 1811 if (!BTRFS_I(inode)->last_trans) { 1812 mutex_unlock(&inode->i_mutex); 1813 goto out; 1814 } 1815 1816 /* 1817 * if the last transaction that changed this file was before 1818 * the current transaction, we can bail out now without any 1819 * syncing 1820 */ 1821 smp_mb(); 1822 if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1823 BTRFS_I(inode)->last_trans <= 1824 root->fs_info->last_trans_committed) { 1825 BTRFS_I(inode)->last_trans = 0; 1826 1827 /* 1828 * We've had everything committed since the last time we were 1829 * modified, so clear this flag in case it was set for whatever 1830 * reason; it's no longer relevant. 1831 */ 1832 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1833 &BTRFS_I(inode)->runtime_flags); 1834 mutex_unlock(&inode->i_mutex); 1835 goto out; 1836 } 1837 1838 /* 1839 * ok, we haven't committed the transaction yet, let's do a commit 1840 */ 1841 if (file->private_data) 1842 btrfs_ioctl_trans_end(file); 1843 1844 trans = btrfs_start_transaction(root, 0); 1845 if (IS_ERR(trans)) { 1846 ret = PTR_ERR(trans); 1847 mutex_unlock(&inode->i_mutex); 1848 goto out; 1849 } 1850 1851 ret = btrfs_log_dentry_safe(trans, root, dentry); 1852 if (ret < 0) { 1853 mutex_unlock(&inode->i_mutex); 1854 goto out; 1855 } 1856 1857 /* we've logged all the items and now have a consistent 1858 * version of the file in the log. It is possible that 1859 * someone will come in and modify the file, but that's 1860 * fine because the log is consistent on disk, and we 1861 * have references to all of the file's extents 1862 * 1863 * It is possible that someone will come in and log the 1864 * file again, but that will end up using the synchronization 1865 * inside btrfs_sync_log to keep things safe.
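* Below we only sync the log when btrfs_log_dentry_safe() did not return BTRFS_NO_LOG_SYNC; if the inode could not be logged (ret > 0) or the log sync itself fails, we fall back to committing the whole transaction.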
1866 */ 1867 mutex_unlock(&inode->i_mutex); 1868 1869 if (ret != BTRFS_NO_LOG_SYNC) { 1870 if (ret > 0) { 1871 /* 1872 * If we didn't already wait for ordered extents we need 1873 * to do that now. 1874 */ 1875 if (!full_sync) 1876 btrfs_wait_ordered_range(inode, start, 1877 end - start + 1); 1878 ret = btrfs_commit_transaction(trans, root); 1879 } else { 1880 ret = btrfs_sync_log(trans, root); 1881 if (ret == 0) { 1882 ret = btrfs_end_transaction(trans, root); 1883 } else { 1884 if (!full_sync) 1885 btrfs_wait_ordered_range(inode, start, 1886 end - 1887 start + 1); 1888 ret = btrfs_commit_transaction(trans, root); 1889 } 1890 } 1891 } else { 1892 ret = btrfs_end_transaction(trans, root); 1893 } 1894 out: 1895 return ret > 0 ? -EIO : ret; 1896 } 1897 1898 static const struct vm_operations_struct btrfs_file_vm_ops = { 1899 .fault = filemap_fault, 1900 .page_mkwrite = btrfs_page_mkwrite, 1901 .remap_pages = generic_file_remap_pages, 1902 }; 1903 1904 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1905 { 1906 struct address_space *mapping = filp->f_mapping; 1907 1908 if (!mapping->a_ops->readpage) 1909 return -ENOEXEC; 1910 1911 file_accessed(filp); 1912 vma->vm_ops = &btrfs_file_vm_ops; 1913 1914 return 0; 1915 } 1916 1917 static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, 1918 int slot, u64 start, u64 end) 1919 { 1920 struct btrfs_file_extent_item *fi; 1921 struct btrfs_key key; 1922 1923 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 1924 return 0; 1925 1926 btrfs_item_key_to_cpu(leaf, &key, slot); 1927 if (key.objectid != btrfs_ino(inode) || 1928 key.type != BTRFS_EXTENT_DATA_KEY) 1929 return 0; 1930 1931 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 1932 1933 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 1934 return 0; 1935 1936 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 1937 return 0; 1938 1939 if (key.offset == end) 1940 return 1; 1941 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 1942 return 1; 1943 return 0; 1944 } 1945 1946 static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, 1947 struct btrfs_path *path, u64 offset, u64 end) 1948 { 1949 struct btrfs_root *root = BTRFS_I(inode)->root; 1950 struct extent_buffer *leaf; 1951 struct btrfs_file_extent_item *fi; 1952 struct extent_map *hole_em; 1953 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1954 struct btrfs_key key; 1955 int ret; 1956 1957 key.objectid = btrfs_ino(inode); 1958 key.type = BTRFS_EXTENT_DATA_KEY; 1959 key.offset = offset; 1960 1961 1962 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1963 if (ret < 0) 1964 return ret; 1965 BUG_ON(!ret); 1966 1967 leaf = path->nodes[0]; 1968 if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { 1969 u64 num_bytes; 1970 1971 path->slots[0]--; 1972 fi = btrfs_item_ptr(leaf, path->slots[0], 1973 struct btrfs_file_extent_item); 1974 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 1975 end - offset; 1976 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1977 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 1978 btrfs_set_file_extent_offset(leaf, fi, 0); 1979 btrfs_mark_buffer_dirty(leaf); 1980 goto out; 1981 } 1982 1983 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 1984 u64 num_bytes; 1985 1986 path->slots[0]++; 1987 key.offset = offset; 1988 btrfs_set_item_key_safe(root, path, &key); 1989 fi = btrfs_item_ptr(leaf, path->slots[0], 1990 struct btrfs_file_extent_item); 1991 num_bytes = 
static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
		      struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		return ret;
	BUG_ON(!ret);

	leaf = path->nodes[0];
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
		u64 num_bytes;

		path->slots[0]--;
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
			end - offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}

	if (hole_mergeable(inode, leaf, path->slots[0] + 1, offset, end)) {
		u64 num_bytes;

		path->slots[0]++;
		key.offset = offset;
		btrfs_set_item_key_safe(root, path, &key);
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
			offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}
	btrfs_release_path(path);

	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
				       0, 0, end - offset, 0, end - offset,
				       0, 0, 0);
	if (ret)
		return ret;

out:
	btrfs_release_path(path);

	hole_em = alloc_extent_map();
	if (!hole_em) {
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);
	} else {
		hole_em->start = offset;
		hole_em->len = end - offset;
		hole_em->ram_bytes = hole_em->len;
		hole_em->orig_start = offset;

		hole_em->block_start = EXTENT_MAP_HOLE;
		hole_em->block_len = 0;
		hole_em->orig_block_len = 0;
		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
		hole_em->compress_type = BTRFS_COMPRESS_NONE;
		hole_em->generation = trans->transid;

		do {
			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, hole_em, 1);
			write_unlock(&em_tree->lock);
		} while (ret == -EEXIST);
		free_extent_map(hole_em);
		if (ret)
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
	}

	return 0;
}
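/*
 * Punch a hole over [offset, offset + len): the page cache for the range is
 * truncated, the file extents covering it are dropped and replaced with hole
 * extents via fill_holes(), partial pages at either end are zeroed in place,
 * and i_size is left alone.
 *
 * For illustration only (not part of this file), a userspace caller normally
 * reaches this path through fallocate(2), e.g.:
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, len);
 *
 * which the VFS hands to btrfs_fallocate() below.
 */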
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_block_rsv *rsv;
	struct btrfs_trans_handle *trans;
	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
	u64 lockend = round_down(offset + len,
				 BTRFS_I(inode)->root->sectorsize) - 1;
	u64 cur_offset = lockstart;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
	u64 drop_end;
	int ret = 0;
	int err = 0;
	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));

	btrfs_wait_ordered_range(inode, offset, len);

	mutex_lock(&inode->i_mutex);
	/*
	 * We don't need to truncate any page which is beyond the end of the
	 * file because we are sure there is no data there.  Only take this
	 * shortcut if the range sits within a single page and doesn't cover
	 * the entire page.
	 */
	if (same_page && len < PAGE_CACHE_SIZE) {
		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
			ret = btrfs_truncate_page(inode, offset, len, 0);
		mutex_unlock(&inode->i_mutex);
		return ret;
	}

	/* zero back part of the first page */
	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
		ret = btrfs_truncate_page(inode, offset, 0, 0);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	/* zero the front end of the last page */
	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
		ret = btrfs_truncate_page(inode, offset + len, 0, 1);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	if (lockend < lockstart) {
		mutex_unlock(&inode->i_mutex);
		return 0;
	}

	while (1) {
		struct btrfs_ordered_extent *ordered;

		truncate_pagecache_range(inode, lockstart, lockend);

		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 0, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);

		/*
		 * We need to make sure we have no ordered extents in this
		 * range and that nobody raced in and read a page in this
		 * range.  If they did, we need to try again.
		 */
		if ((!ordered ||
		     (ordered->file_offset + ordered->len < lockstart ||
		      ordered->file_offset > lockend)) &&
		    !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
				    lockend, EXTENT_UPTODATE, 0,
				    cached_state)) {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state, GFP_NOFS);
		btrfs_wait_ordered_range(inode, lockstart,
					 lockend - lockstart + 1);
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv) {
		ret = -ENOMEM;
		goto out_free;
	}
	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
	rsv->failfast = 1;

	/*
	 * 1 - update the inode
	 * 1 - removing the extents in the range
	 * 1 - adding the hole extent
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
				      min_size);
	BUG_ON(ret);
	trans->block_rsv = rsv;

	while (cur_offset < lockend) {
		ret = __btrfs_drop_extents(trans, root, inode, path,
					   cur_offset, lockend + 1,
					   &drop_end, 1);
		if (ret != -ENOSPC)
			break;

		trans->block_rsv = &root->fs_info->trans_block_rsv;

		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
		if (ret) {
			err = ret;
			break;
		}

		cur_offset = drop_end;

		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}

		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root);

		trans = btrfs_start_transaction(root, 3);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
					      rsv, min_size);
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;
	}

	if (ret) {
		err = ret;
		goto out_trans;
	}

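	/*
	 * The loop above stops on error or once __btrfs_drop_extents has
	 * dropped everything up to lockend; drop_end records how far it got,
	 * so fill the remaining hole from there before the final inode
	 * update.
	 */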
	trans->block_rsv = &root->fs_info->trans_block_rsv;
	ret = fill_holes(trans, inode, path, cur_offset, drop_end);
	if (ret) {
		err = ret;
		goto out_trans;
	}

out_trans:
	if (!trans)
		goto out_free;

	inode_inc_iversion(inode);
	inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	trans->block_rsv = &root->fs_info->trans_block_rsv;
	ret = btrfs_update_inode(trans, root, inode);
	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);
out_free:
	btrfs_free_path(path);
	btrfs_free_block_rsv(root, rsv);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
	mutex_unlock(&inode->i_mutex);
	if (ret && !err)
		err = ret;
	return err;
}

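/*
 * Preallocate space for [offset, offset + len).  Hole punching is handed off
 * to btrfs_punch_hole() above; otherwise we reserve data space, wait out any
 * ordered IO, and walk the extent maps allocating preallocated extents for
 * every hole (or post-EOF gap) we find.  With FALLOC_FL_KEEP_SIZE the file
 * size is not extended.
 *
 * For illustration only (not part of this file), reserving 16MiB past EOF
 * without changing i_size could look like:
 *
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, file_size, 16 * 1024 * 1024);
 */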
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->sectorsize;
	int ret;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);

	/* Make sure we aren't being given a mode we don't support */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(inode, offset, len);

	/*
	 * Make sure we have enough space before we do the
	 * allocation.
	 */
	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
	if (ret)
		return ret;
	if (root->fs_info->quota_enabled) {
		ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
		if (ret)
			goto out_reserve_fail;
	}

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the page if i_size lands in the
		 * middle of a page.
		 */
		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
		if (ret)
			goto out;
	}

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			btrfs_wait_ordered_range(inode, alloc_start,
						 alloc_end - alloc_start);
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	cur_offset = alloc_start;
	while (1) {
		u64 actual_end;

		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		if (IS_ERR_OR_NULL(em)) {
			if (!em)
				ret = -ENOMEM;
			else
				ret = PTR_ERR(em);
			break;
		}
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);

		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
							last_byte - cur_offset,
							1 << inode->i_blkbits,
							offset + len,
							&alloc_hint);

			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		} else if (actual_end > inode->i_size &&
			   !(mode & FALLOC_FL_KEEP_SIZE)) {
			/*
			 * We didn't need to allocate any more space, but we
			 * still extended the size of the file so we need to
			 * update i_size.
			 */
			inode->i_ctime = CURRENT_TIME;
			i_size_write(inode, actual_end);
			btrfs_ordered_update_i_size(inode, actual_end, NULL);
		}
		free_extent_map(em);

		cur_offset = last_byte;
		if (cur_offset >= alloc_end) {
			ret = 0;
			break;
		}
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);
out:
	mutex_unlock(&inode->i_mutex);
	if (root->fs_info->quota_enabled)
		btrfs_qgroup_free(root, alloc_end - alloc_start);
out_reserve_fail:
	/* Let go of our reservation. */
	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
	return ret;
}

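/*
 * Helper for SEEK_DATA/SEEK_HOLE: starting at *offset, walk the extent maps
 * forward until the first data extent (SEEK_DATA) or hole (SEEK_HOLE) is
 * found and record where it begins back in *offset.  Delalloc ranges look
 * like holes on disk even though they contain data, so the code below also
 * peeks at the extent just before the requested position and treats
 * EXTENT_MAP_DELALLOC specially.
 */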
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	u64 lockstart = *offset;
	u64 lockend = i_size_read(inode);
	u64 start = *offset;
	u64 orig_start = *offset;
	u64 len = i_size_read(inode);
	u64 last_end = 0;
	int ret = 0;

	lockend = max_t(u64, root->sectorsize, lockend);
	if (lockend <= lockstart)
		lockend = lockstart + root->sectorsize;

	lockend--;
	len = lockend - lockstart + 1;

	len = max_t(u64, len, root->sectorsize);
	if (inode->i_size == 0)
		return -ENXIO;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
			 &cached_state);

	/*
	 * Delalloc is such a pain.  If we have a hole and we have pending
	 * delalloc for a portion of the hole, we will get back a hole that
	 * covers the entire range since the delalloc data hasn't actually
	 * been written yet.  So to take care of this case we need to look
	 * for an extent just before the position we want in case there is
	 * outstanding delalloc going on here.
	 */
	if (whence == SEEK_HOLE && start != 0) {
		if (start <= root->sectorsize)
			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
						     root->sectorsize, 0);
		else
			em = btrfs_get_extent_fiemap(inode, NULL, 0,
						     start - root->sectorsize,
						     root->sectorsize, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		last_end = em->start + em->len;
		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);
		free_extent_map(em);
	}

	while (1) {
		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
				if (last_end <= orig_start) {
					free_extent_map(em);
					ret = -ENXIO;
					break;
				}
			}

			if (whence == SEEK_HOLE) {
				*offset = start;
				free_extent_map(em);
				break;
			}
		} else {
			if (whence == SEEK_DATA) {
				if (em->block_start == EXTENT_MAP_DELALLOC) {
					if (start >= inode->i_size) {
						free_extent_map(em);
						ret = -ENXIO;
						break;
					}
				}

				if (!test_bit(EXTENT_FLAG_PREALLOC,
					      &em->flags)) {
					*offset = start;
					free_extent_map(em);
					break;
				}
			}
		}

		start = em->start + em->len;
		last_end = em->start + em->len;

		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);

		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
			free_extent_map(em);
			ret = -ENXIO;
			break;
		}
		free_extent_map(em);
		cond_resched();
	}
	if (!ret)
		*offset = min(*offset, inode->i_size);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
	return ret;
}

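/*
 * For illustration only (not part of this file): userspace drives the
 * SEEK_DATA/SEEK_HOLE support below through lseek(2), for example to skip
 * the hole-only regions of a sparse file:
 *
 *	off_t data = lseek(fd, pos, SEEK_DATA);
 *	off_t hole = lseek(fd, data, SEEK_HOLE);
 *
 * Both calls fail with errno == ENXIO once the position is at or beyond EOF,
 * matching the i_size checks in btrfs_file_llseek() and
 * find_desired_extent().
 */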
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	mutex_lock(&inode->i_mutex);
	switch (whence) {
	case SEEK_END:
	case SEEK_CUR:
		offset = generic_file_llseek(file, offset, whence);
		goto out;
	case SEEK_DATA:
	case SEEK_HOLE:
		if (offset >= i_size_read(inode)) {
			mutex_unlock(&inode->i_mutex);
			return -ENXIO;
		}

		ret = find_desired_extent(inode, &offset, whence);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
	mutex_unlock(&inode->i_mutex);
	return offset;
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};

void btrfs_auto_defrag_exit(void)
{
	if (btrfs_inode_defrag_cachep)
		kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}