/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};
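
/*
 * inode_defrag records live in fs_info->defrag_inodes, an rbtree keyed
 * by inode number and protected by fs_info->defrag_inodes_lock.
 * btrfs_add_inode_defrag() queues an inode here when the autodefrag
 * mount option is set, and btrfs_run_defrag_inodes() later walks the
 * tree, defragging each inode one batch at a time and requeueing any
 * entry that still has work left.
 */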

/* insert a record for an inode into the defrag tree. The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
static void __btrfs_add_inode_defrag(struct inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	p = &root->fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (defrag->ino < entry->ino)
			p = &parent->rb_left;
		else if (defrag->ino > entry->ino)
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			goto exists;
		}
	}
	BTRFS_I(inode)->in_defrag = 1;
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
	return;

exists:
	kfree(defrag);
	return;

}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct inode_defrag *defrag;
	u64 transid;

	if (!btrfs_test_opt(root, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(root->fs_info))
		return 0;

	if (BTRFS_I(inode)->in_defrag)
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = BTRFS_I(inode)->root->last_trans;

	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&root->fs_info->defrag_inodes_lock);
	if (!BTRFS_I(inode)->in_defrag)
		__btrfs_add_inode_defrag(inode, defrag);
	else
		kfree(defrag);
	spin_unlock(&root->fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * must be called with the defrag_inodes lock held
 */
struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
					     u64 ino, struct rb_node **next)
{
	struct inode_defrag *entry = NULL;
	struct rb_node *p;
	struct rb_node *parent = NULL;

	p = info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		if (ino < entry->ino)
			p = parent->rb_left;
		else if (ino > entry->ino)
			p = parent->rb_right;
		else
			return entry;
	}

	if (next) {
		while (parent && ino > entry->ino) {
			parent = rb_next(parent);
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		}
		*next = parent;
	}
	return NULL;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct rb_node *n;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	u64 first_ino = 0;
	int num_defrag;
	int defrag_batch = 1024;

	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;

	atomic_inc(&fs_info->defrag_running);
	spin_lock(&fs_info->defrag_inodes_lock);
	while (1) {
		n = NULL;

		/* find an inode to defrag */
		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
		if (!defrag) {
			if (n)
				defrag = rb_entry(n, struct inode_defrag, rb_node);
			else if (first_ino) {
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		/* remove it from the rbtree */
		first_ino = defrag->ino + 1;
		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);

		if (btrfs_fs_closing(fs_info))
			goto next_free;

		spin_unlock(&fs_info->defrag_inodes_lock);

		/* get the inode */
		key.objectid = defrag->root;
		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
		key.offset = (u64)-1;
		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
		if (IS_ERR(inode_root))
			goto next;

		key.objectid = defrag->ino;
		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
		key.offset = 0;

		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
		if (IS_ERR(inode))
			goto next;

		/* do a chunk of defrag */
		BTRFS_I(inode)->in_defrag = 0;
		range.start = defrag->last_offset;
		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
					       defrag_batch);
		/*
		 * if we filled the whole defrag batch, there
		 * must be more work to do. Queue this defrag
		 * again
		 */
		if (num_defrag == defrag_batch) {
			defrag->last_offset = range.start;
			__btrfs_add_inode_defrag(inode, defrag);
			/*
			 * we don't want to kfree defrag, we added it back to
			 * the rbtree
			 */
			defrag = NULL;
		} else if (defrag->last_offset && !defrag->cycled) {
			/*
			 * we didn't fill our defrag batch, but
			 * we didn't start at zero. Make sure we loop
			 * around to the start of the file.
			 */
			defrag->last_offset = 0;
			defrag->cycled = 1;
			__btrfs_add_inode_defrag(inode, defrag);
			defrag = NULL;
		}

		iput(inode);
next:
		spin_lock(&fs_info->defrag_inodes_lock);
next_free:
		kfree(defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);

	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy. This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 *
		 * Disable pagefault to avoid recursive lock since
		 * the pages are already locked
		 */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
		pagefault_enable();

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages. These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_aio_write to fault page */
		if (unlikely(copied == 0))
			break;

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		/* page checked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
		      struct page **pages, size_t num_pages,
		      loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
{
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size. There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(inode, end_pos);
	return 0;
}
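
/*
 * A worked example of the rounding done in btrfs_dirty_pages() above
 * (illustrative, assuming a 4096 byte sectorsize): a write of 3000
 * bytes at pos 5000 gives start_pos = 5000 & ~4095 = 4096 and
 * num_bytes = (3000 + 5000 - 4096 + 4095) & ~4095 = 4096, so delalloc
 * is set on the single sector [4096, 8191] that the write touches.
 */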

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end]. Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		BUG_ON(!split || !split2); /* -ENOMEM */

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret); /* Logic error */
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret); /* Logic error */
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}
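
/*
 * Example of the splitting done in btrfs_drop_extent_cache() above for
 * an extent map backed by a real extent: with an existing mapping
 * covering [0, 64K) and a drop range of [16K, 32K), the original map is
 * removed and two new maps are inserted, one for [0, 16K) and one for
 * [32K, 64K), leaving the dropped range unmapped.
 */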

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end. hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (start >= BTRFS_I(inode)->disk_i_size)
		modify_tree = 0;

	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid > ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			WARN_ON(1);
			extent_end = search_start;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		search_start = max(key.offset, start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (disk_bytenr > 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset, 0);
				BUG_ON(ret); /* -ENOMEM */
				*hint_byte = disk_bytenr;
			}
			key.offset = start;
		}
		/*
		 *  | ---- range to drop ----- |
		 * | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, end - key.offset);
				*hint_byte = disk_bytenr;
			}
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 * | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, extent_end - start);
				*hint_byte = disk_bytenr;
			}
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   root->sectorsize);
			} else if (disk_bytenr > 0) {
				ret = btrfs_free_extent(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						key.objectid, key.offset -
						extent_offset, 0);
				BUG_ON(ret); /* -ENOMEM */
				inode_sub_bytes(inode,
						extent_end - key.offset);
				*hint_byte = disk_bytenr;
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto out;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG_ON(1);
	}

	if (!ret && del_nr > 0) {
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, root, ret);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					ino, orig_offset, 0);
		BUG_ON(ret); /* -ENOMEM */
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos,
				 bool force_uptodate)
{
	int ret = 0;

	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
	    !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}

/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
				  struct page **pages, size_t num_pages,
				  loff_t pos, unsigned long first_index,
				  size_t write_bytes, bool force_uptodate)
{
	struct extent_state *cached_state = NULL;
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int err = 0;
	int faili = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos,
						    force_uptodate);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes, false);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}
	err = 0;
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos - 1, 0, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							     last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
				 GFP_NOFS);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     start_pos, last_pos - 1, &cached_state,
				     GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;

}

static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	unsigned long first_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
	bool force_page_uptodate = false;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	first_index = pos >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		ret = btrfs_delalloc_reserve_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
		if (ret)
			break;

		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, write_bytes,
				    force_page_uptodate);
		if (ret) {
			btrfs_delalloc_release_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
			break;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;
		}

		/*
		 * If we had a short copy we need to release the excess
		 * delalloc bytes we reserved.  We need to increment
		 * outstanding_extents because btrfs_delalloc_release_space
		 * will decrement it, but we still have an outstanding extent
		 * for the chunk we actually managed to copy.
		 */
		if (num_pages > dirty_pages) {
			if (copied > 0) {
				spin_lock(&BTRFS_I(inode)->lock);
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
			btrfs_delalloc_release_space(inode,
					(num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT);
		}

		if (copied > 0) {
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
			if (ret) {
				btrfs_delalloc_release_space(inode,
					dirty_pages << PAGE_CACHE_SHIFT);
				btrfs_drop_pages(pages, num_pages);
				break;
			}
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
						   dirty_pages);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root, 1);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	return num_written ? num_written : ret;
}
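
/*
 * Illustration of the short-copy accounting in __btrfs_buffered_write()
 * above, assuming 4096 byte pages: a write of 10000 bytes at a page
 * offset of 1000 reserves num_pages = 3 pages of delalloc space.  If
 * the copy from userspace faults after 5000 bytes, dirty_pages = 2, so
 * one page worth of reservation is released and the loop retries the
 * remainder one page at a time (nrptrs = 1).
 */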

static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    loff_t *ppos, size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct iov_iter i;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
					    count, ocount);

	/*
	 * the generic O_DIRECT will update in-memory i_size after the
	 * DIOs are done.  But our endio handlers that update the on
	 * disk i_size never update past the in memory i_size.  So we
	 * need one more update here to catch any additions to the file
	 */
	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
		mark_inode_dirty(inode);
	}

	if (written < 0 || written == count)
		return written;

	pos += written;
	count -= written;
	iov_iter_init(&i, iov, nr_segs, count, written);
	written_buffered = __btrfs_buffered_write(file, &i, pos);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	endbyte = pos + written_buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	*ppos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 endbyte >> PAGE_CACHE_SHIFT);
out:
	return written ? written : err;
}

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	loff_t *ppos = &iocb->ki_pos;
	u64 start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	if (count == 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	err = file_remove_suid(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		mutex_unlock(&inode->i_mutex);
		err = -EROFS;
		goto out;
	}

	err = btrfs_update_time(file);
	if (err) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}
	BTRFS_I(inode)->sequence++;

	start_pos = round_down(pos, root->sectorsize);
	if (start_pos > i_size_read(inode)) {
		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
		if (err) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, ppos, count, ocount);
	} else {
		struct iov_iter i;

		iov_iter_init(&i, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &i, pos);
		if (num_written > 0)
			*ppos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	if (num_written > 0 || num_written == -EIOCBQUEUED) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0 && num_written > 0)
			num_written = err;
	}
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	trace_btrfs_sync_file(file, datasync);

	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (ret)
		return ret;
	mutex_lock(&inode->i_mutex);

	/* we wait first, since the writeback may change the inode */
	root->log_batch++;
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	smp_mb();
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0) {
		mutex_unlock(&inode->i_mutex);
		goto out;
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&inode->i_mutex);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		ret = btrfs_end_transaction(trans, root);
	}
out:
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	return 0;
}

static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct extent_state *cached_state = NULL;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
	struct extent_map *em;
	int ret;

	alloc_start = offset & ~mask;
	alloc_end = (offset + len + mask) & ~mask;

	/* We only support the FALLOC_FL_KEEP_SIZE mode */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	/*
	 * Make sure we have enough space before we do the
	 * allocation.
	 */
	ret = btrfs_check_data_free_space(inode, len);
	if (ret)
		return ret;

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	}

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							     alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			btrfs_wait_ordered_range(inode, alloc_start,
						 alloc_end - alloc_start);
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	cur_offset = alloc_start;
	while (1) {
		u64 actual_end;

		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		if (IS_ERR_OR_NULL(em)) {
			if (!em)
				ret = -ENOMEM;
			else
				ret = PTR_ERR(em);
			break;
		}
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = (last_byte + mask) & ~mask;

		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
							last_byte - cur_offset,
							1 << inode->i_blkbits,
							offset + len,
							&alloc_hint);

			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		} else if (actual_end > inode->i_size &&
			   !(mode & FALLOC_FL_KEEP_SIZE)) {
			/*
			 * We didn't need to allocate any more space, but we
			 * still extended the size of the file so we need to
			 * update i_size.
			 */
			inode->i_ctime = CURRENT_TIME;
			i_size_write(inode, actual_end);
			btrfs_ordered_update_i_size(inode, actual_end, NULL);
		}
		free_extent_map(em);

		cur_offset = last_byte;
		if (cur_offset >= alloc_end) {
			ret = 0;
			break;
		}
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);
out:
	mutex_unlock(&inode->i_mutex);
	/* Let go of our reservation. */
	btrfs_free_reserved_data_space(inode, len);
	return ret;
}

static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	u64 lockstart = *offset;
	u64 lockend = i_size_read(inode);
	u64 start = *offset;
	u64 orig_start = *offset;
	u64 len = i_size_read(inode);
	u64 last_end = 0;
	int ret = 0;

	lockend = max_t(u64, root->sectorsize, lockend);
	if (lockend <= lockstart)
		lockend = lockstart + root->sectorsize;

	len = lockend - lockstart + 1;

	len = max_t(u64, len, root->sectorsize);
	if (inode->i_size == 0)
		return -ENXIO;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
			 &cached_state);

	/*
	 * Delalloc is such a pain.  If we have a hole and we have pending
	 * delalloc for a portion of the hole we will get back a hole that
	 * exists for the entire range since it hasn't been actually written
	 * yet.  So to take care of this case we need to look for an extent
	 * just before the position we want in case there is outstanding
	 * delalloc going on here.
	 */
	if (origin == SEEK_HOLE && start != 0) {
		if (start <= root->sectorsize)
			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
						     root->sectorsize, 0);
		else
			em = btrfs_get_extent_fiemap(inode, NULL, 0,
						     start - root->sectorsize,
						     root->sectorsize, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		last_end = em->start + em->len;
		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);
		free_extent_map(em);
	}

	while (1) {
		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
				if (last_end <= orig_start) {
					free_extent_map(em);
					ret = -ENXIO;
					break;
				}
			}

			if (origin == SEEK_HOLE) {
				*offset = start;
				free_extent_map(em);
				break;
			}
		} else {
			if (origin == SEEK_DATA) {
				if (em->block_start == EXTENT_MAP_DELALLOC) {
					if (start >= inode->i_size) {
						free_extent_map(em);
						ret = -ENXIO;
						break;
					}
				}

				*offset = start;
				free_extent_map(em);
				break;
			}
		}

		start = em->start + em->len;
		last_end = em->start + em->len;

		if (em->block_start == EXTENT_MAP_DELALLOC)
			last_end = min_t(u64, last_end, inode->i_size);

		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
			free_extent_map(em);
			ret = -ENXIO;
			break;
		}
		free_extent_map(em);
		cond_resched();
	}
	if (!ret)
		*offset = min(*offset, inode->i_size);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
	return ret;
}

static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	mutex_lock(&inode->i_mutex);
	switch (origin) {
	case SEEK_END:
	case SEEK_CUR:
		offset = generic_file_llseek(file, offset, origin);
		goto out;
	case SEEK_DATA:
	case SEEK_HOLE:
		if (offset >= i_size_read(inode)) {
			mutex_unlock(&inode->i_mutex);
			return -ENXIO;
		}

		ret = find_desired_extent(inode, &offset, origin);
		if (ret) {
			mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
		offset = -EINVAL;
		goto out;
	}
	if (offset > inode->i_sb->s_maxbytes) {
		offset = -EINVAL;
		goto out;
	}

	/* Special lock needed here? */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
out:
	mutex_unlock(&inode->i_mutex);
	return offset;
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};