// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/* Insert a record for an inode into the defrag tree.  The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from
		 * memory, and then re-read this inode, the new in-memory
		 * inode doesn't have IN_DEFRAG set. In that case we may
		 * find an existing defrag record for it in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Requeue the defrag object. If there is a defrag object that points to
 * the same inode in the tree, we will merge them together (by
 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 */
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
				       struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;

	if (!__need_auto_defrag(fs_info))
		goto out;

	/*
	 * Here we don't check the IN_DEFRAG flag, because we need to merge
	 * them together.
	 */
	spin_lock(&fs_info->defrag_inodes_lock);
	ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&fs_info->defrag_inodes_lock);
	if (ret)
		goto out;
	return;
out:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}

/*
 * Pick the defraggable inode that we want. If it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int num_defrag;
	int ret;

	/* get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	/* do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = defrag->last_offset;

	sb_start_write(fs_info->sb);
	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	/*
	 * if we filled the whole defrag batch, there
	 * must be more work to do.  Queue this defrag
	 * again
	 */
	if (num_defrag == BTRFS_DEFRAG_BATCH) {
		defrag->last_offset = range.start;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else if (defrag->last_offset && !defrag->cycled) {
		/*
		 * we didn't fill our defrag batch, but
		 * we didn't start at zero.  Make sure we loop
		 * around to the start of the file.
		 */
		defrag->last_offset = 0;
		defrag->cycled = 1;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}

	iput(inode);
	return 0;
cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
			     &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 */
		copied = copy_page_from_iter_atomic(page, offset, count, i);

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (unlikely(copied < count)) {
			if (!PageUptodate(page)) {
				iov_iter_revert(i, copied);
				copied = 0;
			}
			if (!copied)
				break;
		}

		write_bytes -= copied;
		total_copied += copied;
		offset += copied;
		if (offset == PAGE_SIZE) {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;

	for (i = 0; i < num_pages; i++) {
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
		 * clear it here.
		 * There should be no need to mark the pages
		 * accessed as prepare_pages should have marked them accessed
		 * in prepare_pages via find_or_create_page()
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		put_page(pages[i]);
	}
}

/*
 * After btrfs_copy_from_user(), update the following things for delalloc:
 * - Mark newly dirtied pages as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 */
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, cached);

	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];

		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
		ClearPageChecked(p);
		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
			     int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;
	u64 gen;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;
	bool modified;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		int no_splits = 0;

		modified = false;
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		if (!split || !split2)
			no_splits = 1;

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		gen = em->generation;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);
		if (no_splits)
			goto next;

		if (em->start < start) {
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
						em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
						    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->block_start = em->block_start;
					split->orig_start = em->orig_start;
				} else {
					split->block_len = split->len;
					split->block_start = em->block_start
						+ diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				ret = add_extent_mapping(em_tree, split,
							 modified);
				ASSERT(ret == 0); /* Logic error */
			}
			free_extent_map(split);
			split = NULL;
		}
next:
		if (extent_map_in_tree(em))
			remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	int leafs_visited = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);

	if (args->start >= inode->disk_i_size && !args->replace_extent)
		modify_tree = 0;

	update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
		       root == fs_info->tree_root);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
		leafs_visited++;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leafs_visited++;
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);
			btrfs_mark_buffer_dirty(leaf);

			if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						new_key.objectid,
						args->start - extent_offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(fs_info, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_DROP_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key.objectid,
						key.offset - extent_offset);
				ret = btrfs_free_extent(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent && leafs_visited == 1 &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		setup_items_for_insert(root, path, &key,
				       &args->extent_item_size, 1);
		args->extent_inserted = true;
	}

	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(fs_info, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(fs_info, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
				       num_bytes, 0);
		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
				    orig_offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (split == start) {
			key.offset = start;
		} else {
			if (start != key.offset) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;
	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
			       num_bytes, 0);
	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct inode *inode,
				 struct page *page, u64 pos,
				 bool force_uptodate)
{
	int ret = 0;

	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
	    !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
		if (page->mapping != inode->i_mapping) {
			unlock_page(page);
			return -EAGAIN;
		}
	}
	return 0;
}

/*
 * this just gets pages into the page cache and locks them down.
 */
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate)
{
	int i;
	unsigned long index = pos >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int err = 0;
	int faili;

	for (i = 0; i < num_pages; i++) {
again:
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		err = set_page_extent_mapped(pages[i]);
		if (err < 0) {
			faili = i;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(inode, pages[i], pos,
						    force_uptodate);
		if (!err && i == num_pages - 1)
			err = prepare_uptodate_page(inode, pages[i],
						    pos + write_bytes, false);
		if (err) {
			put_page(pages[i]);
			if (err == -EAGAIN) {
				err = 0;
				goto again;
			}
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}

	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		put_page(pages[faili]);
		faili--;
	}
	return err;

}

/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if need be.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need re-prepare the pages
 * the other < 0 number - Something wrong happens
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
				size_t num_pages, loff_t pos,
				size_t write_bytes,
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int i;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
				 cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			unlock_extent_cached(&inode->io_tree, start_pos,
					     last_pos, cached_state);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
			btrfs_start_ordered_extent(ordered, 1);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_pages() which should have locked
	 * all pages in the range.
	 */
	for (i = 0; i < num_pages; i++)
		WARN_ON(!PageLocked(pages[i]));

	return ret;
}

static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;
	num_bytes = lockend - lockstart + 1;

	if (nowait) {
		struct btrfs_ordered_extent *ordered;

		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
			return -EAGAIN;

		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     num_bytes);
		if (ordered) {
			btrfs_put_ordered_extent(ordered);
			ret = -EAGAIN;
			goto out_unlock;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart,
						   lockend, NULL);
	}

	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
			       NULL, NULL, NULL, false);
	if (ret <= 0) {
		ret = 0;
		if (!nowait)
			btrfs_drew_write_unlock(&root->snapshot_lock);
	} else {
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	}
out_unlock:
	unlock_extent(&inode->io_tree, lockstart, lockend);

	return ret;
}

static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
			      size_t *write_bytes)
{
	return check_can_nocow(inode, pos, write_bytes, true);
}

/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:	 File offset
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *		 range
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * >0		and update @write_bytes if we can do nocow write
 *  0		if we can't do nocow write
 * -EAGAIN	if we can't get the needed lock or there are ordered extents
 *		for the (nowait == true) case
 * <0		if other error happened
 *
 * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes)
{
	return check_can_nocow(inode, pos, write_bytes, false);
}

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}

static void update_time_for_write(struct inode *inode)
{
	struct timespec64 now;

	if (IS_NOCMTIME(inode))
		return;

	now = current_time(inode);
	if (!timespec64_equal(&inode->i_mtime, &now))
		inode->i_mtime = now;

	if (!timespec64_equal(&inode->i_ctime, &now))
		inode->i_ctime = now;

	if (IS_I_VERSION(inode))
		inode_inc_iversion(inode);
}

static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
			     size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;
	loff_t start_pos;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		size_t nocow_bytes = count;

		/* We will allocate space in case nodatacow is not set, so bail */
		if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
			return -EAGAIN;
		/*
		 * There are holes in the range or parts of the range that must
		 * be COWed (shared extents, RO block groups, etc), so just bail
		 * out.
		 */
		if (nocow_bytes < count)
			return -EAGAIN;
	}

	current->backing_dev_info = inode_to_bdi(inode);
	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there.  We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	update_time_for_write(inode);

	start_pos = round_down(pos, fs_info->sectorsize);
	oldsize = i_size_read(inode);
	if (start_pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret) {
			current->backing_dev_info = NULL;
			return ret;
		}
	}

	return 0;
}

static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
					     struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct page **pages = NULL;
	struct extent_changeset *data_reserved = NULL;
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
	size_t num_written = 0;
	int nrptrs;
	ssize_t ret;
	bool only_release_metadata = false;
	bool force_page_uptodate = false;
	loff_t old_isize = i_size_read(inode);
	unsigned int ilock_flags = 0;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(inode, ilock_flags);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, i);
	if (ret <= 0)
		goto out;

	ret = btrfs_write_check(iocb, i, ret);
	if (ret < 0)
		goto out;

	pos = iocb->ki_pos;
	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
		     PAGE_SIZE / (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	while (iov_iter_count(i) > 0) {
		struct extent_state *cached_state = NULL;
		size_t offset = offset_in_page(pos);
		size_t sector_offset;
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_SIZE -
					 offset);
		size_t num_pages;
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;
		size_t dirty_sectors;
		size_t num_sectors;
		int extents_locked;

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		only_release_metadata = false;
		sector_offset = pos & (fs_info->sectorsize - 1);

		extent_changeset_release(data_reserved);
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &data_reserved, pos,
						  write_bytes);
		if (ret < 0) {
			/*
			 * If we don't have to COW at the offset, reserve
			 * metadata only. write_bytes may get smaller than
			 * requested here.
			 */
			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
						   &write_bytes) > 0)
				only_release_metadata = true;
			else
				break;
		}

		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
		WARN_ON(num_pages > nrptrs);
		reserve_bytes = round_up(write_bytes + sector_offset,
					 fs_info->sectorsize);
		WARN_ON(reserve_bytes == 0);
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
						data_reserved, pos,
						write_bytes);
			else
				btrfs_check_nocow_unlock(BTRFS_I(inode));
			break;
		}

		release_bytes = reserve_bytes;
again:
		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret) {
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       reserve_bytes);
			break;
		}

		extents_locked = lock_and_cleanup_extent_if_need(
				BTRFS_I(inode), pages,
				num_pages, pos, write_bytes, &lockstart,
				&lockend, &cached_state);
		if (extents_locked < 0) {
			if (extents_locked == -EAGAIN)
				goto again;
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       reserve_bytes);
			ret = extents_locked;
			break;
		}

		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);

		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
		dirty_sectors = round_up(copied + sector_offset,
					 fs_info->sectorsize);
		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_sectors = 0;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = DIV_ROUND_UP(copied + offset,
						   PAGE_SIZE);
		}

		if (num_sectors > dirty_sectors) {
			/* release everything except the sectors we dirtied */
			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
			if (only_release_metadata) {
				btrfs_delalloc_release_metadata(BTRFS_I(inode),
							release_bytes, true);
			} else {
				u64 __pos;

				__pos = round_down(pos,
						   fs_info->sectorsize) +
					(dirty_pages << PAGE_SHIFT);
				btrfs_delalloc_release_space(BTRFS_I(inode),
						data_reserved, __pos,
						release_bytes, true);
			}
		}

		release_bytes = round_up(copied + sector_offset,
					 fs_info->sectorsize);

		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
					dirty_pages, pos, copied,
					&cached_state, only_release_metadata);

		/*
		 * If we have not locked the extent range, because the range's
		 * start offset is >= i_size, we might still have a non-NULL
		 * cached extent state, acquired while marking the extent range
		 * as delalloc through btrfs_dirty_pages(). Therefore free any
		 * possible cached extent state to avoid a memory leak.
		 */
		if (extents_locked)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state);
		else
			free_extent_state(cached_state);

		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		release_bytes = 0;
		if (only_release_metadata)
			btrfs_check_nocow_unlock(BTRFS_I(inode));

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_check_nocow_unlock(BTRFS_I(inode));
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
					release_bytes, true);
		} else {
			btrfs_delalloc_release_space(BTRFS_I(inode),
					data_reserved,
					round_down(pos, fs_info->sectorsize),
					release_bytes, true);
		}
	}

	extent_changeset_free(data_reserved);
	if (num_written > 0) {
		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
		iocb->ki_pos += num_written;
	}
out:
	btrfs_inode_unlock(inode, ilock_flags);
	return num_written ? num_written : ret;
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	loff_t endbyte;
	ssize_t err;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio = NULL;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/* If the write DIO is within EOF, use a shared lock */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	err = btrfs_inode_lock(inode, ilock_flags);
	if (err < 0)
		return err;

	err = generic_write_checks(iocb, from);
	if (err <= 0) {
		btrfs_inode_unlock(inode, ilock_flags);
		return err;
	}

	err = btrfs_write_check(iocb, from, err);
	if (err < 0) {
		btrfs_inode_unlock(inode, ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(inode, ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(inode, ilock_flags);
		goto buffered;
	}

	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			     0);

	btrfs_inode_unlock(inode, ilock_flags);

	if (IS_ERR_OR_NULL(dio)) {
		err = PTR_ERR_OR_ZERO(dio);
		if (err < 0 && err != -ENOTBLK)
			goto out;
	} else {
		written = iomap_dio_complete(dio);
	}

	if (written < 0 || !iov_iter_count(from)) {
		err = written;
		goto out;
	}

buffered:
	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	err = btrfs_fdatawrite_range(inode, pos, endbyte);
	if (err)
		goto out;
	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return written ? written : err;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
				     struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written = 0;
	const bool sync = iocb->ki_flags & IOCB_DSYNC;

	/*
	 * If the fs flips readonly due to some impossible error, although we
	 * have opened a file as writable, we have to stop this write operation
	 * to ensure consistency.
	 */
	if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
		return -EROFS;

	if (!(iocb->ki_flags & IOCB_DIRECT) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (sync)
		atomic_inc(&inode->sync_writers);

	if (iocb->ki_flags & IOCB_DIRECT)
		num_written = btrfs_direct_write(iocb, from);
	else
		num_written = btrfs_buffered_write(iocb, from);

	btrfs_set_inode_last_sub_trans(inode);

	if (num_written > 0)
		num_written = generic_write_sync(iocb, num_written);

	if (sync)
		atomic_dec(&inode->sync_writers);

	current->backing_dev_info = NULL;
	return num_written;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private && private->filldir_buf)
		kfree(private->filldir_buf);
	kfree(private);
	filp->private_data = NULL;

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size.  This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
2062 */ 2063 blk_start_plug(&plug); 2064 atomic_inc(&BTRFS_I(inode)->sync_writers); 2065 ret = btrfs_fdatawrite_range(inode, start, end); 2066 atomic_dec(&BTRFS_I(inode)->sync_writers); 2067 blk_finish_plug(&plug); 2068 2069 return ret; 2070 } 2071 2072 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) 2073 { 2074 struct btrfs_inode *inode = BTRFS_I(ctx->inode); 2075 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2076 2077 if (btrfs_inode_in_log(inode, fs_info->generation) && 2078 list_empty(&ctx->ordered_extents)) 2079 return true; 2080 2081 /* 2082 * If we are doing a fast fsync we cannot bail out if the inode's 2083 * last_trans is <= the last committed transaction, because we only 2084 * update the last_trans of the inode during ordered extent completion, 2085 * and for a fast fsync we don't wait for that, we only wait for the 2086 * writeback to complete. 2087 */ 2088 if (inode->last_trans <= fs_info->last_trans_committed && 2089 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || 2090 list_empty(&ctx->ordered_extents))) 2091 return true; 2092 2093 return false; 2094 } 2095 2096 /* 2097 * fsync call for both files and directories. This logs the inode into 2098 * the tree log instead of forcing full commits whenever possible. 2099 * 2100 * It needs to call filemap_fdatawait so that all ordered extent updates 2101 * in the metadata btree are up to date for copying to the log. 2102 * 2103 * It drops the inode mutex before doing the tree log commit. This is an 2104 * important optimization for directories because holding the mutex prevents 2105 * new operations on the dir while we write to disk. 2106 */ 2107 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 2108 { 2109 struct dentry *dentry = file_dentry(file); 2110 struct inode *inode = d_inode(dentry); 2111 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2112 struct btrfs_root *root = BTRFS_I(inode)->root; 2113 struct btrfs_trans_handle *trans; 2114 struct btrfs_log_ctx ctx; 2115 int ret = 0, err; 2116 u64 len; 2117 bool full_sync; 2118 2119 trace_btrfs_sync_file(file, datasync); 2120 2121 btrfs_init_log_ctx(&ctx, inode); 2122 2123 /* 2124 * Always set the range to a full range, otherwise we can get into 2125 * several problems, from missing file extent items to represent holes 2126 * when not using the NO_HOLES feature, to log tree corruption due to 2127 * races between hole detection during logging and completion of ordered 2128 * extents outside the range, to missing checksums due to ordered extents 2129 * for which we flushed only a subset of their pages. 2130 */ 2131 start = 0; 2132 end = LLONG_MAX; 2133 len = (u64)LLONG_MAX + 1; 2134 2135 /* 2136 * We write the dirty pages in the range and wait until they complete 2137 * outside of the ->i_mutex, so the dirty pages can be flushed by 2138 * multiple tasks and performance improves. See 2139 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 2140 */ 2141 ret = start_ordered_ops(inode, start, end); 2142 if (ret) 2143 goto out; 2144 2145 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 2146 2147 atomic_inc(&root->log_batch); 2148 2149 /* 2150 * Always check for the full sync flag while holding the inode's lock, 2151 * to avoid races with other tasks. The flag must either be set for the 2152 * whole duration of the logging or not set at all while we are logging.
2153 */ 2154 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2155 &BTRFS_I(inode)->runtime_flags); 2156 2157 /* 2158 * Before we acquired the inode's lock and the mmap lock, someone may 2159 * have dirtied more pages in the target range. We need to make sure 2160 * that writeback for any such pages does not start while we are logging 2161 * the inode, because if it does, any of the following might happen when 2162 * we are not doing a full inode sync: 2163 * 2164 * 1) We log an extent after its writeback finishes but before its 2165 * checksums are added to the csum tree, leading to -EIO errors 2166 * when attempting to read the extent after a log replay. 2167 * 2168 * 2) We can end up logging an extent before its writeback finishes. 2169 * Therefore after the log replay we will have a file extent item 2170 * pointing to an unwritten extent (and no data checksums as well). 2171 * 2172 * So trigger writeback for any new dirty pages and then wait for all 2173 * ordered extents to complete below. 2174 */ 2175 ret = start_ordered_ops(inode, start, end); 2176 if (ret) { 2177 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 2178 goto out; 2179 } 2180 2181 /* 2182 * We have to do this here to avoid the priority inversion of waiting on 2183 * IO of a lower priority task while holding a transaction open. 2184 * 2185 * For a full fsync we wait for the ordered extents to complete while 2186 * for a fast fsync we wait just for writeback to complete, and then 2187 * attach the ordered extents to the transaction so that a transaction 2188 * commit waits for their completion, to avoid data loss in case we fsync, 2189 * the current transaction commits before the ordered extents complete, 2190 * and a power failure happens right after that. 2191 * 2192 * For a zoned filesystem, if a write IO uses a ZONE_APPEND command, the 2193 * logical address recorded in the ordered extent may change. We need 2194 * to wait for the IO to stabilize the logical address. 2195 */ 2196 if (full_sync || btrfs_is_zoned(fs_info)) { 2197 ret = btrfs_wait_ordered_range(inode, start, len); 2198 } else { 2199 /* 2200 * Get our ordered extents as soon as possible to avoid doing 2201 * checksum lookups in the csum tree, and use instead the 2202 * checksums attached to the ordered extents. 2203 */ 2204 btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), 2205 &ctx.ordered_extents); 2206 ret = filemap_fdatawait_range(inode->i_mapping, start, end); 2207 } 2208 2209 if (ret) 2210 goto out_release_extents; 2211 2212 atomic_inc(&root->log_batch); 2213 2214 smp_mb(); 2215 if (skip_inode_logging(&ctx)) { 2216 /* 2217 * We've had everything committed since the last time we were 2218 * modified so clear this flag in case it was set for whatever 2219 * reason; it's no longer relevant. 2220 */ 2221 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2222 &BTRFS_I(inode)->runtime_flags); 2223 /* 2224 * An ordered extent might have started before and completed 2225 * already with io errors, in which case the inode was not 2226 * updated and we end up here. So check the inode's mapping 2227 * for any errors that might have happened since we last 2228 * called fsync. 2229 */ 2230 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); 2231 goto out_release_extents; 2232 } 2233 2234 /* 2235 * We use start here because we will need to wait on the IO to complete 2236 * in btrfs_sync_log, which could require joining a transaction (for 2237 * example checking cross references in the nocow path).
If we use join 2238 * here we could get into a situation where we're waiting on IO to 2239 * happen that is blocked on a transaction trying to commit. With start 2240 * we inc the extwriter counter, so we wait for all extwriters to exit 2241 * before we start blocking joiners. This comment is to keep somebody 2242 * from thinking they are super smart and changing this to 2243 * btrfs_join_transaction *cough*Josef*cough*. 2244 */ 2245 trans = btrfs_start_transaction(root, 0); 2246 if (IS_ERR(trans)) { 2247 ret = PTR_ERR(trans); 2248 goto out_release_extents; 2249 } 2250 trans->in_fsync = true; 2251 2252 ret = btrfs_log_dentry_safe(trans, dentry, &ctx); 2253 btrfs_release_log_ctx_extents(&ctx); 2254 if (ret < 0) { 2255 /* Fallthrough and commit/free transaction. */ 2256 ret = 1; 2257 } 2258 2259 /* we've logged all the items and now have a consistent 2260 * version of the file in the log. It is possible that 2261 * someone will come in and modify the file, but that's 2262 * fine because the log is consistent on disk, and we 2263 * have references to all of the file's extents 2264 * 2265 * It is possible that someone will come in and log the 2266 * file again, but that will end up using the synchronization 2267 * inside btrfs_sync_log to keep things safe. 2268 */ 2269 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 2270 2271 if (ret != BTRFS_NO_LOG_SYNC) { 2272 if (!ret) { 2273 ret = btrfs_sync_log(trans, root, &ctx); 2274 if (!ret) { 2275 ret = btrfs_end_transaction(trans); 2276 goto out; 2277 } 2278 } 2279 if (!full_sync) { 2280 ret = btrfs_wait_ordered_range(inode, start, len); 2281 if (ret) { 2282 btrfs_end_transaction(trans); 2283 goto out; 2284 } 2285 } 2286 ret = btrfs_commit_transaction(trans); 2287 } else { 2288 ret = btrfs_end_transaction(trans); 2289 } 2290 out: 2291 ASSERT(list_empty(&ctx.list)); 2292 err = file_check_and_advance_wb_err(file); 2293 if (!ret) 2294 ret = err; 2295 return ret > 0 ? 
-EIO : ret; 2296 2297 out_release_extents: 2298 btrfs_release_log_ctx_extents(&ctx); 2299 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 2300 goto out; 2301 } 2302 2303 static const struct vm_operations_struct btrfs_file_vm_ops = { 2304 .fault = filemap_fault, 2305 .map_pages = filemap_map_pages, 2306 .page_mkwrite = btrfs_page_mkwrite, 2307 }; 2308 2309 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2310 { 2311 struct address_space *mapping = filp->f_mapping; 2312 2313 if (!mapping->a_ops->readpage) 2314 return -ENOEXEC; 2315 2316 file_accessed(filp); 2317 vma->vm_ops = &btrfs_file_vm_ops; 2318 2319 return 0; 2320 } 2321 2322 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, 2323 int slot, u64 start, u64 end) 2324 { 2325 struct btrfs_file_extent_item *fi; 2326 struct btrfs_key key; 2327 2328 if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 2329 return 0; 2330 2331 btrfs_item_key_to_cpu(leaf, &key, slot); 2332 if (key.objectid != btrfs_ino(inode) || 2333 key.type != BTRFS_EXTENT_DATA_KEY) 2334 return 0; 2335 2336 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2337 2338 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2339 return 0; 2340 2341 if (btrfs_file_extent_disk_bytenr(leaf, fi)) 2342 return 0; 2343 2344 if (key.offset == end) 2345 return 1; 2346 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 2347 return 1; 2348 return 0; 2349 } 2350 2351 static int fill_holes(struct btrfs_trans_handle *trans, 2352 struct btrfs_inode *inode, 2353 struct btrfs_path *path, u64 offset, u64 end) 2354 { 2355 struct btrfs_fs_info *fs_info = trans->fs_info; 2356 struct btrfs_root *root = inode->root; 2357 struct extent_buffer *leaf; 2358 struct btrfs_file_extent_item *fi; 2359 struct extent_map *hole_em; 2360 struct extent_map_tree *em_tree = &inode->extent_tree; 2361 struct btrfs_key key; 2362 int ret; 2363 2364 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 2365 goto out; 2366 2367 key.objectid = btrfs_ino(inode); 2368 key.type = BTRFS_EXTENT_DATA_KEY; 2369 key.offset = offset; 2370 2371 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2372 if (ret <= 0) { 2373 /* 2374 * We should have dropped this offset, so if we find it then 2375 * something has gone horribly wrong. 
2376 */ 2377 if (ret == 0) 2378 ret = -EINVAL; 2379 return ret; 2380 } 2381 2382 leaf = path->nodes[0]; 2383 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { 2384 u64 num_bytes; 2385 2386 path->slots[0]--; 2387 fi = btrfs_item_ptr(leaf, path->slots[0], 2388 struct btrfs_file_extent_item); 2389 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 2390 end - offset; 2391 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2392 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2393 btrfs_set_file_extent_offset(leaf, fi, 0); 2394 btrfs_mark_buffer_dirty(leaf); 2395 goto out; 2396 } 2397 2398 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { 2399 u64 num_bytes; 2400 2401 key.offset = offset; 2402 btrfs_set_item_key_safe(fs_info, path, &key); 2403 fi = btrfs_item_ptr(leaf, path->slots[0], 2404 struct btrfs_file_extent_item); 2405 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2406 offset; 2407 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2408 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 2409 btrfs_set_file_extent_offset(leaf, fi, 0); 2410 btrfs_mark_buffer_dirty(leaf); 2411 goto out; 2412 } 2413 btrfs_release_path(path); 2414 2415 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), 2416 offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); 2417 if (ret) 2418 return ret; 2419 2420 out: 2421 btrfs_release_path(path); 2422 2423 hole_em = alloc_extent_map(); 2424 if (!hole_em) { 2425 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2426 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 2427 } else { 2428 hole_em->start = offset; 2429 hole_em->len = end - offset; 2430 hole_em->ram_bytes = hole_em->len; 2431 hole_em->orig_start = offset; 2432 2433 hole_em->block_start = EXTENT_MAP_HOLE; 2434 hole_em->block_len = 0; 2435 hole_em->orig_block_len = 0; 2436 hole_em->compress_type = BTRFS_COMPRESS_NONE; 2437 hole_em->generation = trans->transid; 2438 2439 do { 2440 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2441 write_lock(&em_tree->lock); 2442 ret = add_extent_mapping(em_tree, hole_em, 1); 2443 write_unlock(&em_tree->lock); 2444 } while (ret == -EEXIST); 2445 free_extent_map(hole_em); 2446 if (ret) 2447 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2448 &inode->runtime_flags); 2449 } 2450 2451 return 0; 2452 } 2453 2454 /* 2455 * Find a hole extent on given inode and change start/len to the end of hole 2456 * extent.(hole/vacuum extent whose em->start <= start && 2457 * em->start + em->len > start) 2458 * When a hole extent is found, return 1 and modify start/len. 2459 */ 2460 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 2461 { 2462 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2463 struct extent_map *em; 2464 int ret = 0; 2465 2466 em = btrfs_get_extent(inode, NULL, 0, 2467 round_down(*start, fs_info->sectorsize), 2468 round_up(*len, fs_info->sectorsize)); 2469 if (IS_ERR(em)) 2470 return PTR_ERR(em); 2471 2472 /* Hole or vacuum extent(only exists in no-hole mode) */ 2473 if (em->block_start == EXTENT_MAP_HOLE) { 2474 ret = 1; 2475 *len = em->start + em->len > *start + *len ? 
2476 0 : *start + *len - em->start - em->len; 2477 *start = em->start + em->len; 2478 } 2479 free_extent_map(em); 2480 return ret; 2481 } 2482 2483 static int btrfs_punch_hole_lock_range(struct inode *inode, 2484 const u64 lockstart, 2485 const u64 lockend, 2486 struct extent_state **cached_state) 2487 { 2488 /* 2489 * For subpage case, if the range is not at page boundary, we could 2490 * have pages at the leading/tailing part of the range. 2491 * This could lead to dead loop since filemap_range_has_page() 2492 * will always return true. 2493 * So here we need to do extra page alignment for 2494 * filemap_range_has_page(). 2495 */ 2496 const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); 2497 const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; 2498 2499 while (1) { 2500 struct btrfs_ordered_extent *ordered; 2501 int ret; 2502 2503 truncate_pagecache_range(inode, lockstart, lockend); 2504 2505 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2506 cached_state); 2507 ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), 2508 lockend); 2509 2510 /* 2511 * We need to make sure we have no ordered extents in this range 2512 * and nobody raced in and read a page in this range, if we did 2513 * we need to try again. 2514 */ 2515 if ((!ordered || 2516 (ordered->file_offset + ordered->num_bytes <= lockstart || 2517 ordered->file_offset > lockend)) && 2518 !filemap_range_has_page(inode->i_mapping, 2519 page_lockstart, page_lockend)) { 2520 if (ordered) 2521 btrfs_put_ordered_extent(ordered); 2522 break; 2523 } 2524 if (ordered) 2525 btrfs_put_ordered_extent(ordered); 2526 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 2527 lockend, cached_state); 2528 ret = btrfs_wait_ordered_range(inode, lockstart, 2529 lockend - lockstart + 1); 2530 if (ret) 2531 return ret; 2532 } 2533 return 0; 2534 } 2535 2536 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 2537 struct btrfs_inode *inode, 2538 struct btrfs_path *path, 2539 struct btrfs_replace_extent_info *extent_info, 2540 const u64 replace_len, 2541 const u64 bytes_to_drop) 2542 { 2543 struct btrfs_fs_info *fs_info = trans->fs_info; 2544 struct btrfs_root *root = inode->root; 2545 struct btrfs_file_extent_item *extent; 2546 struct extent_buffer *leaf; 2547 struct btrfs_key key; 2548 int slot; 2549 struct btrfs_ref ref = { 0 }; 2550 int ret; 2551 2552 if (replace_len == 0) 2553 return 0; 2554 2555 if (extent_info->disk_offset == 0 && 2556 btrfs_fs_incompat(fs_info, NO_HOLES)) { 2557 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2558 return 0; 2559 } 2560 2561 key.objectid = btrfs_ino(inode); 2562 key.type = BTRFS_EXTENT_DATA_KEY; 2563 key.offset = extent_info->file_offset; 2564 ret = btrfs_insert_empty_item(trans, root, path, &key, 2565 sizeof(struct btrfs_file_extent_item)); 2566 if (ret) 2567 return ret; 2568 leaf = path->nodes[0]; 2569 slot = path->slots[0]; 2570 write_extent_buffer(leaf, extent_info->extent_buf, 2571 btrfs_item_ptr_offset(leaf, slot), 2572 sizeof(struct btrfs_file_extent_item)); 2573 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 2574 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); 2575 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); 2576 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); 2577 if (extent_info->is_new_extent) 2578 btrfs_set_file_extent_generation(leaf, extent, trans->transid); 2579 btrfs_mark_buffer_dirty(leaf); 2580 btrfs_release_path(path); 2581 2582 ret = 
btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, 2583 replace_len); 2584 if (ret) 2585 return ret; 2586 2587 /* If it's a hole, nothing more needs to be done. */ 2588 if (extent_info->disk_offset == 0) { 2589 btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 2590 return 0; 2591 } 2592 2593 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); 2594 2595 if (extent_info->is_new_extent && extent_info->insertions == 0) { 2596 key.objectid = extent_info->disk_offset; 2597 key.type = BTRFS_EXTENT_ITEM_KEY; 2598 key.offset = extent_info->disk_len; 2599 ret = btrfs_alloc_reserved_file_extent(trans, root, 2600 btrfs_ino(inode), 2601 extent_info->file_offset, 2602 extent_info->qgroup_reserved, 2603 &key); 2604 } else { 2605 u64 ref_offset; 2606 2607 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2608 extent_info->disk_offset, 2609 extent_info->disk_len, 0); 2610 ref_offset = extent_info->file_offset - extent_info->data_offset; 2611 btrfs_init_data_ref(&ref, root->root_key.objectid, 2612 btrfs_ino(inode), ref_offset); 2613 ret = btrfs_inc_extent_ref(trans, &ref); 2614 } 2615 2616 extent_info->insertions++; 2617 2618 return ret; 2619 } 2620 2621 /* 2622 * The respective range must have been previously locked, as well as the inode. 2623 * The end offset is inclusive (last byte of the range). 2624 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing 2625 * the file range with an extent. 2626 * When not punching a hole, we don't want to end up in a state where we dropped 2627 * extents without inserting a new one, so we must abort the transaction to avoid 2628 * a corruption. 2629 */ 2630 int btrfs_replace_file_extents(struct btrfs_inode *inode, 2631 struct btrfs_path *path, const u64 start, 2632 const u64 end, 2633 struct btrfs_replace_extent_info *extent_info, 2634 struct btrfs_trans_handle **trans_out) 2635 { 2636 struct btrfs_drop_extents_args drop_args = { 0 }; 2637 struct btrfs_root *root = inode->root; 2638 struct btrfs_fs_info *fs_info = root->fs_info; 2639 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); 2640 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); 2641 struct btrfs_trans_handle *trans = NULL; 2642 struct btrfs_block_rsv *rsv; 2643 unsigned int rsv_count; 2644 u64 cur_offset; 2645 u64 len = end - start; 2646 int ret = 0; 2647 2648 if (end <= start) 2649 return -EINVAL; 2650 2651 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 2652 if (!rsv) { 2653 ret = -ENOMEM; 2654 goto out; 2655 } 2656 rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); 2657 rsv->failfast = 1; 2658 2659 /* 2660 * 1 - update the inode 2661 * 1 - removing the extents in the range 2662 * 1 - adding the hole extent if no_holes isn't set or if we are 2663 * replacing the range with a new extent 2664 */ 2665 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) 2666 rsv_count = 3; 2667 else 2668 rsv_count = 2; 2669 2670 trans = btrfs_start_transaction(root, rsv_count); 2671 if (IS_ERR(trans)) { 2672 ret = PTR_ERR(trans); 2673 trans = NULL; 2674 goto out_free; 2675 } 2676 2677 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 2678 min_size, false); 2679 BUG_ON(ret); 2680 trans->block_rsv = rsv; 2681 2682 cur_offset = start; 2683 drop_args.path = path; 2684 drop_args.end = end + 1; 2685 drop_args.drop_cache = true; 2686 while (cur_offset < end) { 2687 drop_args.start = cur_offset; 2688 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2689 /* If we are punching a hole decrement the inode's byte 
count */ 2690 if (!extent_info) 2691 btrfs_update_inode_bytes(inode, 0, 2692 drop_args.bytes_found); 2693 if (ret != -ENOSPC) { 2694 /* 2695 * When cloning we want to avoid transaction aborts when 2696 * nothing was done and we are attempting to clone parts 2697 * of inline extents, in such cases -EOPNOTSUPP is 2698 * returned by __btrfs_drop_extents() without having 2699 * changed anything in the file. 2700 */ 2701 if (extent_info && !extent_info->is_new_extent && 2702 ret && ret != -EOPNOTSUPP) 2703 btrfs_abort_transaction(trans, ret); 2704 break; 2705 } 2706 2707 trans->block_rsv = &fs_info->trans_block_rsv; 2708 2709 if (!extent_info && cur_offset < drop_args.drop_end && 2710 cur_offset < ino_size) { 2711 ret = fill_holes(trans, inode, path, cur_offset, 2712 drop_args.drop_end); 2713 if (ret) { 2714 /* 2715 * If we failed then we didn't insert our hole 2716 * entries for the area we dropped, so now the 2717 * fs is corrupted and we must abort the 2718 * transaction. 2719 */ 2720 btrfs_abort_transaction(trans, ret); 2721 break; 2722 } 2723 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2724 /* 2725 * We are past the i_size here, but since we didn't 2726 * insert holes we need to clear the mapped area so we 2727 * know to not set disk_i_size in this area until a new 2728 * file extent is inserted here. 2729 */ 2730 ret = btrfs_inode_clear_file_extent_range(inode, 2731 cur_offset, 2732 drop_args.drop_end - cur_offset); 2733 if (ret) { 2734 /* 2735 * We couldn't clear our area, so we could 2736 * presumably adjust up and corrupt the fs, so 2737 * we need to abort. 2738 */ 2739 btrfs_abort_transaction(trans, ret); 2740 break; 2741 } 2742 } 2743 2744 if (extent_info && 2745 drop_args.drop_end > extent_info->file_offset) { 2746 u64 replace_len = drop_args.drop_end - 2747 extent_info->file_offset; 2748 2749 ret = btrfs_insert_replace_extent(trans, inode, path, 2750 extent_info, replace_len, 2751 drop_args.bytes_found); 2752 if (ret) { 2753 btrfs_abort_transaction(trans, ret); 2754 break; 2755 } 2756 extent_info->data_len -= replace_len; 2757 extent_info->data_offset += replace_len; 2758 extent_info->file_offset += replace_len; 2759 } 2760 2761 ret = btrfs_update_inode(trans, root, inode); 2762 if (ret) 2763 break; 2764 2765 btrfs_end_transaction(trans); 2766 btrfs_btree_balance_dirty(fs_info); 2767 2768 trans = btrfs_start_transaction(root, rsv_count); 2769 if (IS_ERR(trans)) { 2770 ret = PTR_ERR(trans); 2771 trans = NULL; 2772 break; 2773 } 2774 2775 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 2776 rsv, min_size, false); 2777 BUG_ON(ret); /* shouldn't happen */ 2778 trans->block_rsv = rsv; 2779 2780 cur_offset = drop_args.drop_end; 2781 len = end - cur_offset; 2782 if (!extent_info && len) { 2783 ret = find_first_non_hole(inode, &cur_offset, &len); 2784 if (unlikely(ret < 0)) 2785 break; 2786 if (ret && !len) { 2787 ret = 0; 2788 break; 2789 } 2790 } 2791 } 2792 2793 /* 2794 * If we were cloning, force the next fsync to be a full one since we 2795 * replaced (or just dropped in the case of cloning holes when 2796 * NO_HOLES is enabled) file extent items and did not set up new extent 2797 * maps for the replacement extents (or holes).
2798 */ 2799 if (extent_info && !extent_info->is_new_extent) 2800 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 2801 2802 if (ret) 2803 goto out_trans; 2804 2805 trans->block_rsv = &fs_info->trans_block_rsv; 2806 /* 2807 * If we are using the NO_HOLES feature we might have already had a 2808 * hole that overlaps part of the region [lockstart, lockend] and 2809 * ends at (or beyond) lockend. Since we have no file extent items to 2810 * represent holes, drop_end can be less than lockend and so we must 2811 * make sure we have an extent map representing the existing hole (the 2812 * call to __btrfs_drop_extents() might have dropped the existing extent 2813 * map representing the existing hole), otherwise the fast fsync path 2814 * will not record the existence of the hole region 2815 * [existing_hole_start, lockend]. 2816 */ 2817 if (drop_args.drop_end <= end) 2818 drop_args.drop_end = end + 1; 2819 /* 2820 * Don't insert file hole extent item if it's for a range beyond eof 2821 * (because it's useless) or if it represents a 0 byte range (when 2822 * cur_offset == drop_end). 2823 */ 2824 if (!extent_info && cur_offset < ino_size && 2825 cur_offset < drop_args.drop_end) { 2826 ret = fill_holes(trans, inode, path, cur_offset, 2827 drop_args.drop_end); 2828 if (ret) { 2829 /* Same comment as above. */ 2830 btrfs_abort_transaction(trans, ret); 2831 goto out_trans; 2832 } 2833 } else if (!extent_info && cur_offset < drop_args.drop_end) { 2834 /* See the comment in the loop above for the reasoning here. */ 2835 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, 2836 drop_args.drop_end - cur_offset); 2837 if (ret) { 2838 btrfs_abort_transaction(trans, ret); 2839 goto out_trans; 2840 } 2841 2842 } 2843 if (extent_info) { 2844 ret = btrfs_insert_replace_extent(trans, inode, path, 2845 extent_info, extent_info->data_len, 2846 drop_args.bytes_found); 2847 if (ret) { 2848 btrfs_abort_transaction(trans, ret); 2849 goto out_trans; 2850 } 2851 } 2852 2853 out_trans: 2854 if (!trans) 2855 goto out_free; 2856 2857 trans->block_rsv = &fs_info->trans_block_rsv; 2858 if (ret) 2859 btrfs_end_transaction(trans); 2860 else 2861 *trans_out = trans; 2862 out_free: 2863 btrfs_free_block_rsv(fs_info, rsv); 2864 out: 2865 return ret; 2866 } 2867 2868 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) 2869 { 2870 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2871 struct btrfs_root *root = BTRFS_I(inode)->root; 2872 struct extent_state *cached_state = NULL; 2873 struct btrfs_path *path; 2874 struct btrfs_trans_handle *trans = NULL; 2875 u64 lockstart; 2876 u64 lockend; 2877 u64 tail_start; 2878 u64 tail_len; 2879 u64 orig_start = offset; 2880 int ret = 0; 2881 bool same_block; 2882 u64 ino_size; 2883 bool truncated_block = false; 2884 bool updated_inode = false; 2885 2886 ret = btrfs_wait_ordered_range(inode, offset, len); 2887 if (ret) 2888 return ret; 2889 2890 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 2891 ino_size = round_up(inode->i_size, fs_info->sectorsize); 2892 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2893 if (ret < 0) 2894 goto out_only_mutex; 2895 if (ret && !len) { 2896 /* Already in a large hole */ 2897 ret = 0; 2898 goto out_only_mutex; 2899 } 2900 2901 lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); 2902 lockend = round_down(offset + len, 2903 btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; 2904 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) 2905 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); 2906
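/*
 * As a concrete illustration of the same_block computation above, assuming
 * a 4K sectorsize: punching a hole at offset 5000 with len 1000 gives
 * BTRFS_BYTES_TO_BLKS(fs_info, 5000) == 1 and
 * BTRFS_BYTES_TO_BLKS(fs_info, 5999) == 1, so the whole range falls inside
 * a single block and len is smaller than the sectorsize. In that case only
 * a partial block has to be zeroed with btrfs_truncate_block() below and
 * no file extent items need to be dropped.
 */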
/* 2907 * We needn't truncate any block which is beyond the end of the file 2908 * because we are sure there is no data there. 2909 */ 2910 /* 2911 * Only do this if we are in the same block and we aren't doing the 2912 * entire block. 2913 */ 2914 if (same_block && len < fs_info->sectorsize) { 2915 if (offset < ino_size) { 2916 truncated_block = true; 2917 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 2918 0); 2919 } else { 2920 ret = 0; 2921 } 2922 goto out_only_mutex; 2923 } 2924 2925 /* zero back part of the first block */ 2926 if (offset < ino_size) { 2927 truncated_block = true; 2928 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); 2929 if (ret) { 2930 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 2931 return ret; 2932 } 2933 } 2934 2935 /* Check the aligned pages after the first unaligned page, 2936 * if offset != orig_start, which means the first unaligned page 2937 * including several following pages are already in holes, 2938 * the extra check can be skipped */ 2939 if (offset == orig_start) { 2940 /* after truncate page, check hole again */ 2941 len = offset + len - lockstart; 2942 offset = lockstart; 2943 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2944 if (ret < 0) 2945 goto out_only_mutex; 2946 if (ret && !len) { 2947 ret = 0; 2948 goto out_only_mutex; 2949 } 2950 lockstart = offset; 2951 } 2952 2953 /* Check the tail unaligned part is in a hole */ 2954 tail_start = lockend + 1; 2955 tail_len = offset + len - tail_start; 2956 if (tail_len) { 2957 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); 2958 if (unlikely(ret < 0)) 2959 goto out_only_mutex; 2960 if (!ret) { 2961 /* zero the front end of the last page */ 2962 if (tail_start + tail_len < ino_size) { 2963 truncated_block = true; 2964 ret = btrfs_truncate_block(BTRFS_I(inode), 2965 tail_start + tail_len, 2966 0, 1); 2967 if (ret) 2968 goto out_only_mutex; 2969 } 2970 } 2971 } 2972 2973 if (lockend < lockstart) { 2974 ret = 0; 2975 goto out_only_mutex; 2976 } 2977 2978 ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, 2979 &cached_state); 2980 if (ret) 2981 goto out_only_mutex; 2982 2983 path = btrfs_alloc_path(); 2984 if (!path) { 2985 ret = -ENOMEM; 2986 goto out; 2987 } 2988 2989 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, 2990 lockend, NULL, &trans); 2991 btrfs_free_path(path); 2992 if (ret) 2993 goto out; 2994 2995 ASSERT(trans != NULL); 2996 inode_inc_iversion(inode); 2997 inode->i_mtime = inode->i_ctime = current_time(inode); 2998 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 2999 updated_inode = true; 3000 btrfs_end_transaction(trans); 3001 btrfs_btree_balance_dirty(fs_info); 3002 out: 3003 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 3004 &cached_state); 3005 out_only_mutex: 3006 if (!updated_inode && truncated_block && !ret) { 3007 /* 3008 * If we only end up zeroing part of a page, we still need to 3009 * update the inode item, so that all the time fields are 3010 * updated as well as the necessary btrfs inode in memory fields 3011 * for detecting, at fsync time, if the inode isn't yet in the 3012 * log tree or it's there but not up to date. 
3013 */ 3014 struct timespec64 now = current_time(inode); 3015 3016 inode_inc_iversion(inode); 3017 inode->i_mtime = now; 3018 inode->i_ctime = now; 3019 trans = btrfs_start_transaction(root, 1); 3020 if (IS_ERR(trans)) { 3021 ret = PTR_ERR(trans); 3022 } else { 3023 int ret2; 3024 3025 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 3026 ret2 = btrfs_end_transaction(trans); 3027 if (!ret) 3028 ret = ret2; 3029 } 3030 } 3031 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 3032 return ret; 3033 } 3034 3035 /* Helper structure to record which range is already reserved */ 3036 struct falloc_range { 3037 struct list_head list; 3038 u64 start; 3039 u64 len; 3040 }; 3041 3042 /* 3043 * Helper function to add falloc range 3044 * 3045 * Caller should have locked the larger range of extent containing 3046 * [start, len) 3047 */ 3048 static int add_falloc_range(struct list_head *head, u64 start, u64 len) 3049 { 3050 struct falloc_range *range = NULL; 3051 3052 if (!list_empty(head)) { 3053 /* 3054 * As fallocate iterates by bytenr order, we only need to check 3055 * the last range. 3056 */ 3057 range = list_last_entry(head, struct falloc_range, list); 3058 if (range->start + range->len == start) { 3059 range->len += len; 3060 return 0; 3061 } 3062 } 3063 3064 range = kmalloc(sizeof(*range), GFP_KERNEL); 3065 if (!range) 3066 return -ENOMEM; 3067 range->start = start; 3068 range->len = len; 3069 list_add_tail(&range->list, head); 3070 return 0; 3071 } 3072 3073 static int btrfs_fallocate_update_isize(struct inode *inode, 3074 const u64 end, 3075 const int mode) 3076 { 3077 struct btrfs_trans_handle *trans; 3078 struct btrfs_root *root = BTRFS_I(inode)->root; 3079 int ret; 3080 int ret2; 3081 3082 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) 3083 return 0; 3084 3085 trans = btrfs_start_transaction(root, 1); 3086 if (IS_ERR(trans)) 3087 return PTR_ERR(trans); 3088 3089 inode->i_ctime = current_time(inode); 3090 i_size_write(inode, end); 3091 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 3092 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 3093 ret2 = btrfs_end_transaction(trans); 3094 3095 return ret ? 
ret : ret2; 3096 } 3097 3098 enum { 3099 RANGE_BOUNDARY_WRITTEN_EXTENT, 3100 RANGE_BOUNDARY_PREALLOC_EXTENT, 3101 RANGE_BOUNDARY_HOLE, 3102 }; 3103 3104 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, 3105 u64 offset) 3106 { 3107 const u64 sectorsize = btrfs_inode_sectorsize(inode); 3108 struct extent_map *em; 3109 int ret; 3110 3111 offset = round_down(offset, sectorsize); 3112 em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); 3113 if (IS_ERR(em)) 3114 return PTR_ERR(em); 3115 3116 if (em->block_start == EXTENT_MAP_HOLE) 3117 ret = RANGE_BOUNDARY_HOLE; 3118 else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3119 ret = RANGE_BOUNDARY_PREALLOC_EXTENT; 3120 else 3121 ret = RANGE_BOUNDARY_WRITTEN_EXTENT; 3122 3123 free_extent_map(em); 3124 return ret; 3125 } 3126 3127 static int btrfs_zero_range(struct inode *inode, 3128 loff_t offset, 3129 loff_t len, 3130 const int mode) 3131 { 3132 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 3133 struct extent_map *em; 3134 struct extent_changeset *data_reserved = NULL; 3135 int ret; 3136 u64 alloc_hint = 0; 3137 const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); 3138 u64 alloc_start = round_down(offset, sectorsize); 3139 u64 alloc_end = round_up(offset + len, sectorsize); 3140 u64 bytes_to_reserve = 0; 3141 bool space_reserved = false; 3142 3143 inode_dio_wait(inode); 3144 3145 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, 3146 alloc_end - alloc_start); 3147 if (IS_ERR(em)) { 3148 ret = PTR_ERR(em); 3149 goto out; 3150 } 3151 3152 /* 3153 * Avoid hole punching and extent allocation for some cases. More cases 3154 * could be considered, but these are unlikely common and we keep things 3155 * as simple as possible for now. Also, intentionally, if the target 3156 * range contains one or more prealloc extents together with regular 3157 * extents and holes, we drop all the existing extents and allocate a 3158 * new prealloc extent, so that we get a larger contiguous disk extent. 3159 */ 3160 if (em->start <= alloc_start && 3161 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3162 const u64 em_end = em->start + em->len; 3163 3164 if (em_end >= offset + len) { 3165 /* 3166 * The whole range is already a prealloc extent, 3167 * do nothing except updating the inode's i_size if 3168 * needed. 3169 */ 3170 free_extent_map(em); 3171 ret = btrfs_fallocate_update_isize(inode, offset + len, 3172 mode); 3173 goto out; 3174 } 3175 /* 3176 * Part of the range is already a prealloc extent, so operate 3177 * only on the remaining part of the range. 
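 *
 * As an illustration (values chosen only for the example and assumed to
 * be sectorsize aligned): if an existing prealloc extent covers the file
 * range [0, 128K) and the requested zero range is [64K, 256K), the part
 * up to 128K is already preallocated, so alloc_start is bumped to 128K
 * and only [128K, 256K) is processed below.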
3178 */ 3179 alloc_start = em_end; 3180 ASSERT(IS_ALIGNED(alloc_start, sectorsize)); 3181 len = offset + len - alloc_start; 3182 offset = alloc_start; 3183 alloc_hint = em->block_start + em->len; 3184 } 3185 free_extent_map(em); 3186 3187 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == 3188 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { 3189 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, 3190 sectorsize); 3191 if (IS_ERR(em)) { 3192 ret = PTR_ERR(em); 3193 goto out; 3194 } 3195 3196 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3197 free_extent_map(em); 3198 ret = btrfs_fallocate_update_isize(inode, offset + len, 3199 mode); 3200 goto out; 3201 } 3202 if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { 3203 free_extent_map(em); 3204 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 3205 0); 3206 if (!ret) 3207 ret = btrfs_fallocate_update_isize(inode, 3208 offset + len, 3209 mode); 3210 return ret; 3211 } 3212 free_extent_map(em); 3213 alloc_start = round_down(offset, sectorsize); 3214 alloc_end = alloc_start + sectorsize; 3215 goto reserve_space; 3216 } 3217 3218 alloc_start = round_up(offset, sectorsize); 3219 alloc_end = round_down(offset + len, sectorsize); 3220 3221 /* 3222 * For unaligned ranges, check the pages at the boundaries, they might 3223 * map to an extent, in which case we need to partially zero them, or 3224 * they might map to a hole, in which case we need our allocation range 3225 * to cover them. 3226 */ 3227 if (!IS_ALIGNED(offset, sectorsize)) { 3228 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3229 offset); 3230 if (ret < 0) 3231 goto out; 3232 if (ret == RANGE_BOUNDARY_HOLE) { 3233 alloc_start = round_down(offset, sectorsize); 3234 ret = 0; 3235 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3236 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); 3237 if (ret) 3238 goto out; 3239 } else { 3240 ret = 0; 3241 } 3242 } 3243 3244 if (!IS_ALIGNED(offset + len, sectorsize)) { 3245 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 3246 offset + len); 3247 if (ret < 0) 3248 goto out; 3249 if (ret == RANGE_BOUNDARY_HOLE) { 3250 alloc_end = round_up(offset + len, sectorsize); 3251 ret = 0; 3252 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 3253 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, 3254 0, 1); 3255 if (ret) 3256 goto out; 3257 } else { 3258 ret = 0; 3259 } 3260 } 3261 3262 reserve_space: 3263 if (alloc_start < alloc_end) { 3264 struct extent_state *cached_state = NULL; 3265 const u64 lockstart = alloc_start; 3266 const u64 lockend = alloc_end - 1; 3267 3268 bytes_to_reserve = alloc_end - alloc_start; 3269 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3270 bytes_to_reserve); 3271 if (ret < 0) 3272 goto out; 3273 space_reserved = true; 3274 ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3275 &cached_state); 3276 if (ret) 3277 goto out; 3278 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 3279 alloc_start, bytes_to_reserve); 3280 if (ret) { 3281 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 3282 lockend, &cached_state); 3283 goto out; 3284 } 3285 ret = btrfs_prealloc_file_range(inode, mode, alloc_start, 3286 alloc_end - alloc_start, 3287 i_blocksize(inode), 3288 offset + len, &alloc_hint); 3289 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 3290 lockend, &cached_state); 3291 /* btrfs_prealloc_file_range releases reserved space on error */ 3292 if (ret) { 3293 space_reserved = false; 3294 goto out; 3295 } 3296 } 3297 ret = 
btrfs_fallocate_update_isize(inode, offset + len, mode); 3298 out: 3299 if (ret && space_reserved) 3300 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 3301 alloc_start, bytes_to_reserve); 3302 extent_changeset_free(data_reserved); 3303 3304 return ret; 3305 } 3306 3307 static long btrfs_fallocate(struct file *file, int mode, 3308 loff_t offset, loff_t len) 3309 { 3310 struct inode *inode = file_inode(file); 3311 struct extent_state *cached_state = NULL; 3312 struct extent_changeset *data_reserved = NULL; 3313 struct falloc_range *range; 3314 struct falloc_range *tmp; 3315 struct list_head reserve_list; 3316 u64 cur_offset; 3317 u64 last_byte; 3318 u64 alloc_start; 3319 u64 alloc_end; 3320 u64 alloc_hint = 0; 3321 u64 locked_end; 3322 u64 actual_end = 0; 3323 struct extent_map *em; 3324 int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); 3325 int ret; 3326 3327 /* Do not allow fallocate in ZONED mode */ 3328 if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) 3329 return -EOPNOTSUPP; 3330 3331 alloc_start = round_down(offset, blocksize); 3332 alloc_end = round_up(offset + len, blocksize); 3333 cur_offset = alloc_start; 3334 3335 /* Make sure we aren't being given some crap mode */ 3336 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 3337 FALLOC_FL_ZERO_RANGE)) 3338 return -EOPNOTSUPP; 3339 3340 if (mode & FALLOC_FL_PUNCH_HOLE) 3341 return btrfs_punch_hole(inode, offset, len); 3342 3343 /* 3344 * Only trigger disk allocation, don't trigger qgroup reserve 3345 * 3346 * For qgroup space, it will be checked later. 3347 */ 3348 if (!(mode & FALLOC_FL_ZERO_RANGE)) { 3349 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3350 alloc_end - alloc_start); 3351 if (ret < 0) 3352 return ret; 3353 } 3354 3355 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 3356 3357 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { 3358 ret = inode_newsize_ok(inode, offset + len); 3359 if (ret) 3360 goto out; 3361 } 3362 3363 /* 3364 * TODO: Move these two operations after we have checked the accurate 3365 * reserved space, or fallocate can still fail but with the page 3366 * truncated or the size expanded. 3367 * 3368 * But that's a minor problem and won't do much harm BTW. 3369 */ 3370 if (alloc_start > inode->i_size) { 3371 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), 3372 alloc_start); 3373 if (ret) 3374 goto out; 3375 } else if (offset + len > inode->i_size) { 3376 /* 3377 * If we are fallocating from the end of the file onward we 3378 * need to zero out the end of the block if i_size lands in the 3379 * middle of a block. 3380 */ 3381 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); 3382 if (ret) 3383 goto out; 3384 } 3385 3386 /* 3387 * Wait for ordered IO before we have any locks. We'll loop again 3388 * below with the locks held.
3389 */ 3390 ret = btrfs_wait_ordered_range(inode, alloc_start, 3391 alloc_end - alloc_start); 3392 if (ret) 3393 goto out; 3394 3395 if (mode & FALLOC_FL_ZERO_RANGE) { 3396 ret = btrfs_zero_range(inode, offset, len, mode); 3397 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 3398 return ret; 3399 } 3400 3401 locked_end = alloc_end - 1; 3402 while (1) { 3403 struct btrfs_ordered_extent *ordered; 3404 3405 /* the extent lock is ordered inside the running 3406 * transaction 3407 */ 3408 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 3409 locked_end, &cached_state); 3410 ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), 3411 locked_end); 3412 3413 if (ordered && 3414 ordered->file_offset + ordered->num_bytes > alloc_start && 3415 ordered->file_offset < alloc_end) { 3416 btrfs_put_ordered_extent(ordered); 3417 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 3418 alloc_start, locked_end, 3419 &cached_state); 3420 /* 3421 * we can't wait on the range with the transaction 3422 * running or with the extent lock held 3423 */ 3424 ret = btrfs_wait_ordered_range(inode, alloc_start, 3425 alloc_end - alloc_start); 3426 if (ret) 3427 goto out; 3428 } else { 3429 if (ordered) 3430 btrfs_put_ordered_extent(ordered); 3431 break; 3432 } 3433 } 3434 3435 /* First, check if we exceed the qgroup limit */ 3436 INIT_LIST_HEAD(&reserve_list); 3437 while (cur_offset < alloc_end) { 3438 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, 3439 alloc_end - cur_offset); 3440 if (IS_ERR(em)) { 3441 ret = PTR_ERR(em); 3442 break; 3443 } 3444 last_byte = min(extent_map_end(em), alloc_end); 3445 actual_end = min_t(u64, extent_map_end(em), offset + len); 3446 last_byte = ALIGN(last_byte, blocksize); 3447 if (em->block_start == EXTENT_MAP_HOLE || 3448 (cur_offset >= inode->i_size && 3449 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 3450 ret = add_falloc_range(&reserve_list, cur_offset, 3451 last_byte - cur_offset); 3452 if (ret < 0) { 3453 free_extent_map(em); 3454 break; 3455 } 3456 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 3457 &data_reserved, cur_offset, 3458 last_byte - cur_offset); 3459 if (ret < 0) { 3460 cur_offset = last_byte; 3461 free_extent_map(em); 3462 break; 3463 } 3464 } else { 3465 /* 3466 * Do not need to reserve unwritten extent for this 3467 * range, free reserved data space first, otherwise 3468 * it'll result in false ENOSPC error. 3469 */ 3470 btrfs_free_reserved_data_space(BTRFS_I(inode), 3471 data_reserved, cur_offset, 3472 last_byte - cur_offset); 3473 } 3474 free_extent_map(em); 3475 cur_offset = last_byte; 3476 } 3477 3478 /* 3479 * If ret is still 0, means we're OK to fallocate. 3480 * Or just cleanup the list and exit. 3481 */ 3482 list_for_each_entry_safe(range, tmp, &reserve_list, list) { 3483 if (!ret) 3484 ret = btrfs_prealloc_file_range(inode, mode, 3485 range->start, 3486 range->len, i_blocksize(inode), 3487 offset + len, &alloc_hint); 3488 else 3489 btrfs_free_reserved_data_space(BTRFS_I(inode), 3490 data_reserved, range->start, 3491 range->len); 3492 list_del(&range->list); 3493 kfree(range); 3494 } 3495 if (ret < 0) 3496 goto out_unlock; 3497 3498 /* 3499 * We didn't need to allocate any more space, but we still extended the 3500 * size of the file so we need to update i_size and the inode item. 
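 *
 * This happens, for example, when the whole requested range was already
 * allocated but reaches beyond the current i_size and FALLOC_FL_KEEP_SIZE
 * was not given: nothing is preallocated above, yet i_size and the inode
 * item still have to be moved forward.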
3501 */ 3502 ret = btrfs_fallocate_update_isize(inode, actual_end, mode); 3503 out_unlock: 3504 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3505 &cached_state); 3506 out: 3507 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 3508 /* Let go of our reservation. */ 3509 if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) 3510 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 3511 cur_offset, alloc_end - cur_offset); 3512 extent_changeset_free(data_reserved); 3513 return ret; 3514 } 3515 3516 static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, 3517 int whence) 3518 { 3519 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3520 struct extent_map *em = NULL; 3521 struct extent_state *cached_state = NULL; 3522 loff_t i_size = inode->vfs_inode.i_size; 3523 u64 lockstart; 3524 u64 lockend; 3525 u64 start; 3526 u64 len; 3527 int ret = 0; 3528 3529 if (i_size == 0 || offset >= i_size) 3530 return -ENXIO; 3531 3532 /* 3533 * offset can be negative, in this case we start finding DATA/HOLE from 3534 * the very start of the file. 3535 */ 3536 start = max_t(loff_t, 0, offset); 3537 3538 lockstart = round_down(start, fs_info->sectorsize); 3539 lockend = round_up(i_size, fs_info->sectorsize); 3540 if (lockend <= lockstart) 3541 lockend = lockstart + fs_info->sectorsize; 3542 lockend--; 3543 len = lockend - lockstart + 1; 3544 3545 lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); 3546 3547 while (start < i_size) { 3548 em = btrfs_get_extent_fiemap(inode, start, len); 3549 if (IS_ERR(em)) { 3550 ret = PTR_ERR(em); 3551 em = NULL; 3552 break; 3553 } 3554 3555 if (whence == SEEK_HOLE && 3556 (em->block_start == EXTENT_MAP_HOLE || 3557 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 3558 break; 3559 else if (whence == SEEK_DATA && 3560 (em->block_start != EXTENT_MAP_HOLE && 3561 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) 3562 break; 3563 3564 start = em->start + em->len; 3565 free_extent_map(em); 3566 em = NULL; 3567 cond_resched(); 3568 } 3569 free_extent_map(em); 3570 unlock_extent_cached(&inode->io_tree, lockstart, lockend, 3571 &cached_state); 3572 if (ret) { 3573 offset = ret; 3574 } else { 3575 if (whence == SEEK_DATA && start >= i_size) 3576 offset = -ENXIO; 3577 else 3578 offset = min_t(loff_t, start, i_size); 3579 } 3580 3581 return offset; 3582 } 3583 3584 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) 3585 { 3586 struct inode *inode = file->f_mapping->host; 3587 3588 switch (whence) { 3589 default: 3590 return generic_file_llseek(file, offset, whence); 3591 case SEEK_DATA: 3592 case SEEK_HOLE: 3593 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); 3594 offset = find_desired_extent(BTRFS_I(inode), offset, whence); 3595 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 3596 break; 3597 } 3598 3599 if (offset < 0) 3600 return offset; 3601 3602 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 3603 } 3604 3605 static int btrfs_file_open(struct inode *inode, struct file *filp) 3606 { 3607 filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; 3608 return generic_file_open(inode, filp); 3609 } 3610 3611 static int check_direct_read(struct btrfs_fs_info *fs_info, 3612 const struct iov_iter *iter, loff_t offset) 3613 { 3614 int ret; 3615 int i, seg; 3616 3617 ret = check_direct_IO(fs_info, iter, offset); 3618 if (ret < 0) 3619 return ret; 3620 3621 if (!iter_is_iovec(iter)) 3622 return 0; 3623 3624 for (seg = 0; seg < iter->nr_segs; seg++) 3625 for (i = seg + 1; i < iter->nr_segs; i++) 3626 if 
(iter->iov[seg].iov_base == iter->iov[i].iov_base) 3627 return -EINVAL; 3628 return 0; 3629 } 3630 3631 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) 3632 { 3633 struct inode *inode = file_inode(iocb->ki_filp); 3634 ssize_t ret; 3635 3636 if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) 3637 return 0; 3638 3639 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); 3640 ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); 3641 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 3642 return ret; 3643 } 3644 3645 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 3646 { 3647 ssize_t ret = 0; 3648 3649 if (iocb->ki_flags & IOCB_DIRECT) { 3650 ret = btrfs_direct_read(iocb, to); 3651 if (ret < 0 || !iov_iter_count(to) || 3652 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) 3653 return ret; 3654 } 3655 3656 return filemap_read(iocb, to, ret); 3657 } 3658 3659 const struct file_operations btrfs_file_operations = { 3660 .llseek = btrfs_file_llseek, 3661 .read_iter = btrfs_file_read_iter, 3662 .splice_read = generic_file_splice_read, 3663 .write_iter = btrfs_file_write_iter, 3664 .splice_write = iter_file_splice_write, 3665 .mmap = btrfs_file_mmap, 3666 .open = btrfs_file_open, 3667 .release = btrfs_release_file, 3668 .fsync = btrfs_sync_file, 3669 .fallocate = btrfs_fallocate, 3670 .unlocked_ioctl = btrfs_ioctl, 3671 #ifdef CONFIG_COMPAT 3672 .compat_ioctl = btrfs_compat_ioctl, 3673 #endif 3674 .remap_file_range = btrfs_remap_file_range, 3675 }; 3676 3677 void __cold btrfs_auto_defrag_exit(void) 3678 { 3679 kmem_cache_destroy(btrfs_inode_defrag_cachep); 3680 } 3681 3682 int __init btrfs_auto_defrag_init(void) 3683 { 3684 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 3685 sizeof(struct inode_defrag), 0, 3686 SLAB_MEM_SPREAD, 3687 NULL); 3688 if (!btrfs_inode_defrag_cachep) 3689 return -ENOMEM; 3690 3691 return 0; 3692 } 3693 3694 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) 3695 { 3696 int ret; 3697 3698 /* 3699 * So with compression we will find and lock a dirty page and clear the 3700 * first one as dirty, setup an async extent, and immediately return 3701 * with the entire range locked but with nobody actually marked with 3702 * writeback. So we can't just filemap_write_and_wait_range() and 3703 * expect it to work since it will just kick off a thread to do the 3704 * actual work. So we need to call filemap_fdatawrite_range _again_ 3705 * since it will wait on the page lock, which won't be unlocked until 3706 * after the pages have been marked as writeback and so we're good to go 3707 * from there. We have to do this otherwise we'll miss the ordered 3708 * extents and that results in badness. Please Josef, do not think you 3709 * know better and pull this out at some point in the future, it is 3710 * right and you are wrong. 3711 */ 3712 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 3713 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 3714 &BTRFS_I(inode)->runtime_flags)) 3715 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 3716 3717 return ret; 3718 } 3719