// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 reserve;
	loff_t length;
	ssize_t submitted;
	struct extent_changeset *data_reserved;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct btrfs_inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 *
 * ilock_flags can have the following bit set:
 *
 *  BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 *  BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 *		      return -EAGAIN
 *  BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&BTRFS_I(inode)->i_mmap_lock);
	return 0;
}

/*
 * btrfs_inode_unlock - unlock inode i_rwsem
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&BTRFS_I(inode)->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
}

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

	struct page *page;

	while (index <= end_index) {
		/*
		 * For locked page, we will call end_extent_writepage() on it
		 * in run_delalloc_range() for the error handling.  That
		 * end_extent_writepage() function will call
		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
		 * run the ordered extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
			index++;
			continue;
		}
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		index++;
		if (!page)
			continue;

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then __endio_write_update_ordered() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
					       offset, bytes);
		put_page(page);
	}

	/* The locked page covers the full range, nothing needs to be done */
	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
		return;
	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
		offset = page_offset(locked_page) + PAGE_SIZE;
	}

	return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, bool extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = page_address(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = offset_in_page(start);
		write_extent_buffer(leaf, kaddr + offset,
				    ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity
	 * sake.
	 */
	size = ALIGN(size, root->fs_info->sectorsize);
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
	if (ret)
		goto fail;

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(&inode->vfs_inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	drop_args.path = path;
	drop_args.start = start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;

	if (compressed_size && compressed_pages)
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
				   root, &inode->vfs_inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, inode);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	atomic_t *pending;
};

struct async_cow {
	/* Number of chunks in flight; must be first in the structure */
	atomic_t num_chunks;
	struct async_chunk chunks[];
};

static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
	/* Subpage doesn't support compression yet */
	if (inode->root->fs_info->sectorsize < PAGE_SIZE)
		return false;
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(inode));
		return 0;
	}
	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int compressed_extents = 0;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it.  This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range(<=blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(BTRFS_I(inode), start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = offset_in_page(total_compressed);
			struct page *page = pages[nr_pages - 1];

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset)
				memzero_page(page, offset, PAGE_SIZE - offset);
			will_compress = 1;
		}
	}
cont:
	/*
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for subpage case.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		/* lets try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
						    0, BTRFS_COMPRESS_NONE,
						    NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
						     NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_START_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);

			/*
			 * Ensure we only free the compressed pages if we have
			 * them allocated, as we can still reach here with
			 * inode_need_compress() == false.
			 */
			if (pages) {
				for (i = 0; i < nr_pages; i++) {
					WARN_ON(pages[i]->mapping);
					put_page(pages[i]);
				}
				kfree(pages);
			}
			return 0;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			compressed_extents++;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_chunk, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return compressed_extents;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page) <= end)) {
		__set_page_dirty_nobuffers(async_chunk->locked_page);
		/* unlocked later on in the async handlers */
	}

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	compressed_extents++;

	return compressed_extents;
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = inode->root;
	struct extent_io_tree *io_tree = &inode->io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_chunk->extents)) {
		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

retry:
		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			/* allocate blocks */
			ret = cow_file_range(inode, async_chunk->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(&inode->vfs_inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret && async_chunk->locked_page)
				unlock_page(async_chunk->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(&inode->vfs_inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
				async_extent->start,
				ins.objectid,
				async_extent->ram_size,
				ins.offset,
				async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_START_WRITEBACK);
		if (btrfs_submit_compressed_write(inode, async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_chunk->write_flags,
				    async_chunk->blkcg_css)) {
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->vfs_inode.i_mapping;
			btrfs_writepage_endio_finish_ordered(inode, p, start,
							     end, false);

			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 min_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of page, that means data writeback
	 * is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, start, end, 0,
					    BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't use page_started to determine if it's an
			 * inline extent or a compressed extent.
			 */
			unlock_page(locked_page);
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   min_alloc_size, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size,
					       BTRFS_ORDERED_REGULAR);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
		 *
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_ORDERED;

		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	int compressed_extents;

	async_chunk = container_of(work, struct async_chunk, work);

	compressed_extents = compress_file_range(async_chunk);
	if (compressed_extents == 0) {
		btrfs_add_delayed_iput(async_chunk->inode);
		async_chunk->inode = NULL;
	}
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						       work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	unsigned long nr_pages;

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * ->inode could be NULL if async_cow_start has failed to compress,
	 * in which case we don't have anything to submit, yet we need to
	 * always adjust ->async_delalloc_pages as it's paired with the init
	 * happening in cow_file_range_async
	 */
	if (async_chunk->inode)
		submit_compressed_extents(async_chunk);

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;

	async_chunk = container_of(work, struct async_chunk, work);
	if (async_chunk->inode)
		btrfs_add_delayed_iput(async_chunk->inode);
	if (async_chunk->blkcg_css)
		css_put(async_chunk->blkcg_css);
	/*
	 * Since the pointer to 'pending' is at the beginning of the array of
	 * async_chunk's, freeing it ensures the whole array has been freed.
	 */
	if (atomic_dec_and_test(async_chunk->pending))
		kvfree(async_chunk->pending);
}

static int cow_file_range_async(struct btrfs_inode *inode,
				struct writeback_control *wbc,
				struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 cur_end;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	bool should_compress;
	unsigned nofs_flag;
	const unsigned int write_flags = wbc_to_write_flags(wbc);

	unlock_extent(&inode->io_tree, start, end);

	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
		num_chunks = 1;
		should_compress = false;
	} else {
		should_compress = true;
	}

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	if (!ctx) {
		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
			EXTENT_DO_ACCOUNTING;
		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
			PAGE_END_WRITEBACK | PAGE_SET_ERROR;

		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
		return -ENOMEM;
	}

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		if (should_compress)
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
		ihold(&inode->vfs_inode);
		async_chunk[i].pending = &ctx->num_chunks;
		async_chunk[i].inode = &inode->vfs_inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and it's
		 * the original page we were actually given.  As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to locked_page
		 *
		 * This way we don't need racy decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async.  We want all of them to
			 * be accounted against wbc once.  Let's do it here
			 * before the paths diverge.  wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy.  Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

		btrfs_init_work(&async_chunk[i].work, async_cow_start,
				async_cow_submit, async_cow_free);

		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
				       struct page *locked_page, u64 start,
				       u64 end, int *page_started,
				       unsigned long *nr_written)
{
	int ret;

	ret = cow_file_range(inode, locked_page, start, end, page_started,
			     nr_written, 0);
	if (ret)
		return ret;

	if (*page_started)
		return 0;

	__set_page_dirty_nobuffers(locked_page);
	account_page_redirty(locked_page);
	extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
	*page_started = 1;

	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	if (ret < 0)
		return ret;
	return 1;
}

static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
			   const u64 start, const u64 end,
			   int *page_started, unsigned long *nr_written)
{
	const bool is_space_ino = btrfs_is_free_space_inode(inode);
	const bool is_reloc_ino = (inode->root->root_key.objectid ==
				   BTRFS_DATA_RELOC_TREE_OBJECTID);
	const u64 range_bytes = end + 1 - start;
	struct extent_io_tree *io_tree = &inode->io_tree;
	u64 range_start = start;
	u64 count;

	/*
	 * If EXTENT_NORESERVE is set it means that when the buffered write was
	 * made we had not enough available data space and therefore we did not
	 * reserve data space for it, since we thought we could do NOCOW for the
	 * respective file range (either there is prealloc extent or the inode
	 * has the NOCOW bit set).
	 *
	 * However when we need to fallback to COW mode (because for example the
	 * block group for the corresponding extent was turned to RO mode by a
	 * scrub or relocation) we need to do the following:
	 *
	 * 1) We increment the bytes_may_use counter of the data space info.
	 *    If COW succeeds, it allocates a new data extent and after doing
	 *    that it decrements the space info's bytes_may_use counter and
	 *    increments its bytes_reserved counter by the same amount (we do
	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
	 *    bytes_may_use counter to compensate (when space is reserved at
	 *    buffered write time, the bytes_may_use counter is incremented);
	 *
	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
	 *    that if the COW path fails for any reason, it decrements (through
	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
	 *    data space info, which we incremented in the step above.
	 *
	 * If we need to fallback to cow and the inode corresponds to a free
	 * space cache inode or an inode of the data relocation tree, we must
	 * also increment bytes_may_use of the data space_info for the same
	 * reason. Space caches and relocated data extents always get a prealloc
	 * extent for them, however scrub or balance may have set the block
	 * group that contains that extent to RO mode and therefore force COW
	 * when starting writeback.
	 */
	count = count_range_bits(io_tree, &range_start, end, range_bytes,
				 EXTENT_NORESERVE, 0);
	if (count > 0 || is_space_ino || is_reloc_ino) {
		u64 bytes = count;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct btrfs_space_info *sinfo = fs_info->data_sinfo;

		if (is_space_ino || is_reloc_ino)
			bytes = range_bytes;

		spin_lock(&sinfo->lock);
		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
		spin_unlock(&sinfo->lock);

		if (count > 0)
			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
					 0, 0, NULL);
	}

	return cow_file_range(inode, locked_page, start, end, page_started,
			      nr_written, 1);
}

/*
 * when nocow writeback call back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end,
				       int *page_started,
				       unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
	const bool freespace_inode = btrfs_is_free_space_inode(inode);
	u64 ino = btrfs_ino(inode);
	bool nocow = false;
	u64 disk_bytenr = 0;
	const bool force = inode->flags & BTRFS_INODE_NODATACOW;

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	while (1) {
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		u64 extent_end;
		u64 extent_offset;
		u64 num_bytes = 0;
		u64 disk_num_bytes;
		u64 ram_bytes;
		int extent_type;

		nocow = false;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				if (cow_start != (u64)-1)
					cur_offset = cow_start;
				goto error;
			}
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersects it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			/*
			 * If the extent we got ends before our current offset,
			 * skip to the next extent.
			 */
			if (extent_end <= cur_offset) {
				path->slots[0]++;
				goto next_slot;
			}
			/* Skip holes */
			if (disk_bytenr == 0)
				goto out_check;
			/* Skip compressed/encrypted/encoded extents */
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			/*
			 * If extent is created before the last volume's snapshot
			 * this implies the extent is shared, hence we can't do
			 * nocow. This is the same check as in
			 * btrfs_cross_ref_exist but without calling
			 * btrfs_search_slot.
			 */
			if (!freespace_inode &&
			    btrfs_file_extent_generation(leaf, fi) <=
			    btrfs_root_last_snapshot(&root->root_item))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;

			/*
			 * The following checks can be expensive, as they need to
			 * take other locks and do btree or rbtree searches, so
			 * release the path to avoid blocking other tasks for too
			 * long.
			 */
			btrfs_release_path(path);

			ret = btrfs_cross_ref_exist(root, ino,
						    found_key.offset -
						    extent_offset, disk_bytenr, false);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}

				WARN_ON_ONCE(freespace_inode);
				goto out_check;
			}
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * If there are pending snapshots for this root, we
			 * fall into common COW way
			 */
			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
				goto out_check;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			ret = csum_exist_in_range(fs_info, disk_bytenr,
						  num_bytes);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
1774 */ 1775 if (ret < 0) { 1776 if (cow_start != (u64)-1) 1777 cur_offset = cow_start; 1778 goto error; 1779 } 1780 WARN_ON_ONCE(freespace_inode); 1781 goto out_check; 1782 } 1783 /* If the extent's block group is RO, we must COW */ 1784 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) 1785 goto out_check; 1786 nocow = true; 1787 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1788 extent_end = found_key.offset + ram_bytes; 1789 extent_end = ALIGN(extent_end, fs_info->sectorsize); 1790 /* Skip extents outside of our requested range */ 1791 if (extent_end <= start) { 1792 path->slots[0]++; 1793 goto next_slot; 1794 } 1795 } else { 1796 /* If this triggers then we have a memory corruption */ 1797 BUG(); 1798 } 1799 out_check: 1800 /* 1801 * If nocow is false then record the beginning of the range 1802 * that needs to be COWed 1803 */ 1804 if (!nocow) { 1805 if (cow_start == (u64)-1) 1806 cow_start = cur_offset; 1807 cur_offset = extent_end; 1808 if (cur_offset > end) 1809 break; 1810 if (!path->nodes[0]) 1811 continue; 1812 path->slots[0]++; 1813 goto next_slot; 1814 } 1815 1816 /* 1817 * COW range from cow_start to found_key.offset - 1. As the key 1818 * will contain the beginning of the first extent that can be 1819 * NOCOW, following one which needs to be COW'ed 1820 */ 1821 if (cow_start != (u64)-1) { 1822 ret = fallback_to_cow(inode, locked_page, 1823 cow_start, found_key.offset - 1, 1824 page_started, nr_written); 1825 if (ret) 1826 goto error; 1827 cow_start = (u64)-1; 1828 } 1829 1830 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1831 u64 orig_start = found_key.offset - extent_offset; 1832 struct extent_map *em; 1833 1834 em = create_io_em(inode, cur_offset, num_bytes, 1835 orig_start, 1836 disk_bytenr, /* block_start */ 1837 num_bytes, /* block_len */ 1838 disk_num_bytes, /* orig_block_len */ 1839 ram_bytes, BTRFS_COMPRESS_NONE, 1840 BTRFS_ORDERED_PREALLOC); 1841 if (IS_ERR(em)) { 1842 ret = PTR_ERR(em); 1843 goto error; 1844 } 1845 free_extent_map(em); 1846 ret = btrfs_add_ordered_extent(inode, cur_offset, 1847 disk_bytenr, num_bytes, 1848 num_bytes, 1849 BTRFS_ORDERED_PREALLOC); 1850 if (ret) { 1851 btrfs_drop_extent_cache(inode, cur_offset, 1852 cur_offset + num_bytes - 1, 1853 0); 1854 goto error; 1855 } 1856 } else { 1857 ret = btrfs_add_ordered_extent(inode, cur_offset, 1858 disk_bytenr, num_bytes, 1859 num_bytes, 1860 BTRFS_ORDERED_NOCOW); 1861 if (ret) 1862 goto error; 1863 } 1864 1865 if (nocow) 1866 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1867 nocow = false; 1868 1869 if (root->root_key.objectid == 1870 BTRFS_DATA_RELOC_TREE_OBJECTID) 1871 /* 1872 * Error handled later, as we must prevent 1873 * extent_clear_unlock_delalloc() in error handler 1874 * from freeing metadata of created ordered extent. 1875 */ 1876 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1877 num_bytes); 1878 1879 extent_clear_unlock_delalloc(inode, cur_offset, 1880 cur_offset + num_bytes - 1, 1881 locked_page, EXTENT_LOCKED | 1882 EXTENT_DELALLOC | 1883 EXTENT_CLEAR_DATA_RESV, 1884 PAGE_UNLOCK | PAGE_SET_ORDERED); 1885 1886 cur_offset = extent_end; 1887 1888 /* 1889 * btrfs_reloc_clone_csums() error, now we're OK to call error 1890 * handler, as metadata for created ordered extent will only 1891 * be freed by btrfs_finish_ordered_io(). 
1892 */ 1893 if (ret) 1894 goto error; 1895 if (cur_offset > end) 1896 break; 1897 } 1898 btrfs_release_path(path); 1899 1900 if (cur_offset <= end && cow_start == (u64)-1) 1901 cow_start = cur_offset; 1902 1903 if (cow_start != (u64)-1) { 1904 cur_offset = end; 1905 ret = fallback_to_cow(inode, locked_page, cow_start, end, 1906 page_started, nr_written); 1907 if (ret) 1908 goto error; 1909 } 1910 1911 error: 1912 if (nocow) 1913 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1914 1915 if (ret && cur_offset < end) 1916 extent_clear_unlock_delalloc(inode, cur_offset, end, 1917 locked_page, EXTENT_LOCKED | 1918 EXTENT_DELALLOC | EXTENT_DEFRAG | 1919 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1920 PAGE_START_WRITEBACK | 1921 PAGE_END_WRITEBACK); 1922 btrfs_free_path(path); 1923 return ret; 1924 } 1925 1926 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) 1927 { 1928 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { 1929 if (inode->defrag_bytes && 1930 test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 1931 0, NULL)) 1932 return false; 1933 return true; 1934 } 1935 return false; 1936 } 1937 1938 /* 1939 * Function to process delayed allocation (create CoW) for ranges which are 1940 * being touched for the first time. 1941 */ 1942 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, 1943 u64 start, u64 end, int *page_started, unsigned long *nr_written, 1944 struct writeback_control *wbc) 1945 { 1946 int ret; 1947 const bool zoned = btrfs_is_zoned(inode->root->fs_info); 1948 1949 if (should_nocow(inode, start, end)) { 1950 ASSERT(!zoned); 1951 ret = run_delalloc_nocow(inode, locked_page, start, end, 1952 page_started, nr_written); 1953 } else if (!inode_can_compress(inode) || 1954 !inode_need_compress(inode, start, end)) { 1955 if (zoned) 1956 ret = run_delalloc_zoned(inode, locked_page, start, end, 1957 page_started, nr_written); 1958 else 1959 ret = cow_file_range(inode, locked_page, start, end, 1960 page_started, nr_written, 1); 1961 } else { 1962 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); 1963 ret = cow_file_range_async(inode, wbc, locked_page, start, end, 1964 page_started, nr_written); 1965 } 1966 ASSERT(ret <= 0); 1967 if (ret) 1968 btrfs_cleanup_ordered_extents(inode, locked_page, start, 1969 end - start + 1); 1970 return ret; 1971 } 1972 1973 void btrfs_split_delalloc_extent(struct inode *inode, 1974 struct extent_state *orig, u64 split) 1975 { 1976 u64 size; 1977 1978 /* not delalloc, ignore it */ 1979 if (!(orig->state & EXTENT_DELALLOC)) 1980 return; 1981 1982 size = orig->end - orig->start + 1; 1983 if (size > BTRFS_MAX_EXTENT_SIZE) { 1984 u32 num_extents; 1985 u64 new_size; 1986 1987 /* 1988 * See the explanation in btrfs_merge_delalloc_extent, the same 1989 * applies here, just in reverse. 1990 */ 1991 new_size = orig->end - split + 1; 1992 num_extents = count_max_extents(new_size); 1993 new_size = split - orig->start; 1994 num_extents += count_max_extents(new_size); 1995 if (count_max_extents(size) >= num_extents) 1996 return; 1997 } 1998 1999 spin_lock(&BTRFS_I(inode)->lock); 2000 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 2001 spin_unlock(&BTRFS_I(inode)->lock); 2002 } 2003 2004 /* 2005 * Handle merged delayed allocation extents so we can keep track of new extents 2006 * that are just merged onto old extents, such as when we are doing sequential 2007 * writes, so we can properly account for the metadata space we'll need. 
2008 */ 2009 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, 2010 struct extent_state *other) 2011 { 2012 u64 new_size, old_size; 2013 u32 num_extents; 2014 2015 /* not delalloc, ignore it */ 2016 if (!(other->state & EXTENT_DELALLOC)) 2017 return; 2018 2019 if (new->start > other->start) 2020 new_size = new->end - other->start + 1; 2021 else 2022 new_size = other->end - new->start + 1; 2023 2024 /* we're not bigger than the max, unreserve the space and go */ 2025 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 2026 spin_lock(&BTRFS_I(inode)->lock); 2027 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 2028 spin_unlock(&BTRFS_I(inode)->lock); 2029 return; 2030 } 2031 2032 /* 2033 * We have to add up either side to figure out how many extents were 2034 * accounted for before we merged into one big extent. If the number of 2035 * extents we accounted for is <= the amount we need for the new range 2036 * then we can return, otherwise drop. Think of it like this 2037 * 2038 * [ 4k][MAX_SIZE] 2039 * 2040 * So we've grown the extent by a MAX_SIZE extent, this would mean we 2041 * need 2 outstanding extents, on one side we have 1 and the other side 2042 * we have 1 so they are == and we can return. But in this case 2043 * 2044 * [MAX_SIZE+4k][MAX_SIZE+4k] 2045 * 2046 * Each range on their own accounts for 2 extents, but merged together 2047 * they are only 3 extents worth of accounting, so we need to drop in 2048 * this case. 2049 */ 2050 old_size = other->end - other->start + 1; 2051 num_extents = count_max_extents(old_size); 2052 old_size = new->end - new->start + 1; 2053 num_extents += count_max_extents(old_size); 2054 if (count_max_extents(new_size) >= num_extents) 2055 return; 2056 2057 spin_lock(&BTRFS_I(inode)->lock); 2058 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 2059 spin_unlock(&BTRFS_I(inode)->lock); 2060 } 2061 2062 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 2063 struct inode *inode) 2064 { 2065 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2066 2067 spin_lock(&root->delalloc_lock); 2068 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 2069 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 2070 &root->delalloc_inodes); 2071 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2072 &BTRFS_I(inode)->runtime_flags); 2073 root->nr_delalloc_inodes++; 2074 if (root->nr_delalloc_inodes == 1) { 2075 spin_lock(&fs_info->delalloc_root_lock); 2076 BUG_ON(!list_empty(&root->delalloc_root)); 2077 list_add_tail(&root->delalloc_root, 2078 &fs_info->delalloc_roots); 2079 spin_unlock(&fs_info->delalloc_root_lock); 2080 } 2081 } 2082 spin_unlock(&root->delalloc_lock); 2083 } 2084 2085 2086 void __btrfs_del_delalloc_inode(struct btrfs_root *root, 2087 struct btrfs_inode *inode) 2088 { 2089 struct btrfs_fs_info *fs_info = root->fs_info; 2090 2091 if (!list_empty(&inode->delalloc_inodes)) { 2092 list_del_init(&inode->delalloc_inodes); 2093 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2094 &inode->runtime_flags); 2095 root->nr_delalloc_inodes--; 2096 if (!root->nr_delalloc_inodes) { 2097 ASSERT(list_empty(&root->delalloc_inodes)); 2098 spin_lock(&fs_info->delalloc_root_lock); 2099 BUG_ON(list_empty(&root->delalloc_root)); 2100 list_del_init(&root->delalloc_root); 2101 spin_unlock(&fs_info->delalloc_root_lock); 2102 } 2103 } 2104 } 2105 2106 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 2107 struct btrfs_inode *inode) 2108 { 2109 spin_lock(&root->delalloc_lock); 2110 __btrfs_del_delalloc_inode(root, inode); 2111 spin_unlock(&root->delalloc_lock); 
2112 } 2113 2114 /* 2115 * Properly track delayed allocation bytes in the inode and to maintain the 2116 * list of inodes that have pending delalloc work to be done. 2117 */ 2118 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, 2119 unsigned *bits) 2120 { 2121 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2122 2123 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 2124 WARN_ON(1); 2125 /* 2126 * set_bit and clear bit hooks normally require _irqsave/restore 2127 * but in this case, we are only testing for the DELALLOC 2128 * bit, which is only set or cleared with irqs on 2129 */ 2130 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2131 struct btrfs_root *root = BTRFS_I(inode)->root; 2132 u64 len = state->end + 1 - state->start; 2133 u32 num_extents = count_max_extents(len); 2134 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 2135 2136 spin_lock(&BTRFS_I(inode)->lock); 2137 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); 2138 spin_unlock(&BTRFS_I(inode)->lock); 2139 2140 /* For sanity tests */ 2141 if (btrfs_is_testing(fs_info)) 2142 return; 2143 2144 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 2145 fs_info->delalloc_batch); 2146 spin_lock(&BTRFS_I(inode)->lock); 2147 BTRFS_I(inode)->delalloc_bytes += len; 2148 if (*bits & EXTENT_DEFRAG) 2149 BTRFS_I(inode)->defrag_bytes += len; 2150 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2151 &BTRFS_I(inode)->runtime_flags)) 2152 btrfs_add_delalloc_inodes(root, inode); 2153 spin_unlock(&BTRFS_I(inode)->lock); 2154 } 2155 2156 if (!(state->state & EXTENT_DELALLOC_NEW) && 2157 (*bits & EXTENT_DELALLOC_NEW)) { 2158 spin_lock(&BTRFS_I(inode)->lock); 2159 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - 2160 state->start; 2161 spin_unlock(&BTRFS_I(inode)->lock); 2162 } 2163 } 2164 2165 /* 2166 * Once a range is no longer delalloc this function ensures that proper 2167 * accounting happens. 2168 */ 2169 void btrfs_clear_delalloc_extent(struct inode *vfs_inode, 2170 struct extent_state *state, unsigned *bits) 2171 { 2172 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 2173 struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); 2174 u64 len = state->end + 1 - state->start; 2175 u32 num_extents = count_max_extents(len); 2176 2177 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { 2178 spin_lock(&inode->lock); 2179 inode->defrag_bytes -= len; 2180 spin_unlock(&inode->lock); 2181 } 2182 2183 /* 2184 * set_bit and clear bit hooks normally require _irqsave/restore 2185 * but in this case, we are only testing for the DELALLOC 2186 * bit, which is only set or cleared with irqs on 2187 */ 2188 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2189 struct btrfs_root *root = inode->root; 2190 bool do_list = !btrfs_is_free_space_inode(inode); 2191 2192 spin_lock(&inode->lock); 2193 btrfs_mod_outstanding_extents(inode, -num_extents); 2194 spin_unlock(&inode->lock); 2195 2196 /* 2197 * We don't reserve metadata space for space cache inodes so we 2198 * don't need to call delalloc_release_metadata if there is an 2199 * error. 2200 */ 2201 if (*bits & EXTENT_CLEAR_META_RESV && 2202 root != fs_info->tree_root) 2203 btrfs_delalloc_release_metadata(inode, len, false); 2204 2205 /* For sanity tests. 
*/ 2206 if (btrfs_is_testing(fs_info)) 2207 return; 2208 2209 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && 2210 do_list && !(state->state & EXTENT_NORESERVE) && 2211 (*bits & EXTENT_CLEAR_DATA_RESV)) 2212 btrfs_free_reserved_data_space_noquota(fs_info, len); 2213 2214 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 2215 fs_info->delalloc_batch); 2216 spin_lock(&inode->lock); 2217 inode->delalloc_bytes -= len; 2218 if (do_list && inode->delalloc_bytes == 0 && 2219 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2220 &inode->runtime_flags)) 2221 btrfs_del_delalloc_inode(root, inode); 2222 spin_unlock(&inode->lock); 2223 } 2224 2225 if ((state->state & EXTENT_DELALLOC_NEW) && 2226 (*bits & EXTENT_DELALLOC_NEW)) { 2227 spin_lock(&inode->lock); 2228 ASSERT(inode->new_delalloc_bytes >= len); 2229 inode->new_delalloc_bytes -= len; 2230 if (*bits & EXTENT_ADD_INODE_BYTES) 2231 inode_add_bytes(&inode->vfs_inode, len); 2232 spin_unlock(&inode->lock); 2233 } 2234 } 2235 2236 /* 2237 * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit 2238 * in a chunk's stripe. This function ensures that bios do not span a 2239 * stripe/chunk 2240 * 2241 * @page - The page we are about to add to the bio 2242 * @size - size we want to add to the bio 2243 * @bio - bio we want to ensure is smaller than a stripe 2244 * @bio_flags - flags of the bio 2245 * 2246 * return 1 if page cannot be added to the bio 2247 * return 0 if page can be added to the bio 2248 * return error otherwise 2249 */ 2250 int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, 2251 unsigned long bio_flags) 2252 { 2253 struct inode *inode = page->mapping->host; 2254 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2255 u64 logical = bio->bi_iter.bi_sector << 9; 2256 u32 bio_len = bio->bi_iter.bi_size; 2257 struct extent_map *em; 2258 int ret = 0; 2259 struct btrfs_io_geometry geom; 2260 2261 if (bio_flags & EXTENT_BIO_COMPRESSED) 2262 return 0; 2263 2264 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); 2265 if (IS_ERR(em)) 2266 return PTR_ERR(em); 2267 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); 2268 if (ret < 0) 2269 goto out; 2270 2271 if (geom.len < bio_len + size) 2272 ret = 1; 2273 out: 2274 free_extent_map(em); 2275 return ret; 2276 } 2277 2278 /* 2279 * in order to insert checksums into the metadata in large chunks, 2280 * we wait until bio submission time. All the pages in the bio are 2281 * checksummed and sums are attached onto the ordered extent record. 2282 * 2283 * At IO completion time the csums attached on the ordered extent record 2284 * are inserted into the btree 2285 */ 2286 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, 2287 u64 dio_file_offset) 2288 { 2289 return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2290 } 2291 2292 /* 2293 * Split an extent_map at [start, start + len] 2294 * 2295 * This function is intended to be used only for extract_ordered_extent().
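 *
 * As a rough illustration (the offsets below are hypothetical, added only
 * for clarity): for an extent_map covering [start, start + len) with
 * pre == 4K and post == 8K, the map is replaced by three maps covering
 * [start, start + 4K), [start + 4K, start + len - 8K) and
 * [start + len - 8K, start + len). If pre == 0, only two maps are created,
 * the first covering [start, start + len - post) and the last covering
 * [start + len - post, start + len).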
2296 */ 2297 static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, 2298 u64 pre, u64 post) 2299 { 2300 struct extent_map_tree *em_tree = &inode->extent_tree; 2301 struct extent_map *em; 2302 struct extent_map *split_pre = NULL; 2303 struct extent_map *split_mid = NULL; 2304 struct extent_map *split_post = NULL; 2305 int ret = 0; 2306 unsigned long flags; 2307 2308 /* Sanity check */ 2309 if (pre == 0 && post == 0) 2310 return 0; 2311 2312 split_pre = alloc_extent_map(); 2313 if (pre) 2314 split_mid = alloc_extent_map(); 2315 if (post) 2316 split_post = alloc_extent_map(); 2317 if (!split_pre || (pre && !split_mid) || (post && !split_post)) { 2318 ret = -ENOMEM; 2319 goto out; 2320 } 2321 2322 ASSERT(pre + post < len); 2323 2324 lock_extent(&inode->io_tree, start, start + len - 1); 2325 write_lock(&em_tree->lock); 2326 em = lookup_extent_mapping(em_tree, start, len); 2327 if (!em) { 2328 ret = -EIO; 2329 goto out_unlock; 2330 } 2331 2332 ASSERT(em->len == len); 2333 ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); 2334 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); 2335 ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 2336 ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); 2337 ASSERT(!list_empty(&em->list)); 2338 2339 flags = em->flags; 2340 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 2341 2342 /* First, replace the em with a new extent_map starting from * em->start */ 2343 split_pre->start = em->start; 2344 split_pre->len = (pre ? pre : em->len - post); 2345 split_pre->orig_start = split_pre->start; 2346 split_pre->block_start = em->block_start; 2347 split_pre->block_len = split_pre->len; 2348 split_pre->orig_block_len = split_pre->block_len; 2349 split_pre->ram_bytes = split_pre->len; 2350 split_pre->flags = flags; 2351 split_pre->compress_type = em->compress_type; 2352 split_pre->generation = em->generation; 2353 2354 replace_extent_mapping(em_tree, em, split_pre, 1); 2355 2356 /* 2357 * Now we only have an extent_map at: 2358 * [em->start, em->start + pre] if pre != 0 2359 * [em->start, em->start + em->len - post] if pre == 0 2360 */ 2361 2362 if (pre) { 2363 /* Insert the middle extent_map */ 2364 split_mid->start = em->start + pre; 2365 split_mid->len = em->len - pre - post; 2366 split_mid->orig_start = split_mid->start; 2367 split_mid->block_start = em->block_start + pre; 2368 split_mid->block_len = split_mid->len; 2369 split_mid->orig_block_len = split_mid->block_len; 2370 split_mid->ram_bytes = split_mid->len; 2371 split_mid->flags = flags; 2372 split_mid->compress_type = em->compress_type; 2373 split_mid->generation = em->generation; 2374 add_extent_mapping(em_tree, split_mid, 1); 2375 } 2376 2377 if (post) { 2378 split_post->start = em->start + em->len - post; 2379 split_post->len = post; 2380 split_post->orig_start = split_post->start; 2381 split_post->block_start = em->block_start + em->len - post; 2382 split_post->block_len = split_post->len; 2383 split_post->orig_block_len = split_post->block_len; 2384 split_post->ram_bytes = split_post->len; 2385 split_post->flags = flags; 2386 split_post->compress_type = em->compress_type; 2387 split_post->generation = em->generation; 2388 add_extent_mapping(em_tree, split_post, 1); 2389 } 2390 2391 /* Once for us */ 2392 free_extent_map(em); 2393 /* Once for the tree */ 2394 free_extent_map(em); 2395 2396 out_unlock: 2397 write_unlock(&em_tree->lock); 2398 unlock_extent(&inode->io_tree, start, start + len - 1); 2399 out: 2400 free_extent_map(split_pre); 2401 free_extent_map(split_mid); 2402 
free_extent_map(split_post); 2403 2404 return ret; 2405 } 2406 2407 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, 2408 struct bio *bio, loff_t file_offset) 2409 { 2410 struct btrfs_ordered_extent *ordered; 2411 u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; 2412 u64 file_len; 2413 u64 len = bio->bi_iter.bi_size; 2414 u64 end = start + len; 2415 u64 ordered_end; 2416 u64 pre, post; 2417 int ret = 0; 2418 2419 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 2420 if (WARN_ON_ONCE(!ordered)) 2421 return BLK_STS_IOERR; 2422 2423 /* No need to split */ 2424 if (ordered->disk_num_bytes == len) 2425 goto out; 2426 2427 /* We cannot split once end_bio'd ordered extent */ 2428 if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { 2429 ret = -EINVAL; 2430 goto out; 2431 } 2432 2433 /* We cannot split a compressed ordered extent */ 2434 if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { 2435 ret = -EINVAL; 2436 goto out; 2437 } 2438 2439 ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; 2440 /* bio must be in one ordered extent */ 2441 if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { 2442 ret = -EINVAL; 2443 goto out; 2444 } 2445 2446 /* Checksum list should be empty */ 2447 if (WARN_ON_ONCE(!list_empty(&ordered->list))) { 2448 ret = -EINVAL; 2449 goto out; 2450 } 2451 2452 file_len = ordered->num_bytes; 2453 pre = start - ordered->disk_bytenr; 2454 post = ordered_end - end; 2455 2456 ret = btrfs_split_ordered_extent(ordered, pre, post); 2457 if (ret) 2458 goto out; 2459 ret = split_zoned_em(inode, file_offset, file_len, pre, post); 2460 2461 out: 2462 btrfs_put_ordered_extent(ordered); 2463 2464 return errno_to_blk_status(ret); 2465 } 2466 2467 /* 2468 * extent_io.c submission hook. This does the right thing for csum calculation 2469 * on write, or reading the csums from the tree before a read. 
2470 * 2471 * Rules about async/sync submit, 2472 * a) read: sync submit 2473 * 2474 * b) write without checksum: sync submit 2475 * 2476 * c) write with checksum: 2477 * c-1) if bio is issued by fsync: sync submit 2478 * (sync_writers != 0) 2479 * 2480 * c-2) if root is reloc root: sync submit 2481 * (only in case of buffered IO) 2482 * 2483 * c-3) otherwise: async submit 2484 */ 2485 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 2486 int mirror_num, unsigned long bio_flags) 2487 2488 { 2489 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2490 struct btrfs_root *root = BTRFS_I(inode)->root; 2491 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 2492 blk_status_t ret = 0; 2493 int skip_sum; 2494 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 2495 2496 skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || 2497 !fs_info->csum_root; 2498 2499 if (btrfs_is_free_space_inode(BTRFS_I(inode))) 2500 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 2501 2502 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 2503 struct page *page = bio_first_bvec_all(bio)->bv_page; 2504 loff_t file_offset = page_offset(page); 2505 2506 ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); 2507 if (ret) 2508 goto out; 2509 } 2510 2511 if (btrfs_op(bio) != BTRFS_MAP_WRITE) { 2512 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); 2513 if (ret) 2514 goto out; 2515 2516 if (bio_flags & EXTENT_BIO_COMPRESSED) { 2517 ret = btrfs_submit_compressed_read(inode, bio, 2518 mirror_num, 2519 bio_flags); 2520 goto out; 2521 } else { 2522 /* 2523 * Lookup bio sums does extra checks around whether we 2524 * need to csum or not, which is why we ignore skip_sum 2525 * here. 2526 */ 2527 ret = btrfs_lookup_bio_sums(inode, bio, NULL); 2528 if (ret) 2529 goto out; 2530 } 2531 goto mapit; 2532 } else if (async && !skip_sum) { 2533 /* csum items have already been cloned */ 2534 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2535 goto mapit; 2536 /* we're doing a write, do the async checksumming */ 2537 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, 2538 0, btrfs_submit_bio_start); 2539 goto out; 2540 } else if (!skip_sum) { 2541 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2542 if (ret) 2543 goto out; 2544 } 2545 2546 mapit: 2547 ret = btrfs_map_bio(fs_info, bio, mirror_num); 2548 2549 out: 2550 if (ret) { 2551 bio->bi_status = ret; 2552 bio_endio(bio); 2553 } 2554 return ret; 2555 } 2556 2557 /* 2558 * given a list of ordered sums record them in the inode. This happens 2559 * at IO completion time based on sums calculated at bio submission time. 
2560 */ 2561 static int add_pending_csums(struct btrfs_trans_handle *trans, 2562 struct list_head *list) 2563 { 2564 struct btrfs_ordered_sum *sum; 2565 int ret; 2566 2567 list_for_each_entry(sum, list, list) { 2568 trans->adding_csums = true; 2569 ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); 2570 trans->adding_csums = false; 2571 if (ret) 2572 return ret; 2573 } 2574 return 0; 2575 } 2576 2577 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, 2578 const u64 start, 2579 const u64 len, 2580 struct extent_state **cached_state) 2581 { 2582 u64 search_start = start; 2583 const u64 end = start + len - 1; 2584 2585 while (search_start < end) { 2586 const u64 search_len = end - search_start + 1; 2587 struct extent_map *em; 2588 u64 em_len; 2589 int ret = 0; 2590 2591 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); 2592 if (IS_ERR(em)) 2593 return PTR_ERR(em); 2594 2595 if (em->block_start != EXTENT_MAP_HOLE) 2596 goto next; 2597 2598 em_len = em->len; 2599 if (em->start < search_start) 2600 em_len -= search_start - em->start; 2601 if (em_len > search_len) 2602 em_len = search_len; 2603 2604 ret = set_extent_bit(&inode->io_tree, search_start, 2605 search_start + em_len - 1, 2606 EXTENT_DELALLOC_NEW, 0, NULL, cached_state, 2607 GFP_NOFS, NULL); 2608 next: 2609 search_start = extent_map_end(em); 2610 free_extent_map(em); 2611 if (ret) 2612 return ret; 2613 } 2614 return 0; 2615 } 2616 2617 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2618 unsigned int extra_bits, 2619 struct extent_state **cached_state) 2620 { 2621 WARN_ON(PAGE_ALIGNED(end)); 2622 2623 if (start >= i_size_read(&inode->vfs_inode) && 2624 !(inode->flags & BTRFS_INODE_PREALLOC)) { 2625 /* 2626 * There can't be any extents following eof in this case so just 2627 * set the delalloc new bit for the range directly. 2628 */ 2629 extra_bits |= EXTENT_DELALLOC_NEW; 2630 } else { 2631 int ret; 2632 2633 ret = btrfs_find_new_delalloc_bytes(inode, start, 2634 end + 1 - start, 2635 cached_state); 2636 if (ret) 2637 return ret; 2638 } 2639 2640 return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, 2641 cached_state); 2642 } 2643 2644 /* see btrfs_writepage_start_hook for details on why this is required */ 2645 struct btrfs_writepage_fixup { 2646 struct page *page; 2647 struct inode *inode; 2648 struct btrfs_work work; 2649 }; 2650 2651 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2652 { 2653 struct btrfs_writepage_fixup *fixup; 2654 struct btrfs_ordered_extent *ordered; 2655 struct extent_state *cached_state = NULL; 2656 struct extent_changeset *data_reserved = NULL; 2657 struct page *page; 2658 struct btrfs_inode *inode; 2659 u64 page_start; 2660 u64 page_end; 2661 int ret = 0; 2662 bool free_delalloc_space = true; 2663 2664 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2665 page = fixup->page; 2666 inode = BTRFS_I(fixup->inode); 2667 page_start = page_offset(page); 2668 page_end = page_offset(page) + PAGE_SIZE - 1; 2669 2670 /* 2671 * This is similar to page_mkwrite, we need to reserve the space before 2672 * we take the page lock. 2673 */ 2674 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2675 PAGE_SIZE); 2676 again: 2677 lock_page(page); 2678 2679 /* 2680 * Before we queued this fixup, we took a reference on the page. 2681 * page->mapping may go NULL, but it shouldn't be moved to a different 2682 * address space. 
2683 */ 2684 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2685 /* 2686 * Unfortunately this is a little tricky, either 2687 * 2688 * 1) We got here and our page had already been dealt with and 2689 * we reserved our space, thus ret == 0, so we need to just 2690 * drop our space reservation and bail. This can happen the 2691 * first time we come into the fixup worker, or could happen 2692 * while waiting for the ordered extent. 2693 * 2) Our page was already dealt with, but we happened to get an 2694 * ENOSPC above from the btrfs_delalloc_reserve_space. In 2695 * this case we obviously don't have anything to release, but 2696 * because the page was already dealt with we don't want to 2697 * mark the page with an error, so make sure we're resetting 2698 * ret to 0. This is why we have this check _before_ the ret 2699 * check, because we do not want to have a surprise ENOSPC 2700 * when the page was already properly dealt with. 2701 */ 2702 if (!ret) { 2703 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2704 btrfs_delalloc_release_space(inode, data_reserved, 2705 page_start, PAGE_SIZE, 2706 true); 2707 } 2708 ret = 0; 2709 goto out_page; 2710 } 2711 2712 /* 2713 * We can't mess with the page state unless it is locked, so now that 2714 * it is locked bail if we failed to make our space reservation. 2715 */ 2716 if (ret) 2717 goto out_page; 2718 2719 lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); 2720 2721 /* already ordered? We're done */ 2722 if (PageOrdered(page)) 2723 goto out_reserved; 2724 2725 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); 2726 if (ordered) { 2727 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2728 &cached_state); 2729 unlock_page(page); 2730 btrfs_start_ordered_extent(ordered, 1); 2731 btrfs_put_ordered_extent(ordered); 2732 goto again; 2733 } 2734 2735 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2736 &cached_state); 2737 if (ret) 2738 goto out_reserved; 2739 2740 /* 2741 * Everything went as planned, we're now the owner of a dirty page with 2742 * delayed allocation bits set and space reserved for our COW 2743 * destination. 2744 * 2745 * The page was dirty when we started, nothing should have cleaned it. 2746 */ 2747 BUG_ON(!PageDirty(page)); 2748 free_delalloc_space = false; 2749 out_reserved: 2750 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2751 if (free_delalloc_space) 2752 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2753 PAGE_SIZE, true); 2754 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2755 &cached_state); 2756 out_page: 2757 if (ret) { 2758 /* 2759 * We hit ENOSPC or other errors. Update the mapping and page 2760 * to reflect the errors and clean the page. 2761 */ 2762 mapping_set_error(page->mapping, ret); 2763 end_extent_writepage(page, ret, page_start, page_end); 2764 clear_page_dirty_for_io(page); 2765 SetPageError(page); 2766 } 2767 ClearPageChecked(page); 2768 unlock_page(page); 2769 put_page(page); 2770 kfree(fixup); 2771 extent_changeset_free(data_reserved); 2772 /* 2773 * As a precaution, do a delayed iput in case it would be the last iput 2774 * that could need flushing space. Recursing back to fixup worker would 2775 * deadlock. 2776 */ 2777 btrfs_add_delayed_iput(&inode->vfs_inode); 2778 } 2779 2780 /* 2781 * There are a few paths in the higher layers of the kernel that directly 2782 * set the page dirty bit without asking the filesystem if it is a 2783 * good idea. 
This causes problems because we want to make sure COW 2784 * properly happens and the data=ordered rules are followed. 2785 * 2786 * In our case any range that doesn't have the ORDERED bit set 2787 * hasn't been properly setup for IO. We kick off an async process 2788 * to fix it up. The async helper will wait for ordered extents, set 2789 * the delalloc bit and make it safe to write the page. 2790 */ 2791 int btrfs_writepage_cow_fixup(struct page *page) 2792 { 2793 struct inode *inode = page->mapping->host; 2794 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2795 struct btrfs_writepage_fixup *fixup; 2796 2797 /* This page has ordered extent covering it already */ 2798 if (PageOrdered(page)) 2799 return 0; 2800 2801 /* 2802 * PageChecked is set below when we create a fixup worker for this page, 2803 * don't try to create another one if we're already PageChecked() 2804 * 2805 * The extent_io writepage code will redirty the page if we send back 2806 * EAGAIN. 2807 */ 2808 if (PageChecked(page)) 2809 return -EAGAIN; 2810 2811 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2812 if (!fixup) 2813 return -EAGAIN; 2814 2815 /* 2816 * We are already holding a reference to this inode from 2817 * write_cache_pages. We need to hold it because the space reservation 2818 * takes place outside of the page lock, and we can't trust 2819 * page->mapping outside of the page lock. 2820 */ 2821 ihold(inode); 2822 SetPageChecked(page); 2823 get_page(page); 2824 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 2825 fixup->page = page; 2826 fixup->inode = inode; 2827 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2828 2829 return -EAGAIN; 2830 } 2831 2832 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2833 struct btrfs_inode *inode, u64 file_pos, 2834 struct btrfs_file_extent_item *stack_fi, 2835 const bool update_inode_bytes, 2836 u64 qgroup_reserved) 2837 { 2838 struct btrfs_root *root = inode->root; 2839 const u64 sectorsize = root->fs_info->sectorsize; 2840 struct btrfs_path *path; 2841 struct extent_buffer *leaf; 2842 struct btrfs_key ins; 2843 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2844 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2845 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2846 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2847 struct btrfs_drop_extents_args drop_args = { 0 }; 2848 int ret; 2849 2850 path = btrfs_alloc_path(); 2851 if (!path) 2852 return -ENOMEM; 2853 2854 /* 2855 * we may be replacing one extent in the tree with another. 2856 * The new extent is pinned in the extent map, and we don't want 2857 * to drop it from the cache until it is completely in the btree. 2858 * 2859 * So, tell btrfs_drop_extents to leave this extent in the cache. 2860 * the caller is expected to unpin it and allow it to be merged 2861 * with the others. 
2862 */ 2863 drop_args.path = path; 2864 drop_args.start = file_pos; 2865 drop_args.end = file_pos + num_bytes; 2866 drop_args.replace_extent = true; 2867 drop_args.extent_item_size = sizeof(*stack_fi); 2868 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2869 if (ret) 2870 goto out; 2871 2872 if (!drop_args.extent_inserted) { 2873 ins.objectid = btrfs_ino(inode); 2874 ins.offset = file_pos; 2875 ins.type = BTRFS_EXTENT_DATA_KEY; 2876 2877 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2878 sizeof(*stack_fi)); 2879 if (ret) 2880 goto out; 2881 } 2882 leaf = path->nodes[0]; 2883 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); 2884 write_extent_buffer(leaf, stack_fi, 2885 btrfs_item_ptr_offset(leaf, path->slots[0]), 2886 sizeof(struct btrfs_file_extent_item)); 2887 2888 btrfs_mark_buffer_dirty(leaf); 2889 btrfs_release_path(path); 2890 2891 /* 2892 * If we dropped an inline extent here, we know the range where it was 2893 * located was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2894 * number of bytes only for that range containing the inline extent. 2895 * The remainder of the range will be processed when clearing the 2896 * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 2897 */ 2898 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { 2899 u64 inline_size = round_down(drop_args.bytes_found, sectorsize); 2900 2901 inline_size = drop_args.bytes_found - inline_size; 2902 btrfs_update_inode_bytes(inode, sectorsize, inline_size); 2903 drop_args.bytes_found -= inline_size; 2904 num_bytes -= sectorsize; 2905 } 2906 2907 if (update_inode_bytes) 2908 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); 2909 2910 ins.objectid = disk_bytenr; 2911 ins.offset = disk_num_bytes; 2912 ins.type = BTRFS_EXTENT_ITEM_KEY; 2913 2914 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 2915 if (ret) 2916 goto out; 2917 2918 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 2919 file_pos, qgroup_reserved, &ins); 2920 out: 2921 btrfs_free_path(path); 2922 2923 return ret; 2924 } 2925 2926 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2927 u64 start, u64 len) 2928 { 2929 struct btrfs_block_group *cache; 2930 2931 cache = btrfs_lookup_block_group(fs_info, start); 2932 ASSERT(cache); 2933 2934 spin_lock(&cache->lock); 2935 cache->delalloc_bytes -= len; 2936 spin_unlock(&cache->lock); 2937 2938 btrfs_put_block_group(cache); 2939 } 2940 2941 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, 2942 struct btrfs_ordered_extent *oe) 2943 { 2944 struct btrfs_file_extent_item stack_fi; 2945 u64 logical_len; 2946 bool update_inode_bytes; 2947 2948 memset(&stack_fi, 0, sizeof(stack_fi)); 2949 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 2950 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 2951 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 2952 oe->disk_num_bytes); 2953 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 2954 logical_len = oe->truncated_len; 2955 else 2956 logical_len = oe->num_bytes; 2957 btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); 2958 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); 2959 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 2960 /* Encryption and other encoding is reserved and all 0 */ 2961 2962 /* 2963 * For delalloc, when completing an ordered extent we update the inode's 2964 * bytes when clearing the range
in the inode's io tree, so pass false 2965 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), 2966 * except if the ordered extent was truncated. 2967 */ 2968 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 2969 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 2970 2971 return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), 2972 oe->file_offset, &stack_fi, 2973 update_inode_bytes, oe->qgroup_rsv); 2974 } 2975 2976 /* 2977 * As ordered data IO finishes, this gets called so we can finish 2978 * an ordered extent if the range of bytes in the file it covers are 2979 * fully written. 2980 */ 2981 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2982 { 2983 struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); 2984 struct btrfs_root *root = inode->root; 2985 struct btrfs_fs_info *fs_info = root->fs_info; 2986 struct btrfs_trans_handle *trans = NULL; 2987 struct extent_io_tree *io_tree = &inode->io_tree; 2988 struct extent_state *cached_state = NULL; 2989 u64 start, end; 2990 int compress_type = 0; 2991 int ret = 0; 2992 u64 logical_len = ordered_extent->num_bytes; 2993 bool freespace_inode; 2994 bool truncated = false; 2995 bool clear_reserved_extent = true; 2996 unsigned int clear_bits = EXTENT_DEFRAG; 2997 2998 start = ordered_extent->file_offset; 2999 end = start + ordered_extent->num_bytes - 1; 3000 3001 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3002 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 3003 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) 3004 clear_bits |= EXTENT_DELALLOC_NEW; 3005 3006 freespace_inode = btrfs_is_free_space_inode(inode); 3007 3008 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 3009 ret = -EIO; 3010 goto out; 3011 } 3012 3013 if (ordered_extent->bdev) 3014 btrfs_rewrite_logical_zoned(ordered_extent); 3015 3016 btrfs_free_io_failure_record(inode, start, end); 3017 3018 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 3019 truncated = true; 3020 logical_len = ordered_extent->truncated_len; 3021 /* Truncated the entire extent, don't bother adding */ 3022 if (!logical_len) 3023 goto out; 3024 } 3025 3026 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3027 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 3028 3029 btrfs_inode_safe_disk_i_size_write(inode, 0); 3030 if (freespace_inode) 3031 trans = btrfs_join_transaction_spacecache(root); 3032 else 3033 trans = btrfs_join_transaction(root); 3034 if (IS_ERR(trans)) { 3035 ret = PTR_ERR(trans); 3036 trans = NULL; 3037 goto out; 3038 } 3039 trans->block_rsv = &inode->block_rsv; 3040 ret = btrfs_update_inode_fallback(trans, root, inode); 3041 if (ret) /* -ENOMEM or corruption */ 3042 btrfs_abort_transaction(trans, ret); 3043 goto out; 3044 } 3045 3046 clear_bits |= EXTENT_LOCKED; 3047 lock_extent_bits(io_tree, start, end, &cached_state); 3048 3049 if (freespace_inode) 3050 trans = btrfs_join_transaction_spacecache(root); 3051 else 3052 trans = btrfs_join_transaction(root); 3053 if (IS_ERR(trans)) { 3054 ret = PTR_ERR(trans); 3055 trans = NULL; 3056 goto out; 3057 } 3058 3059 trans->block_rsv = &inode->block_rsv; 3060 3061 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 3062 compress_type = ordered_extent->compress_type; 3063 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3064 BUG_ON(compress_type); 3065 ret = btrfs_mark_extent_written(trans, inode, 3066 ordered_extent->file_offset, 3067 ordered_extent->file_offset 
+ 3068 logical_len); 3069 } else { 3070 BUG_ON(root == fs_info->tree_root); 3071 ret = insert_ordered_extent_file_extent(trans, ordered_extent); 3072 if (!ret) { 3073 clear_reserved_extent = false; 3074 btrfs_release_delalloc_bytes(fs_info, 3075 ordered_extent->disk_bytenr, 3076 ordered_extent->disk_num_bytes); 3077 } 3078 } 3079 unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, 3080 ordered_extent->num_bytes, trans->transid); 3081 if (ret < 0) { 3082 btrfs_abort_transaction(trans, ret); 3083 goto out; 3084 } 3085 3086 ret = add_pending_csums(trans, &ordered_extent->list); 3087 if (ret) { 3088 btrfs_abort_transaction(trans, ret); 3089 goto out; 3090 } 3091 3092 /* 3093 * If this is a new delalloc range, clear its new delalloc flag to 3094 * update the inode's number of bytes. This needs to be done first 3095 * before updating the inode item. 3096 */ 3097 if ((clear_bits & EXTENT_DELALLOC_NEW) && 3098 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) 3099 clear_extent_bit(&inode->io_tree, start, end, 3100 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 3101 0, 0, &cached_state); 3102 3103 btrfs_inode_safe_disk_i_size_write(inode, 0); 3104 ret = btrfs_update_inode_fallback(trans, root, inode); 3105 if (ret) { /* -ENOMEM or corruption */ 3106 btrfs_abort_transaction(trans, ret); 3107 goto out; 3108 } 3109 ret = 0; 3110 out: 3111 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 3112 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, 3113 &cached_state); 3114 3115 if (trans) 3116 btrfs_end_transaction(trans); 3117 3118 if (ret || truncated) { 3119 u64 unwritten_start = start; 3120 3121 /* 3122 * If we failed to finish this ordered extent for any reason we 3123 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered 3124 * extent, and mark the inode with the error if it wasn't 3125 * already set. Any error during writeback would have already 3126 * set the mapping error, so we need to set it if we're the ones 3127 * marking this ordered extent as failed. 3128 */ 3129 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, 3130 &ordered_extent->flags)) 3131 mapping_set_error(ordered_extent->inode->i_mapping, -EIO); 3132 3133 if (truncated) 3134 unwritten_start += logical_len; 3135 clear_extent_uptodate(io_tree, unwritten_start, end, NULL); 3136 3137 /* Drop the cache for the part of the extent we didn't write. */ 3138 btrfs_drop_extent_cache(inode, unwritten_start, end, 0); 3139 3140 /* 3141 * If the ordered extent had an IOERR or something else went 3142 * wrong we need to return the space for this ordered extent 3143 * back to the allocator. We only free the extent in the 3144 * truncated case if we didn't write out the extent at all. 3145 * 3146 * If we made it past insert_reserved_file_extent before we 3147 * errored out then we don't need to do this as the accounting 3148 * has already been done. 
3149 */ 3150 if ((ret || !logical_len) && 3151 clear_reserved_extent && 3152 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3153 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3154 /* 3155 * Discard the range before returning it back to the 3156 * free space pool 3157 */ 3158 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) 3159 btrfs_discard_extent(fs_info, 3160 ordered_extent->disk_bytenr, 3161 ordered_extent->disk_num_bytes, 3162 NULL); 3163 btrfs_free_reserved_extent(fs_info, 3164 ordered_extent->disk_bytenr, 3165 ordered_extent->disk_num_bytes, 1); 3166 } 3167 } 3168 3169 /* 3170 * This needs to be done to make sure anybody waiting knows we are done 3171 * updating everything for this ordered extent. 3172 */ 3173 btrfs_remove_ordered_extent(inode, ordered_extent); 3174 3175 /* once for us */ 3176 btrfs_put_ordered_extent(ordered_extent); 3177 /* once for the tree */ 3178 btrfs_put_ordered_extent(ordered_extent); 3179 3180 return ret; 3181 } 3182 3183 static void finish_ordered_fn(struct btrfs_work *work) 3184 { 3185 struct btrfs_ordered_extent *ordered_extent; 3186 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 3187 btrfs_finish_ordered_io(ordered_extent); 3188 } 3189 3190 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, 3191 struct page *page, u64 start, 3192 u64 end, bool uptodate) 3193 { 3194 trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); 3195 3196 btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, 3197 finish_ordered_fn, uptodate); 3198 } 3199 3200 /* 3201 * check_data_csum - verify checksum of one sector of uncompressed data 3202 * @inode: inode 3203 * @io_bio: btrfs_io_bio which contains the csum 3204 * @bio_offset: offset to the beginning of the bio (in bytes) 3205 * @page: page where is the data to be verified 3206 * @pgoff: offset inside the page 3207 * @start: logical offset in the file 3208 * 3209 * The length of such check is always one sector size. 3210 */ 3211 static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, 3212 u32 bio_offset, struct page *page, u32 pgoff, 3213 u64 start) 3214 { 3215 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3216 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3217 char *kaddr; 3218 u32 len = fs_info->sectorsize; 3219 const u32 csum_size = fs_info->csum_size; 3220 unsigned int offset_sectors; 3221 u8 *csum_expected; 3222 u8 csum[BTRFS_CSUM_SIZE]; 3223 3224 ASSERT(pgoff + len <= PAGE_SIZE); 3225 3226 offset_sectors = bio_offset >> fs_info->sectorsize_bits; 3227 csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; 3228 3229 kaddr = kmap_atomic(page); 3230 shash->tfm = fs_info->csum_shash; 3231 3232 crypto_shash_digest(shash, kaddr + pgoff, len, csum); 3233 3234 if (memcmp(csum, csum_expected, csum_size)) 3235 goto zeroit; 3236 3237 kunmap_atomic(kaddr); 3238 return 0; 3239 zeroit: 3240 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, 3241 io_bio->mirror_num); 3242 if (io_bio->device) 3243 btrfs_dev_stat_inc_and_print(io_bio->device, 3244 BTRFS_DEV_STAT_CORRUPTION_ERRS); 3245 memset(kaddr + pgoff, 1, len); 3246 flush_dcache_page(page); 3247 kunmap_atomic(kaddr); 3248 return -EIO; 3249 } 3250 3251 /* 3252 * When reads are done, we need to check csums to verify the data is correct. 3253 * if there's a match, we allow the bio to finish. If not, the code in 3254 * extent_io.c will try to find good copies for us. 
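 * As an illustrative example (assuming a 4K sector size; the numbers are
 * hypothetical): for a read covering four sectors of the range where only
 * the third sector fails verification, the returned bitmap is 0x4, i.e.
 * bit 2, counted from the start of the range, is set.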
3255 * 3256 * @bio_offset: offset to the beginning of the bio (in bytes) 3257 * @start: file offset of the range start 3258 * @end: file offset of the range end (inclusive) 3259 * 3260 * Return a bitmap where bit set means a csum mismatch, and bit not set means 3261 * csum match. 3262 */ 3263 unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, 3264 struct page *page, u64 start, u64 end) 3265 { 3266 struct inode *inode = page->mapping->host; 3267 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3268 struct btrfs_root *root = BTRFS_I(inode)->root; 3269 const u32 sectorsize = root->fs_info->sectorsize; 3270 u32 pg_off; 3271 unsigned int result = 0; 3272 3273 if (PageChecked(page)) { 3274 ClearPageChecked(page); 3275 return 0; 3276 } 3277 3278 /* 3279 * For subpage case, above PageChecked is not safe as it's not subpage 3280 * compatible. 3281 * But for now only cow fixup and compressed read utilize PageChecked 3282 * flag, while in this context we can easily use io_bio->csum to 3283 * determine if we really need to do csum verification. 3284 * 3285 * So for now, just exit if io_bio->csum is NULL, as it means it's 3286 * compressed read, and its compressed data csum has already been 3287 * verified. 3288 */ 3289 if (io_bio->csum == NULL) 3290 return 0; 3291 3292 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3293 return 0; 3294 3295 if (!root->fs_info->csum_root) 3296 return 0; 3297 3298 ASSERT(page_offset(page) <= start && 3299 end <= page_offset(page) + PAGE_SIZE - 1); 3300 for (pg_off = offset_in_page(start); 3301 pg_off < offset_in_page(end); 3302 pg_off += sectorsize, bio_offset += sectorsize) { 3303 u64 file_offset = pg_off + page_offset(page); 3304 int ret; 3305 3306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 3307 test_range_bit(io_tree, file_offset, 3308 file_offset + sectorsize - 1, 3309 EXTENT_NODATASUM, 1, NULL)) { 3310 /* Skip the range without csum for data reloc inode */ 3311 clear_extent_bits(io_tree, file_offset, 3312 file_offset + sectorsize - 1, 3313 EXTENT_NODATASUM); 3314 continue; 3315 } 3316 ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, 3317 page_offset(page) + pg_off); 3318 if (ret < 0) { 3319 const int nr_bit = (pg_off - offset_in_page(start)) >> 3320 root->fs_info->sectorsize_bits; 3321 3322 result |= (1U << nr_bit); 3323 } 3324 } 3325 return result; 3326 } 3327 3328 /* 3329 * btrfs_add_delayed_iput - perform a delayed iput on @inode 3330 * 3331 * @inode: The inode we want to perform iput on 3332 * 3333 * This function uses the generic vfs_inode::i_count to track whether we should 3334 * just decrement it (in case it's > 1) or if this is the last iput then link 3335 * the inode to the delayed iput machinery. Delayed iputs are processed at 3336 * transaction commit time/superblock commit/cleaner kthread. 
3337 */ 3338 void btrfs_add_delayed_iput(struct inode *inode) 3339 { 3340 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3341 struct btrfs_inode *binode = BTRFS_I(inode); 3342 3343 if (atomic_add_unless(&inode->i_count, -1, 1)) 3344 return; 3345 3346 atomic_inc(&fs_info->nr_delayed_iputs); 3347 spin_lock(&fs_info->delayed_iput_lock); 3348 ASSERT(list_empty(&binode->delayed_iput)); 3349 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3350 spin_unlock(&fs_info->delayed_iput_lock); 3351 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3352 wake_up_process(fs_info->cleaner_kthread); 3353 } 3354 3355 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, 3356 struct btrfs_inode *inode) 3357 { 3358 list_del_init(&inode->delayed_iput); 3359 spin_unlock(&fs_info->delayed_iput_lock); 3360 iput(&inode->vfs_inode); 3361 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3362 wake_up(&fs_info->delayed_iputs_wait); 3363 spin_lock(&fs_info->delayed_iput_lock); 3364 } 3365 3366 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3367 struct btrfs_inode *inode) 3368 { 3369 if (!list_empty(&inode->delayed_iput)) { 3370 spin_lock(&fs_info->delayed_iput_lock); 3371 if (!list_empty(&inode->delayed_iput)) 3372 run_delayed_iput_locked(fs_info, inode); 3373 spin_unlock(&fs_info->delayed_iput_lock); 3374 } 3375 } 3376 3377 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3378 { 3379 3380 spin_lock(&fs_info->delayed_iput_lock); 3381 while (!list_empty(&fs_info->delayed_iputs)) { 3382 struct btrfs_inode *inode; 3383 3384 inode = list_first_entry(&fs_info->delayed_iputs, 3385 struct btrfs_inode, delayed_iput); 3386 run_delayed_iput_locked(fs_info, inode); 3387 cond_resched_lock(&fs_info->delayed_iput_lock); 3388 } 3389 spin_unlock(&fs_info->delayed_iput_lock); 3390 } 3391 3392 /** 3393 * Wait for flushing all delayed iputs 3394 * 3395 * @fs_info: the filesystem 3396 * 3397 * This will wait on any delayed iputs that are currently running with KILLABLE 3398 * set. Once they are all done running we will return, unless we are killed in 3399 * which case we return EINTR. This helps in user operations like fallocate etc 3400 * that might get blocked on the iputs. 3401 * 3402 * Return EINTR if we were killed, 0 if nothing's pending 3403 */ 3404 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) 3405 { 3406 int ret = wait_event_killable(fs_info->delayed_iputs_wait, 3407 atomic_read(&fs_info->nr_delayed_iputs) == 0); 3408 if (ret) 3409 return -EINTR; 3410 return 0; 3411 } 3412 3413 /* 3414 * This creates an orphan entry for the given inode in case something goes wrong 3415 * in the middle of an unlink. 3416 */ 3417 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3418 struct btrfs_inode *inode) 3419 { 3420 int ret; 3421 3422 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3423 if (ret && ret != -EEXIST) { 3424 btrfs_abort_transaction(trans, ret); 3425 return ret; 3426 } 3427 3428 return 0; 3429 } 3430 3431 /* 3432 * We have done the delete so we can go ahead and remove the orphan item for 3433 * this particular inode. 3434 */ 3435 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3436 struct btrfs_inode *inode) 3437 { 3438 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3439 } 3440 3441 /* 3442 * this cleans up any orphans that may be left on the list from the last use 3443 * of this root. 
3444 */ 3445 int btrfs_orphan_cleanup(struct btrfs_root *root) 3446 { 3447 struct btrfs_fs_info *fs_info = root->fs_info; 3448 struct btrfs_path *path; 3449 struct extent_buffer *leaf; 3450 struct btrfs_key key, found_key; 3451 struct btrfs_trans_handle *trans; 3452 struct inode *inode; 3453 u64 last_objectid = 0; 3454 int ret = 0, nr_unlink = 0; 3455 3456 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3457 return 0; 3458 3459 path = btrfs_alloc_path(); 3460 if (!path) { 3461 ret = -ENOMEM; 3462 goto out; 3463 } 3464 path->reada = READA_BACK; 3465 3466 key.objectid = BTRFS_ORPHAN_OBJECTID; 3467 key.type = BTRFS_ORPHAN_ITEM_KEY; 3468 key.offset = (u64)-1; 3469 3470 while (1) { 3471 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3472 if (ret < 0) 3473 goto out; 3474 3475 /* 3476 * if ret == 0 means we found what we were searching for, which 3477 * is weird, but possible, so only screw with path if we didn't 3478 * find the key and see if we have stuff that matches 3479 */ 3480 if (ret > 0) { 3481 ret = 0; 3482 if (path->slots[0] == 0) 3483 break; 3484 path->slots[0]--; 3485 } 3486 3487 /* pull out the item */ 3488 leaf = path->nodes[0]; 3489 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3490 3491 /* make sure the item matches what we want */ 3492 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3493 break; 3494 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3495 break; 3496 3497 /* release the path since we're done with it */ 3498 btrfs_release_path(path); 3499 3500 /* 3501 * this is where we are basically btrfs_lookup, without the 3502 * crossing root thing. we store the inode number in the 3503 * offset of the orphan item. 3504 */ 3505 3506 if (found_key.offset == last_objectid) { 3507 btrfs_err(fs_info, 3508 "Error removing orphan entry, stopping orphan cleanup"); 3509 ret = -EINVAL; 3510 goto out; 3511 } 3512 3513 last_objectid = found_key.offset; 3514 3515 found_key.objectid = found_key.offset; 3516 found_key.type = BTRFS_INODE_ITEM_KEY; 3517 found_key.offset = 0; 3518 inode = btrfs_iget(fs_info->sb, last_objectid, root); 3519 ret = PTR_ERR_OR_ZERO(inode); 3520 if (ret && ret != -ENOENT) 3521 goto out; 3522 3523 if (ret == -ENOENT && root == fs_info->tree_root) { 3524 struct btrfs_root *dead_root; 3525 int is_dead_root = 0; 3526 3527 /* 3528 * This is an orphan in the tree root. Currently these 3529 * could come from 2 sources: 3530 * a) a root (snapshot/subvolume) deletion in progress 3531 * b) a free space cache inode 3532 * We need to distinguish those two, as the orphan item 3533 * for a root must not get deleted before the deletion 3534 * of the snapshot/subvolume's tree completes. 3535 * 3536 * btrfs_find_orphan_roots() ran before us, which has 3537 * found all deleted roots and loaded them into 3538 * fs_info->fs_roots_radix. So here we can find if an 3539 * orphan item corresponds to a deleted root by looking 3540 * up the root from that radix tree. 3541 */ 3542 3543 spin_lock(&fs_info->fs_roots_radix_lock); 3544 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3545 (unsigned long)found_key.objectid); 3546 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3547 is_dead_root = 1; 3548 spin_unlock(&fs_info->fs_roots_radix_lock); 3549 3550 if (is_dead_root) { 3551 /* prevent this orphan from being found again */ 3552 key.offset = found_key.objectid - 1; 3553 continue; 3554 } 3555 3556 } 3557 3558 /* 3559 * If we have an inode with links, there are a couple of 3560 * possibilities: 3561 * 3562 * 1. 
We were halfway through creating fsverity metadata for the 3563 * file. In that case, the orphan item represents incomplete 3564 * fsverity metadata which must be cleaned up with 3565 * btrfs_drop_verity_items and deleting the orphan item. 3566 3567 * 2. Old kernels (before v3.12) used to create an 3568 * orphan item for truncate indicating that there were possibly 3569 * extent items past i_size that needed to be deleted. In v3.12, 3570 * truncate was changed to update i_size in sync with the extent 3571 * items, but the (useless) orphan item was still created. Since 3572 * v4.18, we don't create the orphan item for truncate at all. 3573 * 3574 * So, this item could mean that we need to do a truncate, but 3575 * only if this filesystem was last used on a pre-v3.12 kernel 3576 * and was not cleanly unmounted. The odds of that are quite 3577 * slim, and it's a pain to do the truncate now, so just delete 3578 * the orphan item. 3579 * 3580 * It's also possible that this orphan item was supposed to be 3581 * deleted but wasn't. The inode number may have been reused, 3582 * but either way, we can delete the orphan item. 3583 */ 3584 if (ret == -ENOENT || inode->i_nlink) { 3585 if (!ret) { 3586 ret = btrfs_drop_verity_items(BTRFS_I(inode)); 3587 iput(inode); 3588 if (ret) 3589 goto out; 3590 } 3591 trans = btrfs_start_transaction(root, 1); 3592 if (IS_ERR(trans)) { 3593 ret = PTR_ERR(trans); 3594 goto out; 3595 } 3596 btrfs_debug(fs_info, "auto deleting %Lu", 3597 found_key.objectid); 3598 ret = btrfs_del_orphan_item(trans, root, 3599 found_key.objectid); 3600 btrfs_end_transaction(trans); 3601 if (ret) 3602 goto out; 3603 continue; 3604 } 3605 3606 nr_unlink++; 3607 3608 /* this will do delete_inode and everything for us */ 3609 iput(inode); 3610 } 3611 /* release the path since we're done with it */ 3612 btrfs_release_path(path); 3613 3614 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3615 3616 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3617 trans = btrfs_join_transaction(root); 3618 if (!IS_ERR(trans)) 3619 btrfs_end_transaction(trans); 3620 } 3621 3622 if (nr_unlink) 3623 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3624 3625 out: 3626 if (ret) 3627 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3628 btrfs_free_path(path); 3629 return ret; 3630 } 3631 3632 /* 3633 * very simple check to peek ahead in the leaf looking for xattrs. If we 3634 * don't find any xattrs, we know there can't be any acls. 
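 * ACLs are stored as xattrs (XATTR_NAME_POSIX_ACL_ACCESS and
 * XATTR_NAME_POSIX_ACL_DEFAULT), so the scan below simply compares the
 * offset of each BTRFS_XATTR_ITEM_KEY it finds against the name hashes of
 * those two names.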
3635 * 3636 * slot is the slot the inode is in, objectid is the objectid of the inode 3637 */ 3638 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3639 int slot, u64 objectid, 3640 int *first_xattr_slot) 3641 { 3642 u32 nritems = btrfs_header_nritems(leaf); 3643 struct btrfs_key found_key; 3644 static u64 xattr_access = 0; 3645 static u64 xattr_default = 0; 3646 int scanned = 0; 3647 3648 if (!xattr_access) { 3649 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3650 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3651 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3652 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3653 } 3654 3655 slot++; 3656 *first_xattr_slot = -1; 3657 while (slot < nritems) { 3658 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3659 3660 /* we found a different objectid, there must not be acls */ 3661 if (found_key.objectid != objectid) 3662 return 0; 3663 3664 /* we found an xattr, assume we've got an acl */ 3665 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3666 if (*first_xattr_slot == -1) 3667 *first_xattr_slot = slot; 3668 if (found_key.offset == xattr_access || 3669 found_key.offset == xattr_default) 3670 return 1; 3671 } 3672 3673 /* 3674 * we found a key greater than an xattr key, there can't 3675 * be any acls later on 3676 */ 3677 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3678 return 0; 3679 3680 slot++; 3681 scanned++; 3682 3683 /* 3684 * it goes inode, inode backrefs, xattrs, extents, 3685 * so if there are a ton of hard links to an inode there can 3686 * be a lot of backrefs. Don't waste time searching too hard, 3687 * this is just an optimization 3688 */ 3689 if (scanned >= 8) 3690 break; 3691 } 3692 /* we hit the end of the leaf before we found an xattr or 3693 * something larger than an xattr. 
We have to assume the inode 3694 * has acls 3695 */ 3696 if (*first_xattr_slot == -1) 3697 *first_xattr_slot = slot; 3698 return 1; 3699 } 3700 3701 /* 3702 * read an inode from the btree into the in-memory inode 3703 */ 3704 static int btrfs_read_locked_inode(struct inode *inode, 3705 struct btrfs_path *in_path) 3706 { 3707 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3708 struct btrfs_path *path = in_path; 3709 struct extent_buffer *leaf; 3710 struct btrfs_inode_item *inode_item; 3711 struct btrfs_root *root = BTRFS_I(inode)->root; 3712 struct btrfs_key location; 3713 unsigned long ptr; 3714 int maybe_acls; 3715 u32 rdev; 3716 int ret; 3717 bool filled = false; 3718 int first_xattr_slot; 3719 3720 ret = btrfs_fill_inode(inode, &rdev); 3721 if (!ret) 3722 filled = true; 3723 3724 if (!path) { 3725 path = btrfs_alloc_path(); 3726 if (!path) 3727 return -ENOMEM; 3728 } 3729 3730 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3731 3732 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3733 if (ret) { 3734 if (path != in_path) 3735 btrfs_free_path(path); 3736 return ret; 3737 } 3738 3739 leaf = path->nodes[0]; 3740 3741 if (filled) 3742 goto cache_index; 3743 3744 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3745 struct btrfs_inode_item); 3746 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3747 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3748 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3749 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3750 btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); 3751 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, 3752 round_up(i_size_read(inode), fs_info->sectorsize)); 3753 3754 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3755 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3756 3757 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3758 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3759 3760 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3761 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3762 3763 BTRFS_I(inode)->i_otime.tv_sec = 3764 btrfs_timespec_sec(leaf, &inode_item->otime); 3765 BTRFS_I(inode)->i_otime.tv_nsec = 3766 btrfs_timespec_nsec(leaf, &inode_item->otime); 3767 3768 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3769 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3770 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3771 3772 inode_set_iversion_queried(inode, 3773 btrfs_inode_sequence(leaf, inode_item)); 3774 inode->i_generation = BTRFS_I(inode)->generation; 3775 inode->i_rdev = 0; 3776 rdev = btrfs_inode_rdev(leaf, inode_item); 3777 3778 BTRFS_I(inode)->index_cnt = (u64)-1; 3779 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), 3780 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); 3781 3782 cache_index: 3783 /* 3784 * If we were modified in the current generation and evicted from memory 3785 * and then re-read we need to do a full sync since we don't have any 3786 * idea about which extents were modified before we were evicted from 3787 * cache. 3788 * 3789 * This is required for both inode re-read from disk and delayed inode 3790 * in delayed_nodes_tree. 
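 *
 * Roughly, the case being guarded against looks like:
 *
 *	write to foo			(modified in the running transaction)
 *	echo 2 > /proc/sys/vm/drop_caches	# evicts inode
 *	read foo			(re-read here, same transaction)
 *	xfs_io -c fsync foo
 *
 * At fsync time we no longer know which extents were modified earlier in
 * the transaction, so the flag makes the fsync log the whole inode rather
 * than rely on the in-memory list of modified extents.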
3791 */ 3792 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3793 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3794 &BTRFS_I(inode)->runtime_flags); 3795 3796 /* 3797 * We don't persist the id of the transaction where an unlink operation 3798 * against the inode was last made. So here we assume the inode might 3799 * have been evicted, and therefore the exact value of last_unlink_trans 3800 * was lost, and set it to last_trans to avoid metadata inconsistencies 3801 * between the inode and its parent if the inode is fsync'ed and the log 3802 * is replayed. For example, in the scenario: 3803 * 3804 * touch mydir/foo 3805 * ln mydir/foo mydir/bar 3806 * sync 3807 * unlink mydir/bar 3808 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3809 * xfs_io -c fsync mydir/foo 3810 * <power failure> 3811 * mount fs, triggers fsync log replay 3812 * 3813 * We must make sure that when we fsync our inode foo we also log its 3814 * parent inode, otherwise after log replay the parent still has the 3815 * dentry with the "bar" name but our inode foo has a link count of 1 3816 * and doesn't have an inode ref with the name "bar" anymore. 3817 * 3818 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3819 * but it guarantees correctness at the expense of occasional full 3820 * transaction commits on fsync if our inode is a directory, or if our 3821 * inode is not a directory, logging its parent unnecessarily. 3822 */ 3823 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3824 3825 /* 3826 * Same logic as for last_unlink_trans. We don't persist the generation 3827 * of the last transaction where this inode was used for a reflink 3828 * operation, so after eviction and reloading the inode we must be 3829 * pessimistic and assume it was reflinked in the last transaction that modified the inode.
3830 */ 3831 BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; 3832 3833 path->slots[0]++; 3834 if (inode->i_nlink != 1 || 3835 path->slots[0] >= btrfs_header_nritems(leaf)) 3836 goto cache_acl; 3837 3838 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3839 if (location.objectid != btrfs_ino(BTRFS_I(inode))) 3840 goto cache_acl; 3841 3842 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3843 if (location.type == BTRFS_INODE_REF_KEY) { 3844 struct btrfs_inode_ref *ref; 3845 3846 ref = (struct btrfs_inode_ref *)ptr; 3847 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3848 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3849 struct btrfs_inode_extref *extref; 3850 3851 extref = (struct btrfs_inode_extref *)ptr; 3852 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3853 extref); 3854 } 3855 cache_acl: 3856 /* 3857 * try to precache a NULL acl entry for files that don't have 3858 * any xattrs or acls 3859 */ 3860 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3861 btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); 3862 if (first_xattr_slot != -1) { 3863 path->slots[0] = first_xattr_slot; 3864 ret = btrfs_load_inode_props(inode, path); 3865 if (ret) 3866 btrfs_err(fs_info, 3867 "error loading props for ino %llu (root %llu): %d", 3868 btrfs_ino(BTRFS_I(inode)), 3869 root->root_key.objectid, ret); 3870 } 3871 if (path != in_path) 3872 btrfs_free_path(path); 3873 3874 if (!maybe_acls) 3875 cache_no_acl(inode); 3876 3877 switch (inode->i_mode & S_IFMT) { 3878 case S_IFREG: 3879 inode->i_mapping->a_ops = &btrfs_aops; 3880 inode->i_fop = &btrfs_file_operations; 3881 inode->i_op = &btrfs_file_inode_operations; 3882 break; 3883 case S_IFDIR: 3884 inode->i_fop = &btrfs_dir_file_operations; 3885 inode->i_op = &btrfs_dir_inode_operations; 3886 break; 3887 case S_IFLNK: 3888 inode->i_op = &btrfs_symlink_inode_operations; 3889 inode_nohighmem(inode); 3890 inode->i_mapping->a_ops = &btrfs_aops; 3891 break; 3892 default: 3893 inode->i_op = &btrfs_special_inode_operations; 3894 init_special_inode(inode, inode->i_mode, rdev); 3895 break; 3896 } 3897 3898 btrfs_sync_inode_flags_to_i_flags(inode); 3899 return 0; 3900 } 3901 3902 /* 3903 * given a leaf and an inode, copy the inode fields into the leaf 3904 */ 3905 static void fill_inode_item(struct btrfs_trans_handle *trans, 3906 struct extent_buffer *leaf, 3907 struct btrfs_inode_item *item, 3908 struct inode *inode) 3909 { 3910 struct btrfs_map_token token; 3911 u64 flags; 3912 3913 btrfs_init_map_token(&token, leaf); 3914 3915 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3916 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3917 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); 3918 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3919 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3920 3921 btrfs_set_token_timespec_sec(&token, &item->atime, 3922 inode->i_atime.tv_sec); 3923 btrfs_set_token_timespec_nsec(&token, &item->atime, 3924 inode->i_atime.tv_nsec); 3925 3926 btrfs_set_token_timespec_sec(&token, &item->mtime, 3927 inode->i_mtime.tv_sec); 3928 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3929 inode->i_mtime.tv_nsec); 3930 3931 btrfs_set_token_timespec_sec(&token, &item->ctime, 3932 inode->i_ctime.tv_sec); 3933 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3934 inode->i_ctime.tv_nsec); 3935 3936 btrfs_set_token_timespec_sec(&token, &item->otime, 3937 BTRFS_I(inode)->i_otime.tv_sec); 3938 
btrfs_set_token_timespec_nsec(&token, &item->otime, 3939 BTRFS_I(inode)->i_otime.tv_nsec); 3940 3941 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); 3942 btrfs_set_token_inode_generation(&token, item, 3943 BTRFS_I(inode)->generation); 3944 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3945 btrfs_set_token_inode_transid(&token, item, trans->transid); 3946 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3947 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 3948 BTRFS_I(inode)->ro_flags); 3949 btrfs_set_token_inode_flags(&token, item, flags); 3950 btrfs_set_token_inode_block_group(&token, item, 0); 3951 } 3952 3953 /* 3954 * copy everything in the in-memory inode into the btree. 3955 */ 3956 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3957 struct btrfs_root *root, 3958 struct btrfs_inode *inode) 3959 { 3960 struct btrfs_inode_item *inode_item; 3961 struct btrfs_path *path; 3962 struct extent_buffer *leaf; 3963 int ret; 3964 3965 path = btrfs_alloc_path(); 3966 if (!path) 3967 return -ENOMEM; 3968 3969 ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); 3970 if (ret) { 3971 if (ret > 0) 3972 ret = -ENOENT; 3973 goto failed; 3974 } 3975 3976 leaf = path->nodes[0]; 3977 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3978 struct btrfs_inode_item); 3979 3980 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); 3981 btrfs_mark_buffer_dirty(leaf); 3982 btrfs_set_inode_last_trans(trans, inode); 3983 ret = 0; 3984 failed: 3985 btrfs_free_path(path); 3986 return ret; 3987 } 3988 3989 /* 3990 * copy everything in the in-memory inode into the btree. 3991 */ 3992 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3993 struct btrfs_root *root, 3994 struct btrfs_inode *inode) 3995 { 3996 struct btrfs_fs_info *fs_info = root->fs_info; 3997 int ret; 3998 3999 /* 4000 * If the inode is a free space inode, we can deadlock during commit 4001 * if we put it into the delayed code. 4002 * 4003 * The data relocation inode should also be directly updated 4004 * without delay 4005 */ 4006 if (!btrfs_is_free_space_inode(inode) 4007 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 4008 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 4009 btrfs_update_root_times(trans, root); 4010 4011 ret = btrfs_delayed_update_inode(trans, root, inode); 4012 if (!ret) 4013 btrfs_set_inode_last_trans(trans, inode); 4014 return ret; 4015 } 4016 4017 return btrfs_update_inode_item(trans, root, inode); 4018 } 4019 4020 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 4021 struct btrfs_root *root, struct btrfs_inode *inode) 4022 { 4023 int ret; 4024 4025 ret = btrfs_update_inode(trans, root, inode); 4026 if (ret == -ENOSPC) 4027 return btrfs_update_inode_item(trans, root, inode); 4028 return ret; 4029 } 4030 4031 /* 4032 * unlink helper that gets used here in inode.c and in the tree logging 4033 * recovery code. 
It remove a link in a directory with a given name, and 4034 * also drops the back refs in the inode to the directory 4035 */ 4036 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4037 struct btrfs_root *root, 4038 struct btrfs_inode *dir, 4039 struct btrfs_inode *inode, 4040 const char *name, int name_len) 4041 { 4042 struct btrfs_fs_info *fs_info = root->fs_info; 4043 struct btrfs_path *path; 4044 int ret = 0; 4045 struct btrfs_dir_item *di; 4046 u64 index; 4047 u64 ino = btrfs_ino(inode); 4048 u64 dir_ino = btrfs_ino(dir); 4049 4050 path = btrfs_alloc_path(); 4051 if (!path) { 4052 ret = -ENOMEM; 4053 goto out; 4054 } 4055 4056 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4057 name, name_len, -1); 4058 if (IS_ERR_OR_NULL(di)) { 4059 ret = di ? PTR_ERR(di) : -ENOENT; 4060 goto err; 4061 } 4062 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4063 if (ret) 4064 goto err; 4065 btrfs_release_path(path); 4066 4067 /* 4068 * If we don't have dir index, we have to get it by looking up 4069 * the inode ref, since we get the inode ref, remove it directly, 4070 * it is unnecessary to do delayed deletion. 4071 * 4072 * But if we have dir index, needn't search inode ref to get it. 4073 * Since the inode ref is close to the inode item, it is better 4074 * that we delay to delete it, and just do this deletion when 4075 * we update the inode item. 4076 */ 4077 if (inode->dir_index) { 4078 ret = btrfs_delayed_delete_inode_ref(inode); 4079 if (!ret) { 4080 index = inode->dir_index; 4081 goto skip_backref; 4082 } 4083 } 4084 4085 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 4086 dir_ino, &index); 4087 if (ret) { 4088 btrfs_info(fs_info, 4089 "failed to delete reference to %.*s, inode %llu parent %llu", 4090 name_len, name, ino, dir_ino); 4091 btrfs_abort_transaction(trans, ret); 4092 goto err; 4093 } 4094 skip_backref: 4095 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4096 if (ret) { 4097 btrfs_abort_transaction(trans, ret); 4098 goto err; 4099 } 4100 4101 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 4102 dir_ino); 4103 if (ret != 0 && ret != -ENOENT) { 4104 btrfs_abort_transaction(trans, ret); 4105 goto err; 4106 } 4107 4108 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, 4109 index); 4110 if (ret == -ENOENT) 4111 ret = 0; 4112 else if (ret) 4113 btrfs_abort_transaction(trans, ret); 4114 4115 /* 4116 * If we have a pending delayed iput we could end up with the final iput 4117 * being run in btrfs-cleaner context. If we have enough of these built 4118 * up we can end up burning a lot of time in btrfs-cleaner without any 4119 * way to throttle the unlinks. Since we're currently holding a ref on 4120 * the inode we can run the delayed iput here without any issues as the 4121 * final iput won't be done until after we drop the ref we're currently 4122 * holding. 
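 *
 * Note that btrfs_run_delayed_iput() only does work if this inode is
 * already queued on fs_info->delayed_iputs; otherwise it is a no-op.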
4123 */ 4124 btrfs_run_delayed_iput(fs_info, inode); 4125 err: 4126 btrfs_free_path(path); 4127 if (ret) 4128 goto out; 4129 4130 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); 4131 inode_inc_iversion(&inode->vfs_inode); 4132 inode_inc_iversion(&dir->vfs_inode); 4133 inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = 4134 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); 4135 ret = btrfs_update_inode(trans, root, dir); 4136 out: 4137 return ret; 4138 } 4139 4140 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4141 struct btrfs_root *root, 4142 struct btrfs_inode *dir, struct btrfs_inode *inode, 4143 const char *name, int name_len) 4144 { 4145 int ret; 4146 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 4147 if (!ret) { 4148 drop_nlink(&inode->vfs_inode); 4149 ret = btrfs_update_inode(trans, root, inode); 4150 } 4151 return ret; 4152 } 4153 4154 /* 4155 * helper to start transaction for unlink and rmdir. 4156 * 4157 * unlink and rmdir are special in btrfs, they do not always free space, so 4158 * if we cannot make our reservations the normal way try and see if there is 4159 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4160 * allow the unlink to occur. 4161 */ 4162 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 4163 { 4164 struct btrfs_root *root = BTRFS_I(dir)->root; 4165 4166 /* 4167 * 1 for the possible orphan item 4168 * 1 for the dir item 4169 * 1 for the dir index 4170 * 1 for the inode ref 4171 * 1 for the inode 4172 */ 4173 return btrfs_start_transaction_fallback_global_rsv(root, 5); 4174 } 4175 4176 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4177 { 4178 struct btrfs_root *root = BTRFS_I(dir)->root; 4179 struct btrfs_trans_handle *trans; 4180 struct inode *inode = d_inode(dentry); 4181 int ret; 4182 4183 trans = __unlink_start_trans(dir); 4184 if (IS_ERR(trans)) 4185 return PTR_ERR(trans); 4186 4187 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4188 0); 4189 4190 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 4191 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4192 dentry->d_name.len); 4193 if (ret) 4194 goto out; 4195 4196 if (inode->i_nlink == 0) { 4197 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4198 if (ret) 4199 goto out; 4200 } 4201 4202 out: 4203 btrfs_end_transaction(trans); 4204 btrfs_btree_balance_dirty(root->fs_info); 4205 return ret; 4206 } 4207 4208 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4209 struct inode *dir, struct dentry *dentry) 4210 { 4211 struct btrfs_root *root = BTRFS_I(dir)->root; 4212 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 4213 struct btrfs_path *path; 4214 struct extent_buffer *leaf; 4215 struct btrfs_dir_item *di; 4216 struct btrfs_key key; 4217 const char *name = dentry->d_name.name; 4218 int name_len = dentry->d_name.len; 4219 u64 index; 4220 int ret; 4221 u64 objectid; 4222 u64 dir_ino = btrfs_ino(BTRFS_I(dir)); 4223 4224 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 4225 objectid = inode->root->root_key.objectid; 4226 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4227 objectid = inode->location.objectid; 4228 } else { 4229 WARN_ON(1); 4230 return -EINVAL; 4231 } 4232 4233 path = btrfs_alloc_path(); 4234 if (!path) 4235 return -ENOMEM; 4236 4237 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4238 name, name_len, -1); 4239 if (IS_ERR_OR_NULL(di)) { 4240 ret = di ? 
PTR_ERR(di) : -ENOENT; 4241 goto out; 4242 } 4243 4244 leaf = path->nodes[0]; 4245 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4246 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4247 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4248 if (ret) { 4249 btrfs_abort_transaction(trans, ret); 4250 goto out; 4251 } 4252 btrfs_release_path(path); 4253 4254 /* 4255 * This is a placeholder inode for a subvolume we didn't have a 4256 * reference to at the time of the snapshot creation. In the meantime 4257 * we could have renamed the real subvol link into our snapshot, so 4258 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 4259 * Instead simply lookup the dir_index_item for this entry so we can 4260 * remove it. Otherwise we know we have a ref to the root and we can 4261 * call btrfs_del_root_ref, and it _shouldn't_ fail. 4262 */ 4263 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4264 di = btrfs_search_dir_index_item(root, path, dir_ino, 4265 name, name_len); 4266 if (IS_ERR_OR_NULL(di)) { 4267 if (!di) 4268 ret = -ENOENT; 4269 else 4270 ret = PTR_ERR(di); 4271 btrfs_abort_transaction(trans, ret); 4272 goto out; 4273 } 4274 4275 leaf = path->nodes[0]; 4276 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4277 index = key.offset; 4278 btrfs_release_path(path); 4279 } else { 4280 ret = btrfs_del_root_ref(trans, objectid, 4281 root->root_key.objectid, dir_ino, 4282 &index, name, name_len); 4283 if (ret) { 4284 btrfs_abort_transaction(trans, ret); 4285 goto out; 4286 } 4287 } 4288 4289 ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); 4290 if (ret) { 4291 btrfs_abort_transaction(trans, ret); 4292 goto out; 4293 } 4294 4295 btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); 4296 inode_inc_iversion(dir); 4297 dir->i_mtime = dir->i_ctime = current_time(dir); 4298 ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); 4299 if (ret) 4300 btrfs_abort_transaction(trans, ret); 4301 out: 4302 btrfs_free_path(path); 4303 return ret; 4304 } 4305 4306 /* 4307 * Helper to check if the subvolume references other subvolumes or if it's 4308 * default. 
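 *
 * Concretely, based on the lookups below: returns -EPERM if this root is
 * set as the default subvolume, -ENOTEMPTY if a BTRFS_ROOT_REF_KEY item
 * shows it still references another subvolume, 0 if it is safe to destroy,
 * or a negative error from the tree searches themselves.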
4309 */ 4310 static noinline int may_destroy_subvol(struct btrfs_root *root) 4311 { 4312 struct btrfs_fs_info *fs_info = root->fs_info; 4313 struct btrfs_path *path; 4314 struct btrfs_dir_item *di; 4315 struct btrfs_key key; 4316 u64 dir_id; 4317 int ret; 4318 4319 path = btrfs_alloc_path(); 4320 if (!path) 4321 return -ENOMEM; 4322 4323 /* Make sure this root isn't set as the default subvol */ 4324 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4325 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4326 dir_id, "default", 7, 0); 4327 if (di && !IS_ERR(di)) { 4328 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4329 if (key.objectid == root->root_key.objectid) { 4330 ret = -EPERM; 4331 btrfs_err(fs_info, 4332 "deleting default subvolume %llu is not allowed", 4333 key.objectid); 4334 goto out; 4335 } 4336 btrfs_release_path(path); 4337 } 4338 4339 key.objectid = root->root_key.objectid; 4340 key.type = BTRFS_ROOT_REF_KEY; 4341 key.offset = (u64)-1; 4342 4343 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4344 if (ret < 0) 4345 goto out; 4346 BUG_ON(ret == 0); 4347 4348 ret = 0; 4349 if (path->slots[0] > 0) { 4350 path->slots[0]--; 4351 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4352 if (key.objectid == root->root_key.objectid && 4353 key.type == BTRFS_ROOT_REF_KEY) 4354 ret = -ENOTEMPTY; 4355 } 4356 out: 4357 btrfs_free_path(path); 4358 return ret; 4359 } 4360 4361 /* Delete all dentries for inodes belonging to the root */ 4362 static void btrfs_prune_dentries(struct btrfs_root *root) 4363 { 4364 struct btrfs_fs_info *fs_info = root->fs_info; 4365 struct rb_node *node; 4366 struct rb_node *prev; 4367 struct btrfs_inode *entry; 4368 struct inode *inode; 4369 u64 objectid = 0; 4370 4371 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 4372 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4373 4374 spin_lock(&root->inode_lock); 4375 again: 4376 node = root->inode_tree.rb_node; 4377 prev = NULL; 4378 while (node) { 4379 prev = node; 4380 entry = rb_entry(node, struct btrfs_inode, rb_node); 4381 4382 if (objectid < btrfs_ino(entry)) 4383 node = node->rb_left; 4384 else if (objectid > btrfs_ino(entry)) 4385 node = node->rb_right; 4386 else 4387 break; 4388 } 4389 if (!node) { 4390 while (prev) { 4391 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4392 if (objectid <= btrfs_ino(entry)) { 4393 node = prev; 4394 break; 4395 } 4396 prev = rb_next(prev); 4397 } 4398 } 4399 while (node) { 4400 entry = rb_entry(node, struct btrfs_inode, rb_node); 4401 objectid = btrfs_ino(entry) + 1; 4402 inode = igrab(&entry->vfs_inode); 4403 if (inode) { 4404 spin_unlock(&root->inode_lock); 4405 if (atomic_read(&inode->i_count) > 1) 4406 d_prune_aliases(inode); 4407 /* 4408 * btrfs_drop_inode will have it removed from the inode 4409 * cache when its usage count hits zero. 
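 *
 * The iput() below drops the reference taken with igrab() above; once
 * nothing else holds the inode, eviction can then free it.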
4410 */ 4411 iput(inode); 4412 cond_resched(); 4413 spin_lock(&root->inode_lock); 4414 goto again; 4415 } 4416 4417 if (cond_resched_lock(&root->inode_lock)) 4418 goto again; 4419 4420 node = rb_next(node); 4421 } 4422 spin_unlock(&root->inode_lock); 4423 } 4424 4425 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) 4426 { 4427 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 4428 struct btrfs_root *root = BTRFS_I(dir)->root; 4429 struct inode *inode = d_inode(dentry); 4430 struct btrfs_root *dest = BTRFS_I(inode)->root; 4431 struct btrfs_trans_handle *trans; 4432 struct btrfs_block_rsv block_rsv; 4433 u64 root_flags; 4434 int ret; 4435 4436 /* 4437 * Don't allow to delete a subvolume with send in progress. This is 4438 * inside the inode lock so the error handling that has to drop the bit 4439 * again is not run concurrently. 4440 */ 4441 spin_lock(&dest->root_item_lock); 4442 if (dest->send_in_progress) { 4443 spin_unlock(&dest->root_item_lock); 4444 btrfs_warn(fs_info, 4445 "attempt to delete subvolume %llu during send", 4446 dest->root_key.objectid); 4447 return -EPERM; 4448 } 4449 root_flags = btrfs_root_flags(&dest->root_item); 4450 btrfs_set_root_flags(&dest->root_item, 4451 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4452 spin_unlock(&dest->root_item_lock); 4453 4454 down_write(&fs_info->subvol_sem); 4455 4456 ret = may_destroy_subvol(dest); 4457 if (ret) 4458 goto out_up_write; 4459 4460 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4461 /* 4462 * One for dir inode, 4463 * two for dir entries, 4464 * two for root ref/backref. 4465 */ 4466 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4467 if (ret) 4468 goto out_up_write; 4469 4470 trans = btrfs_start_transaction(root, 0); 4471 if (IS_ERR(trans)) { 4472 ret = PTR_ERR(trans); 4473 goto out_release; 4474 } 4475 trans->block_rsv = &block_rsv; 4476 trans->bytes_reserved = block_rsv.size; 4477 4478 btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); 4479 4480 ret = btrfs_unlink_subvol(trans, dir, dentry); 4481 if (ret) { 4482 btrfs_abort_transaction(trans, ret); 4483 goto out_end_trans; 4484 } 4485 4486 ret = btrfs_record_root_in_trans(trans, dest); 4487 if (ret) { 4488 btrfs_abort_transaction(trans, ret); 4489 goto out_end_trans; 4490 } 4491 4492 memset(&dest->root_item.drop_progress, 0, 4493 sizeof(dest->root_item.drop_progress)); 4494 btrfs_set_root_drop_level(&dest->root_item, 0); 4495 btrfs_set_root_refs(&dest->root_item, 0); 4496 4497 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4498 ret = btrfs_insert_orphan_item(trans, 4499 fs_info->tree_root, 4500 dest->root_key.objectid); 4501 if (ret) { 4502 btrfs_abort_transaction(trans, ret); 4503 goto out_end_trans; 4504 } 4505 } 4506 4507 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4508 BTRFS_UUID_KEY_SUBVOL, 4509 dest->root_key.objectid); 4510 if (ret && ret != -ENOENT) { 4511 btrfs_abort_transaction(trans, ret); 4512 goto out_end_trans; 4513 } 4514 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4515 ret = btrfs_uuid_tree_remove(trans, 4516 dest->root_item.received_uuid, 4517 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4518 dest->root_key.objectid); 4519 if (ret && ret != -ENOENT) { 4520 btrfs_abort_transaction(trans, ret); 4521 goto out_end_trans; 4522 } 4523 } 4524 4525 free_anon_bdev(dest->anon_dev); 4526 dest->anon_dev = 0; 4527 out_end_trans: 4528 trans->block_rsv = NULL; 4529 trans->bytes_reserved = 0; 4530 ret = btrfs_end_transaction(trans); 4531 inode->i_flags |= S_DEAD; 4532 out_release: 
4533 btrfs_subvolume_release_metadata(root, &block_rsv); 4534 out_up_write: 4535 up_write(&fs_info->subvol_sem); 4536 if (ret) { 4537 spin_lock(&dest->root_item_lock); 4538 root_flags = btrfs_root_flags(&dest->root_item); 4539 btrfs_set_root_flags(&dest->root_item, 4540 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4541 spin_unlock(&dest->root_item_lock); 4542 } else { 4543 d_invalidate(dentry); 4544 btrfs_prune_dentries(dest); 4545 ASSERT(dest->send_in_progress == 0); 4546 } 4547 4548 return ret; 4549 } 4550 4551 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4552 { 4553 struct inode *inode = d_inode(dentry); 4554 int err = 0; 4555 struct btrfs_root *root = BTRFS_I(dir)->root; 4556 struct btrfs_trans_handle *trans; 4557 u64 last_unlink_trans; 4558 4559 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4560 return -ENOTEMPTY; 4561 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) 4562 return btrfs_delete_subvolume(dir, dentry); 4563 4564 trans = __unlink_start_trans(dir); 4565 if (IS_ERR(trans)) 4566 return PTR_ERR(trans); 4567 4568 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4569 err = btrfs_unlink_subvol(trans, dir, dentry); 4570 goto out; 4571 } 4572 4573 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4574 if (err) 4575 goto out; 4576 4577 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4578 4579 /* now the directory is empty */ 4580 err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 4581 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4582 dentry->d_name.len); 4583 if (!err) { 4584 btrfs_i_size_write(BTRFS_I(inode), 0); 4585 /* 4586 * Propagate the last_unlink_trans value of the deleted dir to 4587 * its parent directory. This is to prevent an unrecoverable 4588 * log tree in the case we do something like this: 4589 * 1) create dir foo 4590 * 2) create snapshot under dir foo 4591 * 3) delete the snapshot 4592 * 4) rmdir foo 4593 * 5) mkdir foo 4594 * 6) fsync foo or some file inside foo 4595 */ 4596 if (last_unlink_trans >= trans->transid) 4597 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4598 } 4599 out: 4600 btrfs_end_transaction(trans); 4601 btrfs_btree_balance_dirty(root->fs_info); 4602 4603 return err; 4604 } 4605 4606 /* 4607 * Return this if we need to call truncate_block for the last bit of the 4608 * truncate. 4609 */ 4610 #define NEED_TRUNCATE_BLOCK 1 4611 4612 /* 4613 * Remove inode items from a given root. 4614 * 4615 * @trans: A transaction handle. 4616 * @root: The root from which to remove items. 4617 * @inode: The inode whose items we want to remove. 4618 * @new_size: The new i_size for the inode. This is only applicable when 4619 * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. 4620 * @min_type: The minimum key type to remove. All keys with a type 4621 * greater than this value are removed and all keys with 4622 * this type are removed only if their offset is >= @new_size. 4623 * @extents_found: Output parameter that will contain the number of file 4624 * extent items that were removed or adjusted to the new 4625 * inode i_size. The caller is responsible for initializing 4626 * the counter. Also, it can be NULL if the caller does not 4627 * need this counter. 4628 * 4629 * Remove all keys associated with the inode from the given root that have a key 4630 * with a type greater than or equals to @min_type. When @min_type has a value of 4631 * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value 4632 * greater than or equals to @new_size. 
If a file extent item that starts before 4633 * @new_size and ends after it is found, its length is adjusted. 4634 * 4635 * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is 4636 * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. 4637 */ 4638 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4639 struct btrfs_root *root, 4640 struct btrfs_inode *inode, 4641 u64 new_size, u32 min_type, 4642 u64 *extents_found) 4643 { 4644 struct btrfs_fs_info *fs_info = root->fs_info; 4645 struct btrfs_path *path; 4646 struct extent_buffer *leaf; 4647 struct btrfs_file_extent_item *fi; 4648 struct btrfs_key key; 4649 struct btrfs_key found_key; 4650 u64 extent_start = 0; 4651 u64 extent_num_bytes = 0; 4652 u64 extent_offset = 0; 4653 u64 item_end = 0; 4654 u64 last_size = new_size; 4655 u32 found_type = (u8)-1; 4656 int found_extent; 4657 int del_item; 4658 int pending_del_nr = 0; 4659 int pending_del_slot = 0; 4660 int extent_type = -1; 4661 int ret; 4662 u64 ino = btrfs_ino(inode); 4663 u64 bytes_deleted = 0; 4664 bool be_nice = false; 4665 bool should_throttle = false; 4666 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); 4667 struct extent_state *cached_state = NULL; 4668 4669 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4670 4671 /* 4672 * For non-free space inodes and non-shareable roots, we want to back 4673 * off from time to time. This means all inodes in subvolume roots, 4674 * reloc roots, and data reloc roots. 4675 */ 4676 if (!btrfs_is_free_space_inode(inode) && 4677 test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 4678 be_nice = true; 4679 4680 path = btrfs_alloc_path(); 4681 if (!path) 4682 return -ENOMEM; 4683 path->reada = READA_BACK; 4684 4685 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4686 lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, 4687 &cached_state); 4688 4689 /* 4690 * We want to drop from the next block forward in case this 4691 * new size is not block aligned since we will be keeping the 4692 * last block of the extent just the way it is. 4693 */ 4694 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4695 fs_info->sectorsize), 4696 (u64)-1, 0); 4697 } 4698 4699 /* 4700 * This function is also used to drop the items in the log tree before 4701 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4702 * it is used to drop the logged items. So we shouldn't kill the delayed 4703 * items. 4704 */ 4705 if (min_type == 0 && root == inode->root) 4706 btrfs_kill_delayed_inode_items(inode); 4707 4708 key.objectid = ino; 4709 key.offset = (u64)-1; 4710 key.type = (u8)-1; 4711 4712 search_again: 4713 /* 4714 * with a 16K leaf size and 128MB extents, you can actually queue 4715 * up a huge file in a single leaf. 
Most of the time that 4716 * bytes_deleted is > 0, it will be huge by the time we get here 4717 */ 4718 if (be_nice && bytes_deleted > SZ_32M && 4719 btrfs_should_end_transaction(trans)) { 4720 ret = -EAGAIN; 4721 goto out; 4722 } 4723 4724 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4725 if (ret < 0) 4726 goto out; 4727 4728 if (ret > 0) { 4729 ret = 0; 4730 /* there are no items in the tree for us to truncate, we're 4731 * done 4732 */ 4733 if (path->slots[0] == 0) 4734 goto out; 4735 path->slots[0]--; 4736 } 4737 4738 while (1) { 4739 u64 clear_start = 0, clear_len = 0; 4740 4741 fi = NULL; 4742 leaf = path->nodes[0]; 4743 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4744 found_type = found_key.type; 4745 4746 if (found_key.objectid != ino) 4747 break; 4748 4749 if (found_type < min_type) 4750 break; 4751 4752 item_end = found_key.offset; 4753 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4754 fi = btrfs_item_ptr(leaf, path->slots[0], 4755 struct btrfs_file_extent_item); 4756 extent_type = btrfs_file_extent_type(leaf, fi); 4757 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4758 item_end += 4759 btrfs_file_extent_num_bytes(leaf, fi); 4760 4761 trace_btrfs_truncate_show_fi_regular( 4762 inode, leaf, fi, found_key.offset); 4763 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4764 item_end += btrfs_file_extent_ram_bytes(leaf, 4765 fi); 4766 4767 trace_btrfs_truncate_show_fi_inline( 4768 inode, leaf, fi, path->slots[0], 4769 found_key.offset); 4770 } 4771 item_end--; 4772 } 4773 if (found_type > min_type) { 4774 del_item = 1; 4775 } else { 4776 if (item_end < new_size) 4777 break; 4778 if (found_key.offset >= new_size) 4779 del_item = 1; 4780 else 4781 del_item = 0; 4782 } 4783 found_extent = 0; 4784 /* FIXME, shrink the extent if the ref count is only 1 */ 4785 if (found_type != BTRFS_EXTENT_DATA_KEY) 4786 goto delete; 4787 4788 if (extents_found != NULL) 4789 (*extents_found)++; 4790 4791 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4792 u64 num_dec; 4793 4794 clear_start = found_key.offset; 4795 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4796 if (!del_item) { 4797 u64 orig_num_bytes = 4798 btrfs_file_extent_num_bytes(leaf, fi); 4799 extent_num_bytes = ALIGN(new_size - 4800 found_key.offset, 4801 fs_info->sectorsize); 4802 clear_start = ALIGN(new_size, fs_info->sectorsize); 4803 btrfs_set_file_extent_num_bytes(leaf, fi, 4804 extent_num_bytes); 4805 num_dec = (orig_num_bytes - 4806 extent_num_bytes); 4807 if (test_bit(BTRFS_ROOT_SHAREABLE, 4808 &root->state) && 4809 extent_start != 0) 4810 inode_sub_bytes(&inode->vfs_inode, 4811 num_dec); 4812 btrfs_mark_buffer_dirty(leaf); 4813 } else { 4814 extent_num_bytes = 4815 btrfs_file_extent_disk_num_bytes(leaf, 4816 fi); 4817 extent_offset = found_key.offset - 4818 btrfs_file_extent_offset(leaf, fi); 4819 4820 /* FIXME blocksize != 4096 */ 4821 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4822 if (extent_start != 0) { 4823 found_extent = 1; 4824 if (test_bit(BTRFS_ROOT_SHAREABLE, 4825 &root->state)) 4826 inode_sub_bytes(&inode->vfs_inode, 4827 num_dec); 4828 } 4829 } 4830 clear_len = num_dec; 4831 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4832 /* 4833 * we can't truncate inline items that have had 4834 * special encodings 4835 */ 4836 if (!del_item && 4837 btrfs_file_extent_encryption(leaf, fi) == 0 && 4838 btrfs_file_extent_other_encoding(leaf, fi) == 0 && 4839 btrfs_file_extent_compression(leaf, fi) == 0) { 4840 u32 size = (u32)(new_size - found_key.offset); 4841 4842 
btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4843 size = btrfs_file_extent_calc_inline_size(size); 4844 btrfs_truncate_item(path, size, 1); 4845 } else if (!del_item) { 4846 /* 4847 * We have to bail so the last_size is set to 4848 * just before this extent. 4849 */ 4850 ret = NEED_TRUNCATE_BLOCK; 4851 break; 4852 } else { 4853 /* 4854 * Inline extents are special, we just treat 4855 * them as a full sector worth in the file 4856 * extent tree just for simplicity sake. 4857 */ 4858 clear_len = fs_info->sectorsize; 4859 } 4860 4861 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 4862 inode_sub_bytes(&inode->vfs_inode, 4863 item_end + 1 - new_size); 4864 } 4865 delete: 4866 /* 4867 * We use btrfs_truncate_inode_items() to clean up log trees for 4868 * multiple fsyncs, and in this case we don't want to clear the 4869 * file extent range because it's just the log. 4870 */ 4871 if (root == inode->root) { 4872 ret = btrfs_inode_clear_file_extent_range(inode, 4873 clear_start, clear_len); 4874 if (ret) { 4875 btrfs_abort_transaction(trans, ret); 4876 break; 4877 } 4878 } 4879 4880 if (del_item) 4881 last_size = found_key.offset; 4882 else 4883 last_size = new_size; 4884 if (del_item) { 4885 if (!pending_del_nr) { 4886 /* no pending yet, add ourselves */ 4887 pending_del_slot = path->slots[0]; 4888 pending_del_nr = 1; 4889 } else if (pending_del_nr && 4890 path->slots[0] + 1 == pending_del_slot) { 4891 /* hop on the pending chunk */ 4892 pending_del_nr++; 4893 pending_del_slot = path->slots[0]; 4894 } else { 4895 BUG(); 4896 } 4897 } else { 4898 break; 4899 } 4900 should_throttle = false; 4901 4902 if (found_extent && 4903 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4904 struct btrfs_ref ref = { 0 }; 4905 4906 bytes_deleted += extent_num_bytes; 4907 4908 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, 4909 extent_start, extent_num_bytes, 0); 4910 ref.real_root = root->root_key.objectid; 4911 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 4912 ino, extent_offset); 4913 ret = btrfs_free_extent(trans, &ref); 4914 if (ret) { 4915 btrfs_abort_transaction(trans, ret); 4916 break; 4917 } 4918 if (be_nice) { 4919 if (btrfs_should_throttle_delayed_refs(trans)) 4920 should_throttle = true; 4921 } 4922 } 4923 4924 if (found_type == BTRFS_INODE_ITEM_KEY) 4925 break; 4926 4927 if (path->slots[0] == 0 || 4928 path->slots[0] != pending_del_slot || 4929 should_throttle) { 4930 if (pending_del_nr) { 4931 ret = btrfs_del_items(trans, root, path, 4932 pending_del_slot, 4933 pending_del_nr); 4934 if (ret) { 4935 btrfs_abort_transaction(trans, ret); 4936 break; 4937 } 4938 pending_del_nr = 0; 4939 } 4940 btrfs_release_path(path); 4941 4942 /* 4943 * We can generate a lot of delayed refs, so we need to 4944 * throttle every once and a while and make sure we're 4945 * adding enough space to keep up with the work we are 4946 * generating. Since we hold a transaction here we 4947 * can't flush, and we don't want to FLUSH_LIMIT because 4948 * we could have generated too many delayed refs to 4949 * actually allocate, so just bail if we're short and 4950 * let the normal reservation dance happen higher up. 
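 *
 * Bailing out surfaces as the -EAGAIN below; callers such as the eviction
 * loop in btrfs_evict_inode() react by starting a fresh transaction and
 * retrying the truncate.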
4951 */ 4952 if (should_throttle) { 4953 ret = btrfs_delayed_refs_rsv_refill(fs_info, 4954 BTRFS_RESERVE_NO_FLUSH); 4955 if (ret) { 4956 ret = -EAGAIN; 4957 break; 4958 } 4959 } 4960 goto search_again; 4961 } else { 4962 path->slots[0]--; 4963 } 4964 } 4965 out: 4966 if (ret >= 0 && pending_del_nr) { 4967 int err; 4968 4969 err = btrfs_del_items(trans, root, path, pending_del_slot, 4970 pending_del_nr); 4971 if (err) { 4972 btrfs_abort_transaction(trans, err); 4973 ret = err; 4974 } 4975 } 4976 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4977 ASSERT(last_size >= new_size); 4978 if (!ret && last_size > new_size) 4979 last_size = new_size; 4980 btrfs_inode_safe_disk_i_size_write(inode, last_size); 4981 unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, 4982 &cached_state); 4983 } 4984 4985 btrfs_free_path(path); 4986 return ret; 4987 } 4988 4989 /* 4990 * btrfs_truncate_block - read, zero a chunk and write a block 4991 * @inode - inode that we're zeroing 4992 * @from - the offset to start zeroing 4993 * @len - the length to zero, 0 to zero the entire range respective to the 4994 * offset 4995 * @front - zero up to the offset instead of from the offset on 4996 * 4997 * This will find the block for the "from" offset and cow the block and zero the 4998 * part we want to zero. This is used with truncate and hole punching. 4999 */ 5000 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, 5001 int front) 5002 { 5003 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5004 struct address_space *mapping = inode->vfs_inode.i_mapping; 5005 struct extent_io_tree *io_tree = &inode->io_tree; 5006 struct btrfs_ordered_extent *ordered; 5007 struct extent_state *cached_state = NULL; 5008 struct extent_changeset *data_reserved = NULL; 5009 bool only_release_metadata = false; 5010 u32 blocksize = fs_info->sectorsize; 5011 pgoff_t index = from >> PAGE_SHIFT; 5012 unsigned offset = from & (blocksize - 1); 5013 struct page *page; 5014 gfp_t mask = btrfs_alloc_write_mask(mapping); 5015 size_t write_bytes = blocksize; 5016 int ret = 0; 5017 u64 block_start; 5018 u64 block_end; 5019 5020 if (IS_ALIGNED(offset, blocksize) && 5021 (!len || IS_ALIGNED(len, blocksize))) 5022 goto out; 5023 5024 block_start = round_down(from, blocksize); 5025 block_end = block_start + blocksize - 1; 5026 5027 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, 5028 blocksize); 5029 if (ret < 0) { 5030 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { 5031 /* For nocow case, no need to reserve data space */ 5032 only_release_metadata = true; 5033 } else { 5034 goto out; 5035 } 5036 } 5037 ret = btrfs_delalloc_reserve_metadata(inode, blocksize); 5038 if (ret < 0) { 5039 if (!only_release_metadata) 5040 btrfs_free_reserved_data_space(inode, data_reserved, 5041 block_start, blocksize); 5042 goto out; 5043 } 5044 again: 5045 page = find_or_create_page(mapping, index, mask); 5046 if (!page) { 5047 btrfs_delalloc_release_space(inode, data_reserved, block_start, 5048 blocksize, true); 5049 btrfs_delalloc_release_extents(inode, blocksize); 5050 ret = -ENOMEM; 5051 goto out; 5052 } 5053 ret = set_page_extent_mapped(page); 5054 if (ret < 0) 5055 goto out_unlock; 5056 5057 if (!PageUptodate(page)) { 5058 ret = btrfs_readpage(NULL, page); 5059 lock_page(page); 5060 if (page->mapping != mapping) { 5061 unlock_page(page); 5062 put_page(page); 5063 goto again; 5064 } 5065 if (!PageUptodate(page)) { 5066 ret = -EIO; 5067 goto out_unlock; 5068 } 5069 } 5070 
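/*
 * The page is up to date and locked at this point; wait for any writeback
 * still in flight for it before locking the extent range and re-dirtying
 * the block below.
 */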
wait_on_page_writeback(page); 5071 5072 lock_extent_bits(io_tree, block_start, block_end, &cached_state); 5073 5074 ordered = btrfs_lookup_ordered_extent(inode, block_start); 5075 if (ordered) { 5076 unlock_extent_cached(io_tree, block_start, block_end, 5077 &cached_state); 5078 unlock_page(page); 5079 put_page(page); 5080 btrfs_start_ordered_extent(ordered, 1); 5081 btrfs_put_ordered_extent(ordered); 5082 goto again; 5083 } 5084 5085 clear_extent_bit(&inode->io_tree, block_start, block_end, 5086 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 5087 0, 0, &cached_state); 5088 5089 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 5090 &cached_state); 5091 if (ret) { 5092 unlock_extent_cached(io_tree, block_start, block_end, 5093 &cached_state); 5094 goto out_unlock; 5095 } 5096 5097 if (offset != blocksize) { 5098 if (!len) 5099 len = blocksize - offset; 5100 if (front) 5101 memzero_page(page, (block_start - page_offset(page)), 5102 offset); 5103 else 5104 memzero_page(page, (block_start - page_offset(page)) + offset, 5105 len); 5106 flush_dcache_page(page); 5107 } 5108 ClearPageChecked(page); 5109 btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); 5110 unlock_extent_cached(io_tree, block_start, block_end, &cached_state); 5111 5112 if (only_release_metadata) 5113 set_extent_bit(&inode->io_tree, block_start, block_end, 5114 EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); 5115 5116 out_unlock: 5117 if (ret) { 5118 if (only_release_metadata) 5119 btrfs_delalloc_release_metadata(inode, blocksize, true); 5120 else 5121 btrfs_delalloc_release_space(inode, data_reserved, 5122 block_start, blocksize, true); 5123 } 5124 btrfs_delalloc_release_extents(inode, blocksize); 5125 unlock_page(page); 5126 put_page(page); 5127 out: 5128 if (only_release_metadata) 5129 btrfs_check_nocow_unlock(inode); 5130 extent_changeset_free(data_reserved); 5131 return ret; 5132 } 5133 5134 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, 5135 u64 offset, u64 len) 5136 { 5137 struct btrfs_fs_info *fs_info = root->fs_info; 5138 struct btrfs_trans_handle *trans; 5139 struct btrfs_drop_extents_args drop_args = { 0 }; 5140 int ret; 5141 5142 /* 5143 * If NO_HOLES is enabled, we don't need to do anything. 5144 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() 5145 * or btrfs_update_inode() will be called, which guarantee that the next 5146 * fsync will know this inode was changed and needs to be logged. 5147 */ 5148 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 5149 return 0; 5150 5151 /* 5152 * 1 - for the one we're dropping 5153 * 1 - for the one we're adding 5154 * 1 - for updating the inode. 
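 *
 * These three units are what the btrfs_start_transaction(root, 3) call
 * just below reserves.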
5155 */ 5156 trans = btrfs_start_transaction(root, 3); 5157 if (IS_ERR(trans)) 5158 return PTR_ERR(trans); 5159 5160 drop_args.start = offset; 5161 drop_args.end = offset + len; 5162 drop_args.drop_cache = true; 5163 5164 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 5165 if (ret) { 5166 btrfs_abort_transaction(trans, ret); 5167 btrfs_end_transaction(trans); 5168 return ret; 5169 } 5170 5171 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), 5172 offset, 0, 0, len, 0, len, 0, 0, 0); 5173 if (ret) { 5174 btrfs_abort_transaction(trans, ret); 5175 } else { 5176 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); 5177 btrfs_update_inode(trans, root, inode); 5178 } 5179 btrfs_end_transaction(trans); 5180 return ret; 5181 } 5182 5183 /* 5184 * This function puts in dummy file extents for the area we're creating a hole 5185 * for. So if we are truncating this file to a larger size we need to insert 5186 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 5187 * the range between oldsize and size 5188 */ 5189 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) 5190 { 5191 struct btrfs_root *root = inode->root; 5192 struct btrfs_fs_info *fs_info = root->fs_info; 5193 struct extent_io_tree *io_tree = &inode->io_tree; 5194 struct extent_map *em = NULL; 5195 struct extent_state *cached_state = NULL; 5196 struct extent_map_tree *em_tree = &inode->extent_tree; 5197 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 5198 u64 block_end = ALIGN(size, fs_info->sectorsize); 5199 u64 last_byte; 5200 u64 cur_offset; 5201 u64 hole_size; 5202 int err = 0; 5203 5204 /* 5205 * If our size started in the middle of a block we need to zero out the 5206 * rest of the block before we expand the i_size, otherwise we could 5207 * expose stale data. 
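 *
 * For example, assuming a 4K sector size, growing a 1000 byte file to 16K
 * means zeroing bytes 1000-4095 of the old tail block here, and then
 * inserting hole extents for the 4096-16383 range in the loop below.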
5208 */ 5209 err = btrfs_truncate_block(inode, oldsize, 0, 0); 5210 if (err) 5211 return err; 5212 5213 if (size <= hole_start) 5214 return 0; 5215 5216 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, 5217 &cached_state); 5218 cur_offset = hole_start; 5219 while (1) { 5220 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 5221 block_end - cur_offset); 5222 if (IS_ERR(em)) { 5223 err = PTR_ERR(em); 5224 em = NULL; 5225 break; 5226 } 5227 last_byte = min(extent_map_end(em), block_end); 5228 last_byte = ALIGN(last_byte, fs_info->sectorsize); 5229 hole_size = last_byte - cur_offset; 5230 5231 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 5232 struct extent_map *hole_em; 5233 5234 err = maybe_insert_hole(root, inode, cur_offset, 5235 hole_size); 5236 if (err) 5237 break; 5238 5239 err = btrfs_inode_set_file_extent_range(inode, 5240 cur_offset, hole_size); 5241 if (err) 5242 break; 5243 5244 btrfs_drop_extent_cache(inode, cur_offset, 5245 cur_offset + hole_size - 1, 0); 5246 hole_em = alloc_extent_map(); 5247 if (!hole_em) { 5248 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5249 &inode->runtime_flags); 5250 goto next; 5251 } 5252 hole_em->start = cur_offset; 5253 hole_em->len = hole_size; 5254 hole_em->orig_start = cur_offset; 5255 5256 hole_em->block_start = EXTENT_MAP_HOLE; 5257 hole_em->block_len = 0; 5258 hole_em->orig_block_len = 0; 5259 hole_em->ram_bytes = hole_size; 5260 hole_em->compress_type = BTRFS_COMPRESS_NONE; 5261 hole_em->generation = fs_info->generation; 5262 5263 while (1) { 5264 write_lock(&em_tree->lock); 5265 err = add_extent_mapping(em_tree, hole_em, 1); 5266 write_unlock(&em_tree->lock); 5267 if (err != -EEXIST) 5268 break; 5269 btrfs_drop_extent_cache(inode, cur_offset, 5270 cur_offset + 5271 hole_size - 1, 0); 5272 } 5273 free_extent_map(hole_em); 5274 } else { 5275 err = btrfs_inode_set_file_extent_range(inode, 5276 cur_offset, hole_size); 5277 if (err) 5278 break; 5279 } 5280 next: 5281 free_extent_map(em); 5282 em = NULL; 5283 cur_offset = last_byte; 5284 if (cur_offset >= block_end) 5285 break; 5286 } 5287 free_extent_map(em); 5288 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); 5289 return err; 5290 } 5291 5292 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 5293 { 5294 struct btrfs_root *root = BTRFS_I(inode)->root; 5295 struct btrfs_trans_handle *trans; 5296 loff_t oldsize = i_size_read(inode); 5297 loff_t newsize = attr->ia_size; 5298 int mask = attr->ia_valid; 5299 int ret; 5300 5301 /* 5302 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 5303 * special case where we need to update the times despite not having 5304 * these flags set. For all other operations the VFS set these flags 5305 * explicitly if it wants a timestamp update. 5306 */ 5307 if (newsize != oldsize) { 5308 inode_inc_iversion(inode); 5309 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 5310 inode->i_ctime = inode->i_mtime = 5311 current_time(inode); 5312 } 5313 5314 if (newsize > oldsize) { 5315 /* 5316 * Don't do an expanding truncate while snapshotting is ongoing. 5317 * This is to ensure the snapshot captures a fully consistent 5318 * state of this file - if the snapshot captures this expanding 5319 * truncation, it must capture all writes that happened before 5320 * this truncation. 
5321 */ 5322 btrfs_drew_write_lock(&root->snapshot_lock); 5323 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); 5324 if (ret) { 5325 btrfs_drew_write_unlock(&root->snapshot_lock); 5326 return ret; 5327 } 5328 5329 trans = btrfs_start_transaction(root, 1); 5330 if (IS_ERR(trans)) { 5331 btrfs_drew_write_unlock(&root->snapshot_lock); 5332 return PTR_ERR(trans); 5333 } 5334 5335 i_size_write(inode, newsize); 5336 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 5337 pagecache_isize_extended(inode, oldsize, newsize); 5338 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5339 btrfs_drew_write_unlock(&root->snapshot_lock); 5340 btrfs_end_transaction(trans); 5341 } else { 5342 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5343 5344 if (btrfs_is_zoned(fs_info)) { 5345 ret = btrfs_wait_ordered_range(inode, 5346 ALIGN(newsize, fs_info->sectorsize), 5347 (u64)-1); 5348 if (ret) 5349 return ret; 5350 } 5351 5352 /* 5353 * We're truncating a file that used to have good data down to 5354 * zero. Make sure any new writes to the file get on disk 5355 * on close. 5356 */ 5357 if (newsize == 0) 5358 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5359 &BTRFS_I(inode)->runtime_flags); 5360 5361 truncate_setsize(inode, newsize); 5362 5363 inode_dio_wait(inode); 5364 5365 ret = btrfs_truncate(inode, newsize == oldsize); 5366 if (ret && inode->i_nlink) { 5367 int err; 5368 5369 /* 5370 * Truncate failed, so fix up the in-memory size. We 5371 * adjusted disk_i_size down as we removed extents, so 5372 * wait for disk_i_size to be stable and then update the 5373 * in-memory size to match. 5374 */ 5375 err = btrfs_wait_ordered_range(inode, 0, (u64)-1); 5376 if (err) 5377 return err; 5378 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5379 } 5380 } 5381 5382 return ret; 5383 } 5384 5385 static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 5386 struct iattr *attr) 5387 { 5388 struct inode *inode = d_inode(dentry); 5389 struct btrfs_root *root = BTRFS_I(inode)->root; 5390 int err; 5391 5392 if (btrfs_root_readonly(root)) 5393 return -EROFS; 5394 5395 err = setattr_prepare(mnt_userns, dentry, attr); 5396 if (err) 5397 return err; 5398 5399 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5400 err = btrfs_setsize(inode, attr); 5401 if (err) 5402 return err; 5403 } 5404 5405 if (attr->ia_valid) { 5406 setattr_copy(mnt_userns, inode, attr); 5407 inode_inc_iversion(inode); 5408 err = btrfs_dirty_inode(inode); 5409 5410 if (!err && attr->ia_valid & ATTR_MODE) 5411 err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); 5412 } 5413 5414 return err; 5415 } 5416 5417 /* 5418 * While truncating the inode pages during eviction, we get the VFS calling 5419 * btrfs_invalidatepage() against each page of the inode. This is slow because 5420 * the calls to btrfs_invalidatepage() result in a huge amount of calls to 5421 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting 5422 * extent_state structures over and over, wasting lots of time. 5423 * 5424 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all 5425 * those expensive operations on a per page basis and do only the ordered io 5426 * finishing, while we release here the extent_map and extent_state structures, 5427 * without the excessive merging and splitting. 
5428 */ 5429 static void evict_inode_truncate_pages(struct inode *inode) 5430 { 5431 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5432 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5433 struct rb_node *node; 5434 5435 ASSERT(inode->i_state & I_FREEING); 5436 truncate_inode_pages_final(&inode->i_data); 5437 5438 write_lock(&map_tree->lock); 5439 while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { 5440 struct extent_map *em; 5441 5442 node = rb_first_cached(&map_tree->map); 5443 em = rb_entry(node, struct extent_map, rb_node); 5444 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5445 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5446 remove_extent_mapping(map_tree, em); 5447 free_extent_map(em); 5448 if (need_resched()) { 5449 write_unlock(&map_tree->lock); 5450 cond_resched(); 5451 write_lock(&map_tree->lock); 5452 } 5453 } 5454 write_unlock(&map_tree->lock); 5455 5456 /* 5457 * Keep looping until we have no more ranges in the io tree. 5458 * We can have ongoing bios started by readahead that have 5459 * their endio callback (extent_io.c:end_bio_extent_readpage) 5460 * still in progress (unlocked the pages in the bio but did not yet 5461 * unlock the ranges in the io tree). This means some 5462 * ranges can still be locked and eviction started because before 5463 * submitting those bios, which are executed by a separate task (work 5464 * queue kthread), inode references (inode->i_count) were not taken 5465 * (which would be dropped in the end io callback of each bio). 5466 * Therefore here we effectively end up waiting for those bios and 5467 * anyone else holding locked ranges without having bumped the inode's 5468 * reference count - if we don't do it, when they access the inode's 5469 * io_tree to unlock a range it may be too late, leading to a 5470 * use-after-free issue. 5471 */ 5472 spin_lock(&io_tree->lock); 5473 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5474 struct extent_state *state; 5475 struct extent_state *cached_state = NULL; 5476 u64 start; 5477 u64 end; 5478 unsigned state_flags; 5479 5480 node = rb_first(&io_tree->state); 5481 state = rb_entry(node, struct extent_state, rb_node); 5482 start = state->start; 5483 end = state->end; 5484 state_flags = state->state; 5485 spin_unlock(&io_tree->lock); 5486 5487 lock_extent_bits(io_tree, start, end, &cached_state); 5488 5489 /* 5490 * If the extent still has the DELALLOC flag, it didn't reach disk, 5491 * and its reserved space won't be freed by delayed_ref. 5492 * So we need to free its reserved space here. 5493 * (Refer to comment in btrfs_invalidatepage, case 2) 5494 * 5495 * Note, end is the bytenr of last byte, so we need + 1 here. 5496 */ 5497 if (state_flags & EXTENT_DELALLOC) 5498 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, 5499 end - start + 1); 5500 5501 clear_extent_bit(io_tree, start, end, 5502 EXTENT_LOCKED | EXTENT_DELALLOC | 5503 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, 5504 &cached_state); 5505 5506 cond_resched(); 5507 spin_lock(&io_tree->lock); 5508 } 5509 spin_unlock(&io_tree->lock); 5510 } 5511 5512 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5513 struct btrfs_block_rsv *rsv) 5514 { 5515 struct btrfs_fs_info *fs_info = root->fs_info; 5516 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5517 struct btrfs_trans_handle *trans; 5518 u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); 5519 int ret; 5520 5521 /* 5522 * Eviction should be taking place somewhere safe because of our 5523 * delayed iputs.
However the normal flushing code will run delayed 5524 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. 5525 * 5526 * We reserve the delayed_refs_extra here again because we can't use 5527 * btrfs_start_transaction(root, 0) for the same deadlocky reason as 5528 * above. We reserve our extra bit here because we generate a ton of 5529 * delayed refs activity by truncating. 5530 * 5531 * If we cannot make our reservation we'll attempt to steal from the 5532 * global reserve, because we really want to be able to free up space. 5533 */ 5534 ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, 5535 BTRFS_RESERVE_FLUSH_EVICT); 5536 if (ret) { 5537 /* 5538 * Try to steal from the global reserve if there is space for 5539 * it. 5540 */ 5541 if (btrfs_check_space_for_delayed_refs(fs_info) || 5542 btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { 5543 btrfs_warn(fs_info, 5544 "could not allocate space for delete; will truncate on mount"); 5545 return ERR_PTR(-ENOSPC); 5546 } 5547 delayed_refs_extra = 0; 5548 } 5549 5550 trans = btrfs_join_transaction(root); 5551 if (IS_ERR(trans)) 5552 return trans; 5553 5554 if (delayed_refs_extra) { 5555 trans->block_rsv = &fs_info->trans_block_rsv; 5556 trans->bytes_reserved = delayed_refs_extra; 5557 btrfs_block_rsv_migrate(rsv, trans->block_rsv, 5558 delayed_refs_extra, 1); 5559 } 5560 return trans; 5561 } 5562 5563 void btrfs_evict_inode(struct inode *inode) 5564 { 5565 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5566 struct btrfs_trans_handle *trans; 5567 struct btrfs_root *root = BTRFS_I(inode)->root; 5568 struct btrfs_block_rsv *rsv; 5569 int ret; 5570 5571 trace_btrfs_inode_evict(inode); 5572 5573 if (!root) { 5574 fsverity_cleanup_inode(inode); 5575 clear_inode(inode); 5576 return; 5577 } 5578 5579 evict_inode_truncate_pages(inode); 5580 5581 if (inode->i_nlink && 5582 ((btrfs_root_refs(&root->root_item) != 0 && 5583 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5584 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5585 goto no_delete; 5586 5587 if (is_bad_inode(inode)) 5588 goto no_delete; 5589 5590 btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); 5591 5592 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5593 goto no_delete; 5594 5595 if (inode->i_nlink > 0) { 5596 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5597 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5598 goto no_delete; 5599 } 5600 5601 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5602 if (ret) 5603 goto no_delete; 5604 5605 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5606 if (!rsv) 5607 goto no_delete; 5608 rsv->size = btrfs_calc_metadata_size(fs_info, 1); 5609 rsv->failfast = 1; 5610 5611 btrfs_i_size_write(BTRFS_I(inode), 0); 5612 5613 while (1) { 5614 trans = evict_refill_and_join(root, rsv); 5615 if (IS_ERR(trans)) 5616 goto free_rsv; 5617 5618 trans->block_rsv = rsv; 5619 5620 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 5621 0, 0, NULL); 5622 trans->block_rsv = &fs_info->trans_block_rsv; 5623 btrfs_end_transaction(trans); 5624 btrfs_btree_balance_dirty(fs_info); 5625 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5626 goto free_rsv; 5627 else if (!ret) 5628 break; 5629 } 5630 5631 /* 5632 * Errors here aren't a big deal, it just means we leave orphan items in 5633 * the tree. They will be cleaned up on the next mount. If the inode 5634 * number gets reused, cleanup deletes the orphan item without doing 5635 * anything, and unlink reuses the existing orphan item. 
5636 * 5637 * If it turns out that we are dropping too many of these, we might want 5638 * to add a mechanism for retrying these after a commit. 5639 */ 5640 trans = evict_refill_and_join(root, rsv); 5641 if (!IS_ERR(trans)) { 5642 trans->block_rsv = rsv; 5643 btrfs_orphan_del(trans, BTRFS_I(inode)); 5644 trans->block_rsv = &fs_info->trans_block_rsv; 5645 btrfs_end_transaction(trans); 5646 } 5647 5648 free_rsv: 5649 btrfs_free_block_rsv(fs_info, rsv); 5650 no_delete: 5651 /* 5652 * If we didn't successfully delete, the orphan item will still be in 5653 * the tree and we'll retry on the next mount. Again, we might also want 5654 * to retry these periodically in the future. 5655 */ 5656 btrfs_remove_delayed_node(BTRFS_I(inode)); 5657 fsverity_cleanup_inode(inode); 5658 clear_inode(inode); 5659 } 5660 5661 /* 5662 * Return the key found in the dir entry in the location pointer, fill @type 5663 * with BTRFS_FT_*, and return 0. 5664 * 5665 * If no dir entries were found, returns -ENOENT. 5666 * If found a corrupted location in dir entry, returns -EUCLEAN. 5667 */ 5668 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5669 struct btrfs_key *location, u8 *type) 5670 { 5671 const char *name = dentry->d_name.name; 5672 int namelen = dentry->d_name.len; 5673 struct btrfs_dir_item *di; 5674 struct btrfs_path *path; 5675 struct btrfs_root *root = BTRFS_I(dir)->root; 5676 int ret = 0; 5677 5678 path = btrfs_alloc_path(); 5679 if (!path) 5680 return -ENOMEM; 5681 5682 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), 5683 name, namelen, 0); 5684 if (IS_ERR_OR_NULL(di)) { 5685 ret = di ? PTR_ERR(di) : -ENOENT; 5686 goto out; 5687 } 5688 5689 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5690 if (location->type != BTRFS_INODE_ITEM_KEY && 5691 location->type != BTRFS_ROOT_ITEM_KEY) { 5692 ret = -EUCLEAN; 5693 btrfs_warn(root->fs_info, 5694 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5695 __func__, name, btrfs_ino(BTRFS_I(dir)), 5696 location->objectid, location->type, location->offset); 5697 } 5698 if (!ret) 5699 *type = btrfs_dir_type(path->nodes[0], di); 5700 out: 5701 btrfs_free_path(path); 5702 return ret; 5703 } 5704 5705 /* 5706 * when we hit a tree root in a directory, the btrfs part of the inode 5707 * needs to be changed to reflect the root directory of the tree root. This 5708 * is kind of like crossing a mount point. 
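 *
 * Roughly, the search below is keyed on the parent root:
 *
 *	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
 *	key.type = BTRFS_ROOT_REF_KEY;
 *	key.offset = location->objectid;
 *
 * where location->objectid is the objectid of the subvolume root we are
 * crossing into. On success, location is rewritten to point at that root's
 * own root directory.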
5709 */ 5710 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5711 struct inode *dir, 5712 struct dentry *dentry, 5713 struct btrfs_key *location, 5714 struct btrfs_root **sub_root) 5715 { 5716 struct btrfs_path *path; 5717 struct btrfs_root *new_root; 5718 struct btrfs_root_ref *ref; 5719 struct extent_buffer *leaf; 5720 struct btrfs_key key; 5721 int ret; 5722 int err = 0; 5723 5724 path = btrfs_alloc_path(); 5725 if (!path) { 5726 err = -ENOMEM; 5727 goto out; 5728 } 5729 5730 err = -ENOENT; 5731 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5732 key.type = BTRFS_ROOT_REF_KEY; 5733 key.offset = location->objectid; 5734 5735 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5736 if (ret) { 5737 if (ret < 0) 5738 err = ret; 5739 goto out; 5740 } 5741 5742 leaf = path->nodes[0]; 5743 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5744 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || 5745 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5746 goto out; 5747 5748 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5749 (unsigned long)(ref + 1), 5750 dentry->d_name.len); 5751 if (ret) 5752 goto out; 5753 5754 btrfs_release_path(path); 5755 5756 new_root = btrfs_get_fs_root(fs_info, location->objectid, true); 5757 if (IS_ERR(new_root)) { 5758 err = PTR_ERR(new_root); 5759 goto out; 5760 } 5761 5762 *sub_root = new_root; 5763 location->objectid = btrfs_root_dirid(&new_root->root_item); 5764 location->type = BTRFS_INODE_ITEM_KEY; 5765 location->offset = 0; 5766 err = 0; 5767 out: 5768 btrfs_free_path(path); 5769 return err; 5770 } 5771 5772 static void inode_tree_add(struct inode *inode) 5773 { 5774 struct btrfs_root *root = BTRFS_I(inode)->root; 5775 struct btrfs_inode *entry; 5776 struct rb_node **p; 5777 struct rb_node *parent; 5778 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5779 u64 ino = btrfs_ino(BTRFS_I(inode)); 5780 5781 if (inode_unhashed(inode)) 5782 return; 5783 parent = NULL; 5784 spin_lock(&root->inode_lock); 5785 p = &root->inode_tree.rb_node; 5786 while (*p) { 5787 parent = *p; 5788 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5789 5790 if (ino < btrfs_ino(entry)) 5791 p = &parent->rb_left; 5792 else if (ino > btrfs_ino(entry)) 5793 p = &parent->rb_right; 5794 else { 5795 WARN_ON(!(entry->vfs_inode.i_state & 5796 (I_WILL_FREE | I_FREEING))); 5797 rb_replace_node(parent, new, &root->inode_tree); 5798 RB_CLEAR_NODE(parent); 5799 spin_unlock(&root->inode_lock); 5800 return; 5801 } 5802 } 5803 rb_link_node(new, parent, p); 5804 rb_insert_color(new, &root->inode_tree); 5805 spin_unlock(&root->inode_lock); 5806 } 5807 5808 static void inode_tree_del(struct btrfs_inode *inode) 5809 { 5810 struct btrfs_root *root = inode->root; 5811 int empty = 0; 5812 5813 spin_lock(&root->inode_lock); 5814 if (!RB_EMPTY_NODE(&inode->rb_node)) { 5815 rb_erase(&inode->rb_node, &root->inode_tree); 5816 RB_CLEAR_NODE(&inode->rb_node); 5817 empty = RB_EMPTY_ROOT(&root->inode_tree); 5818 } 5819 spin_unlock(&root->inode_lock); 5820 5821 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5822 spin_lock(&root->inode_lock); 5823 empty = RB_EMPTY_ROOT(&root->inode_tree); 5824 spin_unlock(&root->inode_lock); 5825 if (empty) 5826 btrfs_add_dead_root(root); 5827 } 5828 } 5829 5830 5831 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5832 { 5833 struct btrfs_iget_args *args = p; 5834 5835 inode->i_ino = args->ino; 5836 BTRFS_I(inode)->location.objectid = args->ino; 5837 
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 5838 BTRFS_I(inode)->location.offset = 0; 5839 BTRFS_I(inode)->root = btrfs_grab_root(args->root); 5840 BUG_ON(args->root && !BTRFS_I(inode)->root); 5841 return 0; 5842 } 5843 5844 static int btrfs_find_actor(struct inode *inode, void *opaque) 5845 { 5846 struct btrfs_iget_args *args = opaque; 5847 5848 return args->ino == BTRFS_I(inode)->location.objectid && 5849 args->root == BTRFS_I(inode)->root; 5850 } 5851 5852 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, 5853 struct btrfs_root *root) 5854 { 5855 struct inode *inode; 5856 struct btrfs_iget_args args; 5857 unsigned long hashval = btrfs_inode_hash(ino, root); 5858 5859 args.ino = ino; 5860 args.root = root; 5861 5862 inode = iget5_locked(s, hashval, btrfs_find_actor, 5863 btrfs_init_locked_inode, 5864 (void *)&args); 5865 return inode; 5866 } 5867 5868 /* 5869 * Get an inode object given its inode number and corresponding root. 5870 * Path can be preallocated to prevent recursing back to iget through 5871 * allocator. NULL is also valid but may require an additional allocation 5872 * later. 5873 */ 5874 struct inode *btrfs_iget_path(struct super_block *s, u64 ino, 5875 struct btrfs_root *root, struct btrfs_path *path) 5876 { 5877 struct inode *inode; 5878 5879 inode = btrfs_iget_locked(s, ino, root); 5880 if (!inode) 5881 return ERR_PTR(-ENOMEM); 5882 5883 if (inode->i_state & I_NEW) { 5884 int ret; 5885 5886 ret = btrfs_read_locked_inode(inode, path); 5887 if (!ret) { 5888 inode_tree_add(inode); 5889 unlock_new_inode(inode); 5890 } else { 5891 iget_failed(inode); 5892 /* 5893 * ret > 0 can come from btrfs_search_slot called by 5894 * btrfs_read_locked_inode, this means the inode item 5895 * was not found. 5896 */ 5897 if (ret > 0) 5898 ret = -ENOENT; 5899 inode = ERR_PTR(ret); 5900 } 5901 } 5902 5903 return inode; 5904 } 5905 5906 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) 5907 { 5908 return btrfs_iget_path(s, ino, root, NULL); 5909 } 5910 5911 static struct inode *new_simple_dir(struct super_block *s, 5912 struct btrfs_key *key, 5913 struct btrfs_root *root) 5914 { 5915 struct inode *inode = new_inode(s); 5916 5917 if (!inode) 5918 return ERR_PTR(-ENOMEM); 5919 5920 BTRFS_I(inode)->root = btrfs_grab_root(root); 5921 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 5922 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5923 5924 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 5925 /* 5926 * We only need lookup, the rest is read-only and there's no inode 5927 * associated with the dentry 5928 */ 5929 inode->i_op = &simple_dir_inode_operations; 5930 inode->i_opflags &= ~IOP_XATTR; 5931 inode->i_fop = &simple_dir_operations; 5932 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5933 inode->i_mtime = current_time(inode); 5934 inode->i_atime = inode->i_mtime; 5935 inode->i_ctime = inode->i_mtime; 5936 BTRFS_I(inode)->i_otime = inode->i_mtime; 5937 5938 return inode; 5939 } 5940 5941 static inline u8 btrfs_inode_type(struct inode *inode) 5942 { 5943 /* 5944 * Compile-time asserts that generic FT_* types still match 5945 * BTRFS_FT_* types 5946 */ 5947 BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); 5948 BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); 5949 BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); 5950 BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); 5951 BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); 5952 BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); 5953 BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); 5954 
BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); 5955 5956 return fs_umode_to_ftype(inode->i_mode); 5957 } 5958 5959 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5960 { 5961 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 5962 struct inode *inode; 5963 struct btrfs_root *root = BTRFS_I(dir)->root; 5964 struct btrfs_root *sub_root = root; 5965 struct btrfs_key location; 5966 u8 di_type = 0; 5967 int ret = 0; 5968 5969 if (dentry->d_name.len > BTRFS_NAME_LEN) 5970 return ERR_PTR(-ENAMETOOLONG); 5971 5972 ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); 5973 if (ret < 0) 5974 return ERR_PTR(ret); 5975 5976 if (location.type == BTRFS_INODE_ITEM_KEY) { 5977 inode = btrfs_iget(dir->i_sb, location.objectid, root); 5978 if (IS_ERR(inode)) 5979 return inode; 5980 5981 /* Do extra check against inode mode with di_type */ 5982 if (btrfs_inode_type(inode) != di_type) { 5983 btrfs_crit(fs_info, 5984 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", 5985 inode->i_mode, btrfs_inode_type(inode), 5986 di_type); 5987 iput(inode); 5988 return ERR_PTR(-EUCLEAN); 5989 } 5990 return inode; 5991 } 5992 5993 ret = fixup_tree_root_location(fs_info, dir, dentry, 5994 &location, &sub_root); 5995 if (ret < 0) { 5996 if (ret != -ENOENT) 5997 inode = ERR_PTR(ret); 5998 else 5999 inode = new_simple_dir(dir->i_sb, &location, sub_root); 6000 } else { 6001 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); 6002 } 6003 if (root != sub_root) 6004 btrfs_put_root(sub_root); 6005 6006 if (!IS_ERR(inode) && root != sub_root) { 6007 down_read(&fs_info->cleanup_work_sem); 6008 if (!sb_rdonly(inode->i_sb)) 6009 ret = btrfs_orphan_cleanup(sub_root); 6010 up_read(&fs_info->cleanup_work_sem); 6011 if (ret) { 6012 iput(inode); 6013 inode = ERR_PTR(ret); 6014 } 6015 } 6016 6017 return inode; 6018 } 6019 6020 static int btrfs_dentry_delete(const struct dentry *dentry) 6021 { 6022 struct btrfs_root *root; 6023 struct inode *inode = d_inode(dentry); 6024 6025 if (!inode && !IS_ROOT(dentry)) 6026 inode = d_inode(dentry->d_parent); 6027 6028 if (inode) { 6029 root = BTRFS_I(inode)->root; 6030 if (btrfs_root_refs(&root->root_item) == 0) 6031 return 1; 6032 6033 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 6034 return 1; 6035 } 6036 return 0; 6037 } 6038 6039 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 6040 unsigned int flags) 6041 { 6042 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 6043 6044 if (inode == ERR_PTR(-ENOENT)) 6045 inode = NULL; 6046 return d_splice_alias(inode, dentry); 6047 } 6048 6049 /* 6050 * All this infrastructure exists because dir_emit can fault, and we are holding 6051 * the tree lock when doing readdir. For now just allocate a buffer and copy 6052 * our information into that, and then dir_emit from the buffer. This is 6053 * similar to what NFS does, only we don't keep the buffer around in pagecache 6054 * because I'm afraid I'll mess that up. Long term we need to make filldir do 6055 * copy_to_user_inatomic so we don't have to worry about page faulting under the 6056 * tree lock. 
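 *
 * As a rough sketch of the layout (see struct dir_entry and btrfs_filldir()
 * below for the real thing), entries are packed into the PAGE_SIZE buffer
 * as a header immediately followed by the name bytes:
 *
 *	struct dir_entry *entry = addr;
 *	char *name = (char *)(entry + 1);
 *
 *	addr += sizeof(struct dir_entry) + entry->name_len;
 *
 * (the real code accesses the fields with get_unaligned()/put_unaligned()),
 * and the buffer is emitted via dir_emit() whenever the next entry would no
 * longer fit.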
6057 */ 6058 static int btrfs_opendir(struct inode *inode, struct file *file) 6059 { 6060 struct btrfs_file_private *private; 6061 6062 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 6063 if (!private) 6064 return -ENOMEM; 6065 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 6066 if (!private->filldir_buf) { 6067 kfree(private); 6068 return -ENOMEM; 6069 } 6070 file->private_data = private; 6071 return 0; 6072 } 6073 6074 struct dir_entry { 6075 u64 ino; 6076 u64 offset; 6077 unsigned type; 6078 int name_len; 6079 }; 6080 6081 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 6082 { 6083 while (entries--) { 6084 struct dir_entry *entry = addr; 6085 char *name = (char *)(entry + 1); 6086 6087 ctx->pos = get_unaligned(&entry->offset); 6088 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 6089 get_unaligned(&entry->ino), 6090 get_unaligned(&entry->type))) 6091 return 1; 6092 addr += sizeof(struct dir_entry) + 6093 get_unaligned(&entry->name_len); 6094 ctx->pos++; 6095 } 6096 return 0; 6097 } 6098 6099 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 6100 { 6101 struct inode *inode = file_inode(file); 6102 struct btrfs_root *root = BTRFS_I(inode)->root; 6103 struct btrfs_file_private *private = file->private_data; 6104 struct btrfs_dir_item *di; 6105 struct btrfs_key key; 6106 struct btrfs_key found_key; 6107 struct btrfs_path *path; 6108 void *addr; 6109 struct list_head ins_list; 6110 struct list_head del_list; 6111 int ret; 6112 struct extent_buffer *leaf; 6113 int slot; 6114 char *name_ptr; 6115 int name_len; 6116 int entries = 0; 6117 int total_len = 0; 6118 bool put = false; 6119 struct btrfs_key location; 6120 6121 if (!dir_emit_dots(file, ctx)) 6122 return 0; 6123 6124 path = btrfs_alloc_path(); 6125 if (!path) 6126 return -ENOMEM; 6127 6128 addr = private->filldir_buf; 6129 path->reada = READA_FORWARD; 6130 6131 INIT_LIST_HEAD(&ins_list); 6132 INIT_LIST_HEAD(&del_list); 6133 put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); 6134 6135 again: 6136 key.type = BTRFS_DIR_INDEX_KEY; 6137 key.offset = ctx->pos; 6138 key.objectid = btrfs_ino(BTRFS_I(inode)); 6139 6140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6141 if (ret < 0) 6142 goto err; 6143 6144 while (1) { 6145 struct dir_entry *entry; 6146 6147 leaf = path->nodes[0]; 6148 slot = path->slots[0]; 6149 if (slot >= btrfs_header_nritems(leaf)) { 6150 ret = btrfs_next_leaf(root, path); 6151 if (ret < 0) 6152 goto err; 6153 else if (ret > 0) 6154 break; 6155 continue; 6156 } 6157 6158 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6159 6160 if (found_key.objectid != key.objectid) 6161 break; 6162 if (found_key.type != BTRFS_DIR_INDEX_KEY) 6163 break; 6164 if (found_key.offset < ctx->pos) 6165 goto next; 6166 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 6167 goto next; 6168 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 6169 name_len = btrfs_dir_name_len(leaf, di); 6170 if ((total_len + sizeof(struct dir_entry) + name_len) >= 6171 PAGE_SIZE) { 6172 btrfs_release_path(path); 6173 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 6174 if (ret) 6175 goto nopos; 6176 addr = private->filldir_buf; 6177 entries = 0; 6178 total_len = 0; 6179 goto again; 6180 } 6181 6182 entry = addr; 6183 put_unaligned(name_len, &entry->name_len); 6184 name_ptr = (char *)(entry + 1); 6185 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), 6186 name_len); 6187 
put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), 6188 &entry->type); 6189 btrfs_dir_item_key_to_cpu(leaf, di, &location); 6190 put_unaligned(location.objectid, &entry->ino); 6191 put_unaligned(found_key.offset, &entry->offset); 6192 entries++; 6193 addr += sizeof(struct dir_entry) + name_len; 6194 total_len += sizeof(struct dir_entry) + name_len; 6195 next: 6196 path->slots[0]++; 6197 } 6198 btrfs_release_path(path); 6199 6200 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 6201 if (ret) 6202 goto nopos; 6203 6204 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 6205 if (ret) 6206 goto nopos; 6207 6208 /* 6209 * Stop new entries from being returned after we return the last 6210 * entry. 6211 * 6212 * New directory entries are assigned a strictly increasing 6213 * offset. This means that new entries created during readdir 6214 * are *guaranteed* to be seen in the future by that readdir. 6215 * This has broken buggy programs which operate on names as 6216 * they're returned by readdir. Until we re-use freed offsets 6217 * we have this hack to stop new entries from being returned 6218 * under the assumption that they'll never reach this huge 6219 * offset. 6220 * 6221 * This is being careful not to overflow 32bit loff_t unless the 6222 * last entry requires it because doing so has broken 32bit apps 6223 * in the past. 6224 */ 6225 if (ctx->pos >= INT_MAX) 6226 ctx->pos = LLONG_MAX; 6227 else 6228 ctx->pos = INT_MAX; 6229 nopos: 6230 ret = 0; 6231 err: 6232 if (put) 6233 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); 6234 btrfs_free_path(path); 6235 return ret; 6236 } 6237 6238 /* 6239 * This is somewhat expensive, updating the tree every time the 6240 * inode changes. But, it is most likely to find the inode in cache. 6241 * FIXME, needs more benchmarking...there are no reasons other than performance 6242 * to keep or drop this code. 6243 */ 6244 static int btrfs_dirty_inode(struct inode *inode) 6245 { 6246 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6247 struct btrfs_root *root = BTRFS_I(inode)->root; 6248 struct btrfs_trans_handle *trans; 6249 int ret; 6250 6251 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 6252 return 0; 6253 6254 trans = btrfs_join_transaction(root); 6255 if (IS_ERR(trans)) 6256 return PTR_ERR(trans); 6257 6258 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6259 if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { 6260 /* whoops, lets try again with the full transaction */ 6261 btrfs_end_transaction(trans); 6262 trans = btrfs_start_transaction(root, 1); 6263 if (IS_ERR(trans)) 6264 return PTR_ERR(trans); 6265 6266 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6267 } 6268 btrfs_end_transaction(trans); 6269 if (BTRFS_I(inode)->delayed_node) 6270 btrfs_balance_delayed_items(fs_info); 6271 6272 return ret; 6273 } 6274 6275 /* 6276 * This is a copy of file_update_time. We need this so we can return error on 6277 * ENOSPC for updating the inode in the case of file write and mmap writes. 
6278 */ 6279 static int btrfs_update_time(struct inode *inode, struct timespec64 *now, 6280 int flags) 6281 { 6282 struct btrfs_root *root = BTRFS_I(inode)->root; 6283 bool dirty = flags & ~S_VERSION; 6284 6285 if (btrfs_root_readonly(root)) 6286 return -EROFS; 6287 6288 if (flags & S_VERSION) 6289 dirty |= inode_maybe_inc_iversion(inode, dirty); 6290 if (flags & S_CTIME) 6291 inode->i_ctime = *now; 6292 if (flags & S_MTIME) 6293 inode->i_mtime = *now; 6294 if (flags & S_ATIME) 6295 inode->i_atime = *now; 6296 return dirty ? btrfs_dirty_inode(inode) : 0; 6297 } 6298 6299 /* 6300 * find the highest existing sequence number in a directory 6301 * and then set the in-memory index_cnt variable to reflect 6302 * free sequence numbers 6303 */ 6304 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 6305 { 6306 struct btrfs_root *root = inode->root; 6307 struct btrfs_key key, found_key; 6308 struct btrfs_path *path; 6309 struct extent_buffer *leaf; 6310 int ret; 6311 6312 key.objectid = btrfs_ino(inode); 6313 key.type = BTRFS_DIR_INDEX_KEY; 6314 key.offset = (u64)-1; 6315 6316 path = btrfs_alloc_path(); 6317 if (!path) 6318 return -ENOMEM; 6319 6320 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6321 if (ret < 0) 6322 goto out; 6323 /* FIXME: we should be able to handle this */ 6324 if (ret == 0) 6325 goto out; 6326 ret = 0; 6327 6328 /* 6329 * MAGIC NUMBER EXPLANATION: 6330 * since we search a directory based on f_pos we have to start at 2 6331 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 6332 * else has to start at 2 6333 */ 6334 if (path->slots[0] == 0) { 6335 inode->index_cnt = 2; 6336 goto out; 6337 } 6338 6339 path->slots[0]--; 6340 6341 leaf = path->nodes[0]; 6342 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6343 6344 if (found_key.objectid != btrfs_ino(inode) || 6345 found_key.type != BTRFS_DIR_INDEX_KEY) { 6346 inode->index_cnt = 2; 6347 goto out; 6348 } 6349 6350 inode->index_cnt = found_key.offset + 1; 6351 out: 6352 btrfs_free_path(path); 6353 return ret; 6354 } 6355 6356 /* 6357 * helper to find a free sequence number in a given directory. This current 6358 * code is very simple, later versions will do smarter things in the btree 6359 */ 6360 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6361 { 6362 int ret = 0; 6363 6364 if (dir->index_cnt == (u64)-1) { 6365 ret = btrfs_inode_delayed_dir_index_count(dir); 6366 if (ret) { 6367 ret = btrfs_set_inode_index_count(dir); 6368 if (ret) 6369 return ret; 6370 } 6371 } 6372 6373 *index = dir->index_cnt; 6374 dir->index_cnt++; 6375 6376 return ret; 6377 } 6378 6379 static int btrfs_insert_inode_locked(struct inode *inode) 6380 { 6381 struct btrfs_iget_args args; 6382 6383 args.ino = BTRFS_I(inode)->location.objectid; 6384 args.root = BTRFS_I(inode)->root; 6385 6386 return insert_inode_locked4(inode, 6387 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6388 btrfs_find_actor, &args); 6389 } 6390 6391 /* 6392 * Inherit flags from the parent inode. 6393 * 6394 * Currently only the compression flags and the cow flags are inherited. 
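 *
 * For example, creating a file inside a directory that has NODATACOW set
 * (chattr +C) gives the new inode BTRFS_INODE_NODATACOW and, if it is a
 * regular file, BTRFS_INODE_NODATASUM as well, since checksums are not
 * kept for nocow data.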
6395 */ 6396 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 6397 { 6398 unsigned int flags; 6399 6400 if (!dir) 6401 return; 6402 6403 flags = BTRFS_I(dir)->flags; 6404 6405 if (flags & BTRFS_INODE_NOCOMPRESS) { 6406 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 6407 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 6408 } else if (flags & BTRFS_INODE_COMPRESS) { 6409 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; 6410 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 6411 } 6412 6413 if (flags & BTRFS_INODE_NODATACOW) { 6414 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 6415 if (S_ISREG(inode->i_mode)) 6416 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6417 } 6418 6419 btrfs_sync_inode_flags_to_i_flags(inode); 6420 } 6421 6422 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6423 struct btrfs_root *root, 6424 struct user_namespace *mnt_userns, 6425 struct inode *dir, 6426 const char *name, int name_len, 6427 u64 ref_objectid, u64 objectid, 6428 umode_t mode, u64 *index) 6429 { 6430 struct btrfs_fs_info *fs_info = root->fs_info; 6431 struct inode *inode; 6432 struct btrfs_inode_item *inode_item; 6433 struct btrfs_key *location; 6434 struct btrfs_path *path; 6435 struct btrfs_inode_ref *ref; 6436 struct btrfs_key key[2]; 6437 u32 sizes[2]; 6438 int nitems = name ? 2 : 1; 6439 unsigned long ptr; 6440 unsigned int nofs_flag; 6441 int ret; 6442 6443 path = btrfs_alloc_path(); 6444 if (!path) 6445 return ERR_PTR(-ENOMEM); 6446 6447 nofs_flag = memalloc_nofs_save(); 6448 inode = new_inode(fs_info->sb); 6449 memalloc_nofs_restore(nofs_flag); 6450 if (!inode) { 6451 btrfs_free_path(path); 6452 return ERR_PTR(-ENOMEM); 6453 } 6454 6455 /* 6456 * O_TMPFILE, set link count to 0, so that after this point, 6457 * we fill in an inode item with the correct link count. 6458 */ 6459 if (!name) 6460 set_nlink(inode, 0); 6461 6462 /* 6463 * we have to initialize this early, so we can reclaim the inode 6464 * number if we fail afterwards in this function. 6465 */ 6466 inode->i_ino = objectid; 6467 6468 if (dir && name) { 6469 trace_btrfs_inode_request(dir); 6470 6471 ret = btrfs_set_inode_index(BTRFS_I(dir), index); 6472 if (ret) { 6473 btrfs_free_path(path); 6474 iput(inode); 6475 return ERR_PTR(ret); 6476 } 6477 } else if (dir) { 6478 *index = 0; 6479 } 6480 /* 6481 * index_cnt is ignored for everything but a dir, 6482 * btrfs_set_inode_index_count has an explanation for the magic 6483 * number 6484 */ 6485 BTRFS_I(inode)->index_cnt = 2; 6486 BTRFS_I(inode)->dir_index = *index; 6487 BTRFS_I(inode)->root = btrfs_grab_root(root); 6488 BTRFS_I(inode)->generation = trans->transid; 6489 inode->i_generation = BTRFS_I(inode)->generation; 6490 6491 /* 6492 * We could have gotten an inode number from somebody who was fsynced 6493 * and then removed in this same transaction, so let's just set full 6494 * sync since it will be a full sync anyway and this will blow away the 6495 * old info in the log. 6496 */ 6497 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6498 6499 key[0].objectid = objectid; 6500 key[0].type = BTRFS_INODE_ITEM_KEY; 6501 key[0].offset = 0; 6502 6503 sizes[0] = sizeof(struct btrfs_inode_item); 6504 6505 if (name) { 6506 /* 6507 * Start new inodes with an inode_ref. This is slightly more 6508 * efficient for small numbers of hard links since they will 6509 * be packed into one item. Extended refs will kick in if we 6510 * add more hard links than can fit in the ref item. 
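 *
 * Concretely, the ref item set up just below is keyed as
 * (objectid, BTRFS_INODE_REF_KEY, ref_objectid) and sized
 * name_len + sizeof(struct btrfs_inode_ref), so further names that share
 * the same parent directory can be appended to the same item later.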
6511 */ 6512 key[1].objectid = objectid; 6513 key[1].type = BTRFS_INODE_REF_KEY; 6514 key[1].offset = ref_objectid; 6515 6516 sizes[1] = name_len + sizeof(*ref); 6517 } 6518 6519 location = &BTRFS_I(inode)->location; 6520 location->objectid = objectid; 6521 location->offset = 0; 6522 location->type = BTRFS_INODE_ITEM_KEY; 6523 6524 ret = btrfs_insert_inode_locked(inode); 6525 if (ret < 0) { 6526 iput(inode); 6527 goto fail; 6528 } 6529 6530 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); 6531 if (ret != 0) 6532 goto fail_unlock; 6533 6534 inode_init_owner(mnt_userns, inode, dir, mode); 6535 inode_set_bytes(inode, 0); 6536 6537 inode->i_mtime = current_time(inode); 6538 inode->i_atime = inode->i_mtime; 6539 inode->i_ctime = inode->i_mtime; 6540 BTRFS_I(inode)->i_otime = inode->i_mtime; 6541 6542 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6543 struct btrfs_inode_item); 6544 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, 6545 sizeof(*inode_item)); 6546 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6547 6548 if (name) { 6549 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6550 struct btrfs_inode_ref); 6551 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6552 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 6553 ptr = (unsigned long)(ref + 1); 6554 write_extent_buffer(path->nodes[0], name, ptr, name_len); 6555 } 6556 6557 btrfs_mark_buffer_dirty(path->nodes[0]); 6558 btrfs_free_path(path); 6559 6560 btrfs_inherit_iflags(inode, dir); 6561 6562 if (S_ISREG(mode)) { 6563 if (btrfs_test_opt(fs_info, NODATASUM)) 6564 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6565 if (btrfs_test_opt(fs_info, NODATACOW)) 6566 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6567 BTRFS_INODE_NODATASUM; 6568 } 6569 6570 inode_tree_add(inode); 6571 6572 trace_btrfs_inode_new(inode); 6573 btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); 6574 6575 btrfs_update_root_times(trans, root); 6576 6577 ret = btrfs_inode_inherit_props(trans, inode, dir); 6578 if (ret) 6579 btrfs_err(fs_info, 6580 "error inheriting props for ino %llu (root %llu): %d", 6581 btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); 6582 6583 return inode; 6584 6585 fail_unlock: 6586 discard_new_inode(inode); 6587 fail: 6588 if (dir && name) 6589 BTRFS_I(dir)->index_cnt--; 6590 btrfs_free_path(path); 6591 return ERR_PTR(ret); 6592 } 6593 6594 /* 6595 * utility function to add 'inode' into 'parent_inode' with 6596 * a give name and a given sequence number. 6597 * if 'add_backref' is true, also insert a backref from the 6598 * inode to the parent directory. 
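 *
 * Note that the parent directory's i_size is bumped by name_len * 2 further
 * down, presumably because the name is stored twice in the parent: once in
 * the DIR_ITEM and once in the DIR_INDEX entry.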
6599 */ 6600 int btrfs_add_link(struct btrfs_trans_handle *trans, 6601 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6602 const char *name, int name_len, int add_backref, u64 index) 6603 { 6604 int ret = 0; 6605 struct btrfs_key key; 6606 struct btrfs_root *root = parent_inode->root; 6607 u64 ino = btrfs_ino(inode); 6608 u64 parent_ino = btrfs_ino(parent_inode); 6609 6610 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6611 memcpy(&key, &inode->root->root_key, sizeof(key)); 6612 } else { 6613 key.objectid = ino; 6614 key.type = BTRFS_INODE_ITEM_KEY; 6615 key.offset = 0; 6616 } 6617 6618 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6619 ret = btrfs_add_root_ref(trans, key.objectid, 6620 root->root_key.objectid, parent_ino, 6621 index, name, name_len); 6622 } else if (add_backref) { 6623 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 6624 parent_ino, index); 6625 } 6626 6627 /* Nothing to clean up yet */ 6628 if (ret) 6629 return ret; 6630 6631 ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, 6632 btrfs_inode_type(&inode->vfs_inode), index); 6633 if (ret == -EEXIST || ret == -EOVERFLOW) 6634 goto fail_dir_item; 6635 else if (ret) { 6636 btrfs_abort_transaction(trans, ret); 6637 return ret; 6638 } 6639 6640 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6641 name_len * 2); 6642 inode_inc_iversion(&parent_inode->vfs_inode); 6643 /* 6644 * If we are replaying a log tree, we do not want to update the mtime 6645 * and ctime of the parent directory with the current time, since the 6646 * log replay procedure is responsible for setting them to their correct 6647 * values (the ones it had when the fsync was done). 6648 */ 6649 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { 6650 struct timespec64 now = current_time(&parent_inode->vfs_inode); 6651 6652 parent_inode->vfs_inode.i_mtime = now; 6653 parent_inode->vfs_inode.i_ctime = now; 6654 } 6655 ret = btrfs_update_inode(trans, root, parent_inode); 6656 if (ret) 6657 btrfs_abort_transaction(trans, ret); 6658 return ret; 6659 6660 fail_dir_item: 6661 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6662 u64 local_index; 6663 int err; 6664 err = btrfs_del_root_ref(trans, key.objectid, 6665 root->root_key.objectid, parent_ino, 6666 &local_index, name, name_len); 6667 if (err) 6668 btrfs_abort_transaction(trans, err); 6669 } else if (add_backref) { 6670 u64 local_index; 6671 int err; 6672 6673 err = btrfs_del_inode_ref(trans, root, name, name_len, 6674 ino, parent_ino, &local_index); 6675 if (err) 6676 btrfs_abort_transaction(trans, err); 6677 } 6678 6679 /* Return the original error code */ 6680 return ret; 6681 } 6682 6683 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6684 struct btrfs_inode *dir, struct dentry *dentry, 6685 struct btrfs_inode *inode, int backref, u64 index) 6686 { 6687 int err = btrfs_add_link(trans, dir, inode, 6688 dentry->d_name.name, dentry->d_name.len, 6689 backref, index); 6690 if (err > 0) 6691 err = -EEXIST; 6692 return err; 6693 } 6694 6695 static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, 6696 struct dentry *dentry, umode_t mode, dev_t rdev) 6697 { 6698 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6699 struct btrfs_trans_handle *trans; 6700 struct btrfs_root *root = BTRFS_I(dir)->root; 6701 struct inode *inode = NULL; 6702 int err; 6703 u64 objectid; 6704 u64 index = 0; 6705 6706 /* 6707 * 2 for inode item and ref 6708 * 2 for dir items 6709 * 1 for xattr if selinux is on 6710 */ 
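/*
 * The 2 + 2 + 1 items above are what the "5" passed to
 * btrfs_start_transaction() below accounts for; btrfs_create() and
 * btrfs_mkdir() further down size their transactions the same way.
 */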
6711 trans = btrfs_start_transaction(root, 5); 6712 if (IS_ERR(trans)) 6713 return PTR_ERR(trans); 6714 6715 err = btrfs_get_free_objectid(root, &objectid); 6716 if (err) 6717 goto out_unlock; 6718 6719 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6720 dentry->d_name.name, dentry->d_name.len, 6721 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6722 if (IS_ERR(inode)) { 6723 err = PTR_ERR(inode); 6724 inode = NULL; 6725 goto out_unlock; 6726 } 6727 6728 /* 6729 * If the active LSM wants to access the inode during 6730 * d_instantiate it needs these. Smack checks to see 6731 * if the filesystem supports xattrs by looking at the 6732 * ops vector. 6733 */ 6734 inode->i_op = &btrfs_special_inode_operations; 6735 init_special_inode(inode, inode->i_mode, rdev); 6736 6737 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6738 if (err) 6739 goto out_unlock; 6740 6741 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6742 0, index); 6743 if (err) 6744 goto out_unlock; 6745 6746 btrfs_update_inode(trans, root, BTRFS_I(inode)); 6747 d_instantiate_new(dentry, inode); 6748 6749 out_unlock: 6750 btrfs_end_transaction(trans); 6751 btrfs_btree_balance_dirty(fs_info); 6752 if (err && inode) { 6753 inode_dec_link_count(inode); 6754 discard_new_inode(inode); 6755 } 6756 return err; 6757 } 6758 6759 static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, 6760 struct dentry *dentry, umode_t mode, bool excl) 6761 { 6762 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6763 struct btrfs_trans_handle *trans; 6764 struct btrfs_root *root = BTRFS_I(dir)->root; 6765 struct inode *inode = NULL; 6766 int err; 6767 u64 objectid; 6768 u64 index = 0; 6769 6770 /* 6771 * 2 for inode item and ref 6772 * 2 for dir items 6773 * 1 for xattr if selinux is on 6774 */ 6775 trans = btrfs_start_transaction(root, 5); 6776 if (IS_ERR(trans)) 6777 return PTR_ERR(trans); 6778 6779 err = btrfs_get_free_objectid(root, &objectid); 6780 if (err) 6781 goto out_unlock; 6782 6783 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6784 dentry->d_name.name, dentry->d_name.len, 6785 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6786 if (IS_ERR(inode)) { 6787 err = PTR_ERR(inode); 6788 inode = NULL; 6789 goto out_unlock; 6790 } 6791 /* 6792 * If the active LSM wants to access the inode during 6793 * d_instantiate it needs these. Smack checks to see 6794 * if the filesystem supports xattrs by looking at the 6795 * ops vector. 
6796 */ 6797 inode->i_fop = &btrfs_file_operations; 6798 inode->i_op = &btrfs_file_inode_operations; 6799 inode->i_mapping->a_ops = &btrfs_aops; 6800 6801 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6802 if (err) 6803 goto out_unlock; 6804 6805 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6806 if (err) 6807 goto out_unlock; 6808 6809 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6810 0, index); 6811 if (err) 6812 goto out_unlock; 6813 6814 d_instantiate_new(dentry, inode); 6815 6816 out_unlock: 6817 btrfs_end_transaction(trans); 6818 if (err && inode) { 6819 inode_dec_link_count(inode); 6820 discard_new_inode(inode); 6821 } 6822 btrfs_btree_balance_dirty(fs_info); 6823 return err; 6824 } 6825 6826 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6827 struct dentry *dentry) 6828 { 6829 struct btrfs_trans_handle *trans = NULL; 6830 struct btrfs_root *root = BTRFS_I(dir)->root; 6831 struct inode *inode = d_inode(old_dentry); 6832 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6833 u64 index; 6834 int err; 6835 int drop_inode = 0; 6836 6837 /* do not allow sys_link's with other subvols of the same device */ 6838 if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) 6839 return -EXDEV; 6840 6841 if (inode->i_nlink >= BTRFS_LINK_MAX) 6842 return -EMLINK; 6843 6844 err = btrfs_set_inode_index(BTRFS_I(dir), &index); 6845 if (err) 6846 goto fail; 6847 6848 /* 6849 * 2 items for inode and inode ref 6850 * 2 items for dir items 6851 * 1 item for parent inode 6852 * 1 item for orphan item deletion if O_TMPFILE 6853 */ 6854 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6855 if (IS_ERR(trans)) { 6856 err = PTR_ERR(trans); 6857 trans = NULL; 6858 goto fail; 6859 } 6860 6861 /* There are several dir indexes for this inode, clear the cache. */ 6862 BTRFS_I(inode)->dir_index = 0ULL; 6863 inc_nlink(inode); 6864 inode_inc_iversion(inode); 6865 inode->i_ctime = current_time(inode); 6866 ihold(inode); 6867 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6868 6869 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6870 1, index); 6871 6872 if (err) { 6873 drop_inode = 1; 6874 } else { 6875 struct dentry *parent = dentry->d_parent; 6876 6877 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6878 if (err) 6879 goto fail; 6880 if (inode->i_nlink == 1) { 6881 /* 6882 * If new hard link count is 1, it's a file created 6883 * with open(2) O_TMPFILE flag. 
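 *
 * Such an inode was added to the orphan list at creation time (it started
 * with a link count of 0), so now that it is reachable again the orphan
 * item must be removed, which is what btrfs_orphan_del() below does.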
6884 */ 6885 err = btrfs_orphan_del(trans, BTRFS_I(inode)); 6886 if (err) 6887 goto fail; 6888 } 6889 d_instantiate(dentry, inode); 6890 btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); 6891 } 6892 6893 fail: 6894 if (trans) 6895 btrfs_end_transaction(trans); 6896 if (drop_inode) { 6897 inode_dec_link_count(inode); 6898 iput(inode); 6899 } 6900 btrfs_btree_balance_dirty(fs_info); 6901 return err; 6902 } 6903 6904 static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, 6905 struct dentry *dentry, umode_t mode) 6906 { 6907 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6908 struct inode *inode = NULL; 6909 struct btrfs_trans_handle *trans; 6910 struct btrfs_root *root = BTRFS_I(dir)->root; 6911 int err = 0; 6912 u64 objectid = 0; 6913 u64 index = 0; 6914 6915 /* 6916 * 2 items for inode and ref 6917 * 2 items for dir items 6918 * 1 for xattr if selinux is on 6919 */ 6920 trans = btrfs_start_transaction(root, 5); 6921 if (IS_ERR(trans)) 6922 return PTR_ERR(trans); 6923 6924 err = btrfs_get_free_objectid(root, &objectid); 6925 if (err) 6926 goto out_fail; 6927 6928 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6929 dentry->d_name.name, dentry->d_name.len, 6930 btrfs_ino(BTRFS_I(dir)), objectid, 6931 S_IFDIR | mode, &index); 6932 if (IS_ERR(inode)) { 6933 err = PTR_ERR(inode); 6934 inode = NULL; 6935 goto out_fail; 6936 } 6937 6938 /* these must be set before we unlock the inode */ 6939 inode->i_op = &btrfs_dir_inode_operations; 6940 inode->i_fop = &btrfs_dir_file_operations; 6941 6942 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6943 if (err) 6944 goto out_fail; 6945 6946 btrfs_i_size_write(BTRFS_I(inode), 0); 6947 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6948 if (err) 6949 goto out_fail; 6950 6951 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6952 dentry->d_name.name, 6953 dentry->d_name.len, 0, index); 6954 if (err) 6955 goto out_fail; 6956 6957 d_instantiate_new(dentry, inode); 6958 6959 out_fail: 6960 btrfs_end_transaction(trans); 6961 if (err && inode) { 6962 inode_dec_link_count(inode); 6963 discard_new_inode(inode); 6964 } 6965 btrfs_btree_balance_dirty(fs_info); 6966 return err; 6967 } 6968 6969 static noinline int uncompress_inline(struct btrfs_path *path, 6970 struct page *page, 6971 size_t pg_offset, u64 extent_offset, 6972 struct btrfs_file_extent_item *item) 6973 { 6974 int ret; 6975 struct extent_buffer *leaf = path->nodes[0]; 6976 char *tmp; 6977 size_t max_size; 6978 unsigned long inline_size; 6979 unsigned long ptr; 6980 int compress_type; 6981 6982 WARN_ON(pg_offset != 0); 6983 compress_type = btrfs_file_extent_compression(leaf, item); 6984 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6985 inline_size = btrfs_file_extent_inline_item_len(leaf, 6986 btrfs_item_nr(path->slots[0])); 6987 tmp = kmalloc(inline_size, GFP_NOFS); 6988 if (!tmp) 6989 return -ENOMEM; 6990 ptr = btrfs_file_extent_inline_start(item); 6991 6992 read_extent_buffer(leaf, tmp, ptr, inline_size); 6993 6994 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6995 ret = btrfs_decompress(compress_type, tmp, page, 6996 extent_offset, inline_size, max_size); 6997 6998 /* 6999 * decompression code contains a memset to fill in any space between the end 7000 * of the uncompressed data and the end of max_size in case the decompressed 7001 * data ends up shorter than ram_bytes. That doesn't cover the hole between 7002 * the end of an inline extent and the beginning of the next block, so we 7003 * cover that region here. 
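 *
 * For example, with pg_offset == 0 and a 1000 byte inline extent on a 4K
 * page, max_size is clamped to 1000 and the memzero_page() call below
 * clears bytes 1000..4095 of the page.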
7004 */ 7005 7006 if (max_size + pg_offset < PAGE_SIZE) 7007 memzero_page(page, pg_offset + max_size, 7008 PAGE_SIZE - max_size - pg_offset); 7009 kfree(tmp); 7010 return ret; 7011 } 7012 7013 /** 7014 * btrfs_get_extent - Lookup the first extent overlapping a range in a file. 7015 * @inode: file to search in 7016 * @page: page to read extent data into if the extent is inline 7017 * @pg_offset: offset into @page to copy to 7018 * @start: file offset 7019 * @len: length of range starting at @start 7020 * 7021 * This returns the first &struct extent_map which overlaps with the given 7022 * range, reading it from the B-tree and caching it if necessary. Note that 7023 * there may be more extents which overlap the given range after the returned 7024 * extent_map. 7025 * 7026 * If @page is not NULL and the extent is inline, this also reads the extent 7027 * data directly into the page and marks the extent up to date in the io_tree. 7028 * 7029 * Return: ERR_PTR on error, non-NULL extent_map on success. 7030 */ 7031 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 7032 struct page *page, size_t pg_offset, 7033 u64 start, u64 len) 7034 { 7035 struct btrfs_fs_info *fs_info = inode->root->fs_info; 7036 int ret = 0; 7037 u64 extent_start = 0; 7038 u64 extent_end = 0; 7039 u64 objectid = btrfs_ino(inode); 7040 int extent_type = -1; 7041 struct btrfs_path *path = NULL; 7042 struct btrfs_root *root = inode->root; 7043 struct btrfs_file_extent_item *item; 7044 struct extent_buffer *leaf; 7045 struct btrfs_key found_key; 7046 struct extent_map *em = NULL; 7047 struct extent_map_tree *em_tree = &inode->extent_tree; 7048 struct extent_io_tree *io_tree = &inode->io_tree; 7049 7050 read_lock(&em_tree->lock); 7051 em = lookup_extent_mapping(em_tree, start, len); 7052 read_unlock(&em_tree->lock); 7053 7054 if (em) { 7055 if (em->start > start || em->start + em->len <= start) 7056 free_extent_map(em); 7057 else if (em->block_start == EXTENT_MAP_INLINE && page) 7058 free_extent_map(em); 7059 else 7060 goto out; 7061 } 7062 em = alloc_extent_map(); 7063 if (!em) { 7064 ret = -ENOMEM; 7065 goto out; 7066 } 7067 em->start = EXTENT_MAP_HOLE; 7068 em->orig_start = EXTENT_MAP_HOLE; 7069 em->len = (u64)-1; 7070 em->block_len = (u64)-1; 7071 7072 path = btrfs_alloc_path(); 7073 if (!path) { 7074 ret = -ENOMEM; 7075 goto out; 7076 } 7077 7078 /* Chances are we'll be called again, so go ahead and do readahead */ 7079 path->reada = READA_FORWARD; 7080 7081 /* 7082 * The same explanation in load_free_space_cache applies here as well, 7083 * we only read when we're loading the free space cache, and at that 7084 * point the commit_root has everything we need. 7085 */ 7086 if (btrfs_is_free_space_inode(inode)) { 7087 path->search_commit_root = 1; 7088 path->skip_locking = 1; 7089 } 7090 7091 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 7092 if (ret < 0) { 7093 goto out; 7094 } else if (ret > 0) { 7095 if (path->slots[0] == 0) 7096 goto not_found; 7097 path->slots[0]--; 7098 ret = 0; 7099 } 7100 7101 leaf = path->nodes[0]; 7102 item = btrfs_item_ptr(leaf, path->slots[0], 7103 struct btrfs_file_extent_item); 7104 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7105 if (found_key.objectid != objectid || 7106 found_key.type != BTRFS_EXTENT_DATA_KEY) { 7107 /* 7108 * If we backup past the first extent we want to move forward 7109 * and see if there is an extent in front of us, otherwise we'll 7110 * say there is a hole for our whole search range which can 7111 * cause problems. 
7112 */ 7113 extent_end = start; 7114 goto next; 7115 } 7116 7117 extent_type = btrfs_file_extent_type(leaf, item); 7118 extent_start = found_key.offset; 7119 extent_end = btrfs_file_extent_end(path); 7120 if (extent_type == BTRFS_FILE_EXTENT_REG || 7121 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 7122 /* Only regular file could have regular/prealloc extent */ 7123 if (!S_ISREG(inode->vfs_inode.i_mode)) { 7124 ret = -EUCLEAN; 7125 btrfs_crit(fs_info, 7126 "regular/prealloc extent found for non-regular inode %llu", 7127 btrfs_ino(inode)); 7128 goto out; 7129 } 7130 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 7131 extent_start); 7132 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 7133 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 7134 path->slots[0], 7135 extent_start); 7136 } 7137 next: 7138 if (start >= extent_end) { 7139 path->slots[0]++; 7140 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 7141 ret = btrfs_next_leaf(root, path); 7142 if (ret < 0) 7143 goto out; 7144 else if (ret > 0) 7145 goto not_found; 7146 7147 leaf = path->nodes[0]; 7148 } 7149 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7150 if (found_key.objectid != objectid || 7151 found_key.type != BTRFS_EXTENT_DATA_KEY) 7152 goto not_found; 7153 if (start + len <= found_key.offset) 7154 goto not_found; 7155 if (start > found_key.offset) 7156 goto next; 7157 7158 /* New extent overlaps with existing one */ 7159 em->start = start; 7160 em->orig_start = start; 7161 em->len = found_key.offset - start; 7162 em->block_start = EXTENT_MAP_HOLE; 7163 goto insert; 7164 } 7165 7166 btrfs_extent_item_to_extent_map(inode, path, item, !page, em); 7167 7168 if (extent_type == BTRFS_FILE_EXTENT_REG || 7169 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 7170 goto insert; 7171 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 7172 unsigned long ptr; 7173 char *map; 7174 size_t size; 7175 size_t extent_offset; 7176 size_t copy_size; 7177 7178 if (!page) 7179 goto out; 7180 7181 size = btrfs_file_extent_ram_bytes(leaf, item); 7182 extent_offset = page_offset(page) + pg_offset - extent_start; 7183 copy_size = min_t(u64, PAGE_SIZE - pg_offset, 7184 size - extent_offset); 7185 em->start = extent_start + extent_offset; 7186 em->len = ALIGN(copy_size, fs_info->sectorsize); 7187 em->orig_block_len = em->len; 7188 em->orig_start = em->start; 7189 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 7190 7191 if (!PageUptodate(page)) { 7192 if (btrfs_file_extent_compression(leaf, item) != 7193 BTRFS_COMPRESS_NONE) { 7194 ret = uncompress_inline(path, page, pg_offset, 7195 extent_offset, item); 7196 if (ret) 7197 goto out; 7198 } else { 7199 map = kmap_local_page(page); 7200 read_extent_buffer(leaf, map + pg_offset, ptr, 7201 copy_size); 7202 if (pg_offset + copy_size < PAGE_SIZE) { 7203 memset(map + pg_offset + copy_size, 0, 7204 PAGE_SIZE - pg_offset - 7205 copy_size); 7206 } 7207 kunmap_local(map); 7208 } 7209 flush_dcache_page(page); 7210 } 7211 set_extent_uptodate(io_tree, em->start, 7212 extent_map_end(em) - 1, NULL, GFP_NOFS); 7213 goto insert; 7214 } 7215 not_found: 7216 em->start = start; 7217 em->orig_start = start; 7218 em->len = len; 7219 em->block_start = EXTENT_MAP_HOLE; 7220 insert: 7221 ret = 0; 7222 btrfs_release_path(path); 7223 if (em->start > start || extent_map_end(em) <= start) { 7224 btrfs_err(fs_info, 7225 "bad extent! 
em: [%llu %llu] passed [%llu %llu]", 7226 em->start, em->len, start, len); 7227 ret = -EIO; 7228 goto out; 7229 } 7230 7231 write_lock(&em_tree->lock); 7232 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 7233 write_unlock(&em_tree->lock); 7234 out: 7235 btrfs_free_path(path); 7236 7237 trace_btrfs_get_extent(root, inode, em); 7238 7239 if (ret) { 7240 free_extent_map(em); 7241 return ERR_PTR(ret); 7242 } 7243 return em; 7244 } 7245 7246 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 7247 u64 start, u64 len) 7248 { 7249 struct extent_map *em; 7250 struct extent_map *hole_em = NULL; 7251 u64 delalloc_start = start; 7252 u64 end; 7253 u64 delalloc_len; 7254 u64 delalloc_end; 7255 int err = 0; 7256 7257 em = btrfs_get_extent(inode, NULL, 0, start, len); 7258 if (IS_ERR(em)) 7259 return em; 7260 /* 7261 * If our em maps to: 7262 * - a hole or 7263 * - a pre-alloc extent, 7264 * there might actually be delalloc bytes behind it. 7265 */ 7266 if (em->block_start != EXTENT_MAP_HOLE && 7267 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7268 return em; 7269 else 7270 hole_em = em; 7271 7272 /* check to see if we've wrapped (len == -1 or similar) */ 7273 end = start + len; 7274 if (end < start) 7275 end = (u64)-1; 7276 else 7277 end -= 1; 7278 7279 em = NULL; 7280 7281 /* ok, we didn't find anything, lets look for delalloc */ 7282 delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, 7283 end, len, EXTENT_DELALLOC, 1); 7284 delalloc_end = delalloc_start + delalloc_len; 7285 if (delalloc_end < delalloc_start) 7286 delalloc_end = (u64)-1; 7287 7288 /* 7289 * We didn't find anything useful, return the original results from 7290 * get_extent() 7291 */ 7292 if (delalloc_start > end || delalloc_end <= start) { 7293 em = hole_em; 7294 hole_em = NULL; 7295 goto out; 7296 } 7297 7298 /* 7299 * Adjust the delalloc_start to make sure it doesn't go backwards from 7300 * the start they passed in 7301 */ 7302 delalloc_start = max(start, delalloc_start); 7303 delalloc_len = delalloc_end - delalloc_start; 7304 7305 if (delalloc_len > 0) { 7306 u64 hole_start; 7307 u64 hole_len; 7308 const u64 hole_end = extent_map_end(hole_em); 7309 7310 em = alloc_extent_map(); 7311 if (!em) { 7312 err = -ENOMEM; 7313 goto out; 7314 } 7315 7316 ASSERT(hole_em); 7317 /* 7318 * When btrfs_get_extent can't find anything it returns one 7319 * huge hole 7320 * 7321 * Make sure what it found really fits our range, and adjust to 7322 * make sure it is based on the start from the caller 7323 */ 7324 if (hole_end <= start || hole_em->start > end) { 7325 free_extent_map(hole_em); 7326 hole_em = NULL; 7327 } else { 7328 hole_start = max(hole_em->start, start); 7329 hole_len = hole_end - hole_start; 7330 } 7331 7332 if (hole_em && delalloc_start > hole_start) { 7333 /* 7334 * Our hole starts before our delalloc, so we have to 7335 * return just the parts of the hole that go until the 7336 * delalloc starts 7337 */ 7338 em->len = min(hole_len, delalloc_start - hole_start); 7339 em->start = hole_start; 7340 em->orig_start = hole_start; 7341 /* 7342 * Don't adjust block start at all, it is fixed at 7343 * EXTENT_MAP_HOLE 7344 */ 7345 em->block_start = hole_em->block_start; 7346 em->block_len = hole_len; 7347 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7348 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7349 } else { 7350 /* 7351 * Hole is out of passed range or it starts after 7352 * delalloc range 7353 */ 7354 em->start = delalloc_start; 7355 em->len = delalloc_len; 7356 em->orig_start = 
delalloc_start; 7357 em->block_start = EXTENT_MAP_DELALLOC; 7358 em->block_len = delalloc_len; 7359 } 7360 } else { 7361 return hole_em; 7362 } 7363 out: 7364 7365 free_extent_map(hole_em); 7366 if (err) { 7367 free_extent_map(em); 7368 return ERR_PTR(err); 7369 } 7370 return em; 7371 } 7372 7373 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, 7374 const u64 start, 7375 const u64 len, 7376 const u64 orig_start, 7377 const u64 block_start, 7378 const u64 block_len, 7379 const u64 orig_block_len, 7380 const u64 ram_bytes, 7381 const int type) 7382 { 7383 struct extent_map *em = NULL; 7384 int ret; 7385 7386 if (type != BTRFS_ORDERED_NOCOW) { 7387 em = create_io_em(inode, start, len, orig_start, block_start, 7388 block_len, orig_block_len, ram_bytes, 7389 BTRFS_COMPRESS_NONE, /* compress_type */ 7390 type); 7391 if (IS_ERR(em)) 7392 goto out; 7393 } 7394 ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, 7395 block_len, type); 7396 if (ret) { 7397 if (em) { 7398 free_extent_map(em); 7399 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 7400 } 7401 em = ERR_PTR(ret); 7402 } 7403 out: 7404 7405 return em; 7406 } 7407 7408 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, 7409 u64 start, u64 len) 7410 { 7411 struct btrfs_root *root = inode->root; 7412 struct btrfs_fs_info *fs_info = root->fs_info; 7413 struct extent_map *em; 7414 struct btrfs_key ins; 7415 u64 alloc_hint; 7416 int ret; 7417 7418 alloc_hint = get_extent_allocation_hint(inode, start, len); 7419 ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 7420 0, alloc_hint, &ins, 1, 1); 7421 if (ret) 7422 return ERR_PTR(ret); 7423 7424 em = btrfs_create_dio_extent(inode, start, ins.offset, start, 7425 ins.objectid, ins.offset, ins.offset, 7426 ins.offset, BTRFS_ORDERED_REGULAR); 7427 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 7428 if (IS_ERR(em)) 7429 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 7430 1); 7431 7432 return em; 7433 } 7434 7435 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 7436 { 7437 struct btrfs_block_group *block_group; 7438 bool readonly = false; 7439 7440 block_group = btrfs_lookup_block_group(fs_info, bytenr); 7441 if (!block_group || block_group->ro) 7442 readonly = true; 7443 if (block_group) 7444 btrfs_put_block_group(block_group); 7445 return readonly; 7446 } 7447 7448 /* 7449 * Check if we can do nocow write into the range [@offset, @offset + @len) 7450 * 7451 * @offset: File offset 7452 * @len: The length to write, will be updated to the nocow writeable 7453 * range 7454 * @orig_start: (optional) Return the original file offset of the file extent 7455 * @orig_len: (optional) Return the original on-disk length of the file extent 7456 * @ram_bytes: (optional) Return the ram_bytes of the file extent 7457 * @strict: if true, omit optimizations that might force us into unnecessary 7458 * cow. e.g., don't trust generation number. 7459 * 7460 * Return: 7461 * >0 and update @len if we can do nocow write 7462 * 0 if we can't do nocow write 7463 * <0 if error happened 7464 * 7465 * NOTE: This only checks the file extents, caller is responsible to wait for 7466 * any ordered extents. 
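 *
 * A typical caller looks roughly like the direct IO write path below
 * (illustrative sketch only, not a complete caller; the local names are
 * placeholders):
 *
 *	u64 nocow_len = write_len;
 *
 *	if (can_nocow_extent(inode, offset, &nocow_len, &orig_start,
 *			     &orig_block_len, &ram_bytes, false) == 1) {
 *		// safe to write in place; nocow_len may have been shrunk
 *	} else {
 *		// fall back to COW allocation
 *	}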
7467 */ 7468 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7469 u64 *orig_start, u64 *orig_block_len, 7470 u64 *ram_bytes, bool strict) 7471 { 7472 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7473 struct btrfs_path *path; 7474 int ret; 7475 struct extent_buffer *leaf; 7476 struct btrfs_root *root = BTRFS_I(inode)->root; 7477 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7478 struct btrfs_file_extent_item *fi; 7479 struct btrfs_key key; 7480 u64 disk_bytenr; 7481 u64 backref_offset; 7482 u64 extent_end; 7483 u64 num_bytes; 7484 int slot; 7485 int found_type; 7486 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7487 7488 path = btrfs_alloc_path(); 7489 if (!path) 7490 return -ENOMEM; 7491 7492 ret = btrfs_lookup_file_extent(NULL, root, path, 7493 btrfs_ino(BTRFS_I(inode)), offset, 0); 7494 if (ret < 0) 7495 goto out; 7496 7497 slot = path->slots[0]; 7498 if (ret == 1) { 7499 if (slot == 0) { 7500 /* can't find the item, must cow */ 7501 ret = 0; 7502 goto out; 7503 } 7504 slot--; 7505 } 7506 ret = 0; 7507 leaf = path->nodes[0]; 7508 btrfs_item_key_to_cpu(leaf, &key, slot); 7509 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7510 key.type != BTRFS_EXTENT_DATA_KEY) { 7511 /* not our file or wrong item type, must cow */ 7512 goto out; 7513 } 7514 7515 if (key.offset > offset) { 7516 /* Wrong offset, must cow */ 7517 goto out; 7518 } 7519 7520 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7521 found_type = btrfs_file_extent_type(leaf, fi); 7522 if (found_type != BTRFS_FILE_EXTENT_REG && 7523 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7524 /* not a regular extent, must cow */ 7525 goto out; 7526 } 7527 7528 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7529 goto out; 7530 7531 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7532 if (extent_end <= offset) 7533 goto out; 7534 7535 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7536 if (disk_bytenr == 0) 7537 goto out; 7538 7539 if (btrfs_file_extent_compression(leaf, fi) || 7540 btrfs_file_extent_encryption(leaf, fi) || 7541 btrfs_file_extent_other_encoding(leaf, fi)) 7542 goto out; 7543 7544 /* 7545 * Do the same check as in btrfs_cross_ref_exist but without the 7546 * unnecessary search. 
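 *
 * If the extent's generation is not newer than the root's last snapshot,
 * a snapshot may still reference it, so without @strict we conservatively
 * treat it as shared and fall back to COW rather than walking backrefs
 * here.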
7547 */ 7548 if (!strict && 7549 (btrfs_file_extent_generation(leaf, fi) <= 7550 btrfs_root_last_snapshot(&root->root_item))) 7551 goto out; 7552 7553 backref_offset = btrfs_file_extent_offset(leaf, fi); 7554 7555 if (orig_start) { 7556 *orig_start = key.offset - backref_offset; 7557 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7558 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7559 } 7560 7561 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 7562 goto out; 7563 7564 num_bytes = min(offset + *len, extent_end) - offset; 7565 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7566 u64 range_end; 7567 7568 range_end = round_up(offset + num_bytes, 7569 root->fs_info->sectorsize) - 1; 7570 ret = test_range_bit(io_tree, offset, range_end, 7571 EXTENT_DELALLOC, 0, NULL); 7572 if (ret) { 7573 ret = -EAGAIN; 7574 goto out; 7575 } 7576 } 7577 7578 btrfs_release_path(path); 7579 7580 /* 7581 * look for other files referencing this extent, if we 7582 * find any we must cow 7583 */ 7584 7585 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), 7586 key.offset - backref_offset, disk_bytenr, 7587 strict); 7588 if (ret) { 7589 ret = 0; 7590 goto out; 7591 } 7592 7593 /* 7594 * adjust disk_bytenr and num_bytes to cover just the bytes 7595 * in this extent we are about to write. If there 7596 * are any csums in that range we have to cow in order 7597 * to keep the csums correct 7598 */ 7599 disk_bytenr += backref_offset; 7600 disk_bytenr += offset - key.offset; 7601 if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) 7602 goto out; 7603 /* 7604 * all of the above have passed, it is safe to overwrite this extent 7605 * without cow 7606 */ 7607 *len = num_bytes; 7608 ret = 1; 7609 out: 7610 btrfs_free_path(path); 7611 return ret; 7612 } 7613 7614 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7615 struct extent_state **cached_state, bool writing) 7616 { 7617 struct btrfs_ordered_extent *ordered; 7618 int ret = 0; 7619 7620 while (1) { 7621 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7622 cached_state); 7623 /* 7624 * We're concerned with the entire range that we're going to be 7625 * doing DIO to, so we need to make sure there's no ordered 7626 * extents in this range. 7627 */ 7628 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, 7629 lockend - lockstart + 1); 7630 7631 /* 7632 * We need to make sure there are no buffered pages in this 7633 * range either, we could have raced between the invalidate in 7634 * generic_file_direct_write and locking the extent. The 7635 * invalidate needs to happen so that reads after a write do not 7636 * get stale data. 7637 */ 7638 if (!ordered && 7639 (!writing || !filemap_range_has_page(inode->i_mapping, 7640 lockstart, lockend))) 7641 break; 7642 7643 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7644 cached_state); 7645 7646 if (ordered) { 7647 /* 7648 * If we are doing a DIO read and the ordered extent we 7649 * found is for a buffered write, we can not wait for it 7650 * to complete and retry, because if we do so we can 7651 * deadlock with concurrent buffered writes on page 7652 * locks. 
This happens only if our DIO read covers more 7653 * than one extent map, if at this point has already 7654 * created an ordered extent for a previous extent map 7655 * and locked its range in the inode's io tree, and a 7656 * concurrent write against that previous extent map's 7657 * range and this range started (we unlock the ranges 7658 * in the io tree only when the bios complete and 7659 * buffered writes always lock pages before attempting 7660 * to lock range in the io tree). 7661 */ 7662 if (writing || 7663 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7664 btrfs_start_ordered_extent(ordered, 1); 7665 else 7666 ret = -ENOTBLK; 7667 btrfs_put_ordered_extent(ordered); 7668 } else { 7669 /* 7670 * We could trigger writeback for this range (and wait 7671 * for it to complete) and then invalidate the pages for 7672 * this range (through invalidate_inode_pages2_range()), 7673 * but that can lead us to a deadlock with a concurrent 7674 * call to readahead (a buffered read or a defrag call 7675 * triggered a readahead) on a page lock due to an 7676 * ordered dio extent we created before but did not have 7677 * yet a corresponding bio submitted (whence it can not 7678 * complete), which makes readahead wait for that 7679 * ordered extent to complete while holding a lock on 7680 * that page. 7681 */ 7682 ret = -ENOTBLK; 7683 } 7684 7685 if (ret) 7686 break; 7687 7688 cond_resched(); 7689 } 7690 7691 return ret; 7692 } 7693 7694 /* The callers of this must take lock_extent() */ 7695 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, 7696 u64 len, u64 orig_start, u64 block_start, 7697 u64 block_len, u64 orig_block_len, 7698 u64 ram_bytes, int compress_type, 7699 int type) 7700 { 7701 struct extent_map_tree *em_tree; 7702 struct extent_map *em; 7703 int ret; 7704 7705 ASSERT(type == BTRFS_ORDERED_PREALLOC || 7706 type == BTRFS_ORDERED_COMPRESSED || 7707 type == BTRFS_ORDERED_NOCOW || 7708 type == BTRFS_ORDERED_REGULAR); 7709 7710 em_tree = &inode->extent_tree; 7711 em = alloc_extent_map(); 7712 if (!em) 7713 return ERR_PTR(-ENOMEM); 7714 7715 em->start = start; 7716 em->orig_start = orig_start; 7717 em->len = len; 7718 em->block_len = block_len; 7719 em->block_start = block_start; 7720 em->orig_block_len = orig_block_len; 7721 em->ram_bytes = ram_bytes; 7722 em->generation = -1; 7723 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7724 if (type == BTRFS_ORDERED_PREALLOC) { 7725 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7726 } else if (type == BTRFS_ORDERED_COMPRESSED) { 7727 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 7728 em->compress_type = compress_type; 7729 } 7730 7731 do { 7732 btrfs_drop_extent_cache(inode, em->start, 7733 em->start + em->len - 1, 0); 7734 write_lock(&em_tree->lock); 7735 ret = add_extent_mapping(em_tree, em, 1); 7736 write_unlock(&em_tree->lock); 7737 /* 7738 * The caller has taken lock_extent(), who could race with us 7739 * to add em? 7740 */ 7741 } while (ret == -EEXIST); 7742 7743 if (ret) { 7744 free_extent_map(em); 7745 return ERR_PTR(ret); 7746 } 7747 7748 /* em got 2 refs now, callers needs to do free_extent_map once. 
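 * One reference is held by the extent map tree (dropped when the mapping
 * is evicted or replaced), the other belongs to the caller and is dropped
 * by that single free_extent_map() call.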
*/ 7749 return em; 7750 } 7751 7752 7753 static int btrfs_get_blocks_direct_write(struct extent_map **map, 7754 struct inode *inode, 7755 struct btrfs_dio_data *dio_data, 7756 u64 start, u64 len) 7757 { 7758 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7759 struct extent_map *em = *map; 7760 int ret = 0; 7761 7762 /* 7763 * We don't allocate a new extent in the following cases 7764 * 7765 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7766 * existing extent. 7767 * 2) The extent is marked as PREALLOC. We're good to go here and can 7768 * just use the extent. 7769 * 7770 */ 7771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7772 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7773 em->block_start != EXTENT_MAP_HOLE)) { 7774 int type; 7775 u64 block_start, orig_start, orig_block_len, ram_bytes; 7776 7777 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7778 type = BTRFS_ORDERED_PREALLOC; 7779 else 7780 type = BTRFS_ORDERED_NOCOW; 7781 len = min(len, em->len - (start - em->start)); 7782 block_start = em->block_start + (start - em->start); 7783 7784 if (can_nocow_extent(inode, start, &len, &orig_start, 7785 &orig_block_len, &ram_bytes, false) == 1 && 7786 btrfs_inc_nocow_writers(fs_info, block_start)) { 7787 struct extent_map *em2; 7788 7789 em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, 7790 orig_start, block_start, 7791 len, orig_block_len, 7792 ram_bytes, type); 7793 btrfs_dec_nocow_writers(fs_info, block_start); 7794 if (type == BTRFS_ORDERED_PREALLOC) { 7795 free_extent_map(em); 7796 *map = em = em2; 7797 } 7798 7799 if (em2 && IS_ERR(em2)) { 7800 ret = PTR_ERR(em2); 7801 goto out; 7802 } 7803 /* 7804 * For inode marked NODATACOW or extent marked PREALLOC, 7805 * use the existing or preallocated extent, so does not 7806 * need to adjust btrfs_space_info's bytes_may_use. 7807 */ 7808 btrfs_free_reserved_data_space_noquota(fs_info, len); 7809 goto skip_cow; 7810 } 7811 } 7812 7813 /* this will cow the extent */ 7814 free_extent_map(em); 7815 *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); 7816 if (IS_ERR(em)) { 7817 ret = PTR_ERR(em); 7818 goto out; 7819 } 7820 7821 len = min(len, em->len - (start - em->start)); 7822 7823 skip_cow: 7824 /* 7825 * Need to update the i_size under the extent lock so buffered 7826 * readers will get the updated i_size when we unlock. 7827 */ 7828 if (start + len > i_size_read(inode)) 7829 i_size_write(inode, start + len); 7830 7831 dio_data->reserve -= len; 7832 out: 7833 return ret; 7834 } 7835 7836 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, 7837 loff_t length, unsigned int flags, struct iomap *iomap, 7838 struct iomap *srcmap) 7839 { 7840 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7841 struct extent_map *em; 7842 struct extent_state *cached_state = NULL; 7843 struct btrfs_dio_data *dio_data = NULL; 7844 u64 lockstart, lockend; 7845 const bool write = !!(flags & IOMAP_WRITE); 7846 int ret = 0; 7847 u64 len = length; 7848 bool unlock_extents = false; 7849 7850 if (!write) 7851 len = min_t(u64, len, fs_info->sectorsize); 7852 7853 lockstart = start; 7854 lockend = start + len - 1; 7855 7856 /* 7857 * The generic stuff only does filemap_write_and_wait_range, which 7858 * isn't enough if we've written compressed pages to this area, so we 7859 * need to flush the dirty pages again to make absolutely sure that any 7860 * outstanding dirty pages are on disk. 
7861 */ 7862 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7863 &BTRFS_I(inode)->runtime_flags)) { 7864 ret = filemap_fdatawrite_range(inode->i_mapping, start, 7865 start + length - 1); 7866 if (ret) 7867 return ret; 7868 } 7869 7870 dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); 7871 if (!dio_data) 7872 return -ENOMEM; 7873 7874 dio_data->length = length; 7875 if (write) { 7876 dio_data->reserve = round_up(length, fs_info->sectorsize); 7877 ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), 7878 &dio_data->data_reserved, 7879 start, dio_data->reserve); 7880 if (ret) { 7881 extent_changeset_free(dio_data->data_reserved); 7882 kfree(dio_data); 7883 return ret; 7884 } 7885 } 7886 iomap->private = dio_data; 7887 7888 7889 /* 7890 * If this errors out it's because we couldn't invalidate pagecache for 7891 * this range and we need to fallback to buffered. 7892 */ 7893 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { 7894 ret = -ENOTBLK; 7895 goto err; 7896 } 7897 7898 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 7899 if (IS_ERR(em)) { 7900 ret = PTR_ERR(em); 7901 goto unlock_err; 7902 } 7903 7904 /* 7905 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7906 * io. INLINE is special, and we could probably kludge it in here, but 7907 * it's still buffered so for safety lets just fall back to the generic 7908 * buffered path. 7909 * 7910 * For COMPRESSED we _have_ to read the entire extent in so we can 7911 * decompress it, so there will be buffering required no matter what we 7912 * do, so go ahead and fallback to buffered. 7913 * 7914 * We return -ENOTBLK because that's what makes DIO go ahead and go back 7915 * to buffered IO. Don't blame me, this is the price we pay for using 7916 * the generic code. 7917 */ 7918 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7919 em->block_start == EXTENT_MAP_INLINE) { 7920 free_extent_map(em); 7921 ret = -ENOTBLK; 7922 goto unlock_err; 7923 } 7924 7925 len = min(len, em->len - (start - em->start)); 7926 if (write) { 7927 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, 7928 start, len); 7929 if (ret < 0) 7930 goto unlock_err; 7931 unlock_extents = true; 7932 /* Recalc len in case the new em is smaller than requested */ 7933 len = min(len, em->len - (start - em->start)); 7934 } else { 7935 /* 7936 * We need to unlock only the end area that we aren't using. 7937 * The rest is going to be unlocked by the endio routine. 7938 */ 7939 lockstart = start + len; 7940 if (lockstart < lockend) 7941 unlock_extents = true; 7942 } 7943 7944 if (unlock_extents) 7945 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 7946 lockstart, lockend, &cached_state); 7947 else 7948 free_extent_state(cached_state); 7949 7950 /* 7951 * Translate extent map information to iomap. 7952 * We trim the extents (and move the addr) even though iomap code does 7953 * that, since we have locked only the parts we are performing I/O in. 
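 *
 * For example, if the extent map covers file range [0, 128K) starting at
 * disk address B and the DIO starts at file offset 64K with a 32K length,
 * iomap->addr becomes B + 64K and iomap->length stays 32K, matching
 * exactly the range locked above.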
7954 */ 7955 if ((em->block_start == EXTENT_MAP_HOLE) || 7956 (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { 7957 iomap->addr = IOMAP_NULL_ADDR; 7958 iomap->type = IOMAP_HOLE; 7959 } else { 7960 iomap->addr = em->block_start + (start - em->start); 7961 iomap->type = IOMAP_MAPPED; 7962 } 7963 iomap->offset = start; 7964 iomap->bdev = fs_info->fs_devices->latest_bdev; 7965 iomap->length = len; 7966 7967 if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) 7968 iomap->flags |= IOMAP_F_ZONE_APPEND; 7969 7970 free_extent_map(em); 7971 7972 return 0; 7973 7974 unlock_err: 7975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7976 &cached_state); 7977 err: 7978 if (dio_data) { 7979 btrfs_delalloc_release_space(BTRFS_I(inode), 7980 dio_data->data_reserved, start, 7981 dio_data->reserve, true); 7982 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); 7983 extent_changeset_free(dio_data->data_reserved); 7984 kfree(dio_data); 7985 } 7986 return ret; 7987 } 7988 7989 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, 7990 ssize_t written, unsigned int flags, struct iomap *iomap) 7991 { 7992 int ret = 0; 7993 struct btrfs_dio_data *dio_data = iomap->private; 7994 size_t submitted = dio_data->submitted; 7995 const bool write = !!(flags & IOMAP_WRITE); 7996 7997 if (!write && (iomap->type == IOMAP_HOLE)) { 7998 /* If reading from a hole, unlock and return */ 7999 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); 8000 goto out; 8001 } 8002 8003 if (submitted < length) { 8004 pos += submitted; 8005 length -= submitted; 8006 if (write) 8007 __endio_write_update_ordered(BTRFS_I(inode), pos, 8008 length, false); 8009 else 8010 unlock_extent(&BTRFS_I(inode)->io_tree, pos, 8011 pos + length - 1); 8012 ret = -ENOTBLK; 8013 } 8014 8015 if (write) { 8016 if (dio_data->reserve) 8017 btrfs_delalloc_release_space(BTRFS_I(inode), 8018 dio_data->data_reserved, pos, 8019 dio_data->reserve, true); 8020 btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); 8021 extent_changeset_free(dio_data->data_reserved); 8022 } 8023 out: 8024 kfree(dio_data); 8025 iomap->private = NULL; 8026 8027 return ret; 8028 } 8029 8030 static void btrfs_dio_private_put(struct btrfs_dio_private *dip) 8031 { 8032 /* 8033 * This implies a barrier so that stores to dio_bio->bi_status before 8034 * this and loads of dio_bio->bi_status after this are fully ordered. 
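 *
 * Each cloned bio's end_io may store an error to dio_bio->bi_status, and
 * only the final reference holder reads it below when calling bio_endio(),
 * so the ordering of refcount_dec_and_test() ensures the last dropper sees
 * all of those earlier stores.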
8035 */ 8036 if (!refcount_dec_and_test(&dip->refs)) 8037 return; 8038 8039 if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { 8040 __endio_write_update_ordered(BTRFS_I(dip->inode), 8041 dip->logical_offset, 8042 dip->bytes, 8043 !dip->dio_bio->bi_status); 8044 } else { 8045 unlock_extent(&BTRFS_I(dip->inode)->io_tree, 8046 dip->logical_offset, 8047 dip->logical_offset + dip->bytes - 1); 8048 } 8049 8050 bio_endio(dip->dio_bio); 8051 kfree(dip); 8052 } 8053 8054 static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, 8055 int mirror_num, 8056 unsigned long bio_flags) 8057 { 8058 struct btrfs_dio_private *dip = bio->bi_private; 8059 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8060 blk_status_t ret; 8061 8062 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 8063 8064 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 8065 if (ret) 8066 return ret; 8067 8068 refcount_inc(&dip->refs); 8069 ret = btrfs_map_bio(fs_info, bio, mirror_num); 8070 if (ret) 8071 refcount_dec(&dip->refs); 8072 return ret; 8073 } 8074 8075 static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, 8076 struct btrfs_io_bio *io_bio, 8077 const bool uptodate) 8078 { 8079 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 8080 const u32 sectorsize = fs_info->sectorsize; 8081 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 8082 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8083 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 8084 struct bio_vec bvec; 8085 struct bvec_iter iter; 8086 u64 start = io_bio->logical; 8087 u32 bio_offset = 0; 8088 blk_status_t err = BLK_STS_OK; 8089 8090 __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { 8091 unsigned int i, nr_sectors, pgoff; 8092 8093 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); 8094 pgoff = bvec.bv_offset; 8095 for (i = 0; i < nr_sectors; i++) { 8096 ASSERT(pgoff < PAGE_SIZE); 8097 if (uptodate && 8098 (!csum || !check_data_csum(inode, io_bio, 8099 bio_offset, bvec.bv_page, 8100 pgoff, start))) { 8101 clean_io_failure(fs_info, failure_tree, io_tree, 8102 start, bvec.bv_page, 8103 btrfs_ino(BTRFS_I(inode)), 8104 pgoff); 8105 } else { 8106 int ret; 8107 8108 ASSERT((start - io_bio->logical) < UINT_MAX); 8109 ret = btrfs_repair_one_sector(inode, 8110 &io_bio->bio, 8111 start - io_bio->logical, 8112 bvec.bv_page, pgoff, 8113 start, io_bio->mirror_num, 8114 submit_dio_repair_bio); 8115 if (ret) 8116 err = errno_to_blk_status(ret); 8117 } 8118 start += sectorsize; 8119 ASSERT(bio_offset + sectorsize > bio_offset); 8120 bio_offset += sectorsize; 8121 pgoff += sectorsize; 8122 } 8123 } 8124 return err; 8125 } 8126 8127 static void __endio_write_update_ordered(struct btrfs_inode *inode, 8128 const u64 offset, const u64 bytes, 8129 const bool uptodate) 8130 { 8131 btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, 8132 finish_ordered_fn, uptodate); 8133 } 8134 8135 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, 8136 struct bio *bio, 8137 u64 dio_file_offset) 8138 { 8139 return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); 8140 } 8141 8142 static void btrfs_end_dio_bio(struct bio *bio) 8143 { 8144 struct btrfs_dio_private *dip = bio->bi_private; 8145 blk_status_t err = bio->bi_status; 8146 8147 if (err) 8148 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 8149 "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", 8150 btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), 8151 bio->bi_opf, 
bio->bi_iter.bi_sector, 8152 bio->bi_iter.bi_size, err); 8153 8154 if (bio_op(bio) == REQ_OP_READ) { 8155 err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), 8156 !err); 8157 } 8158 8159 if (err) 8160 dip->dio_bio->bi_status = err; 8161 8162 btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); 8163 8164 bio_put(bio); 8165 btrfs_dio_private_put(dip); 8166 } 8167 8168 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, 8169 struct inode *inode, u64 file_offset, int async_submit) 8170 { 8171 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8172 struct btrfs_dio_private *dip = bio->bi_private; 8173 bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; 8174 blk_status_t ret; 8175 8176 /* Check btrfs_submit_bio_hook() for rules about async submit. */ 8177 if (async_submit) 8178 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 8179 8180 if (!write) { 8181 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 8182 if (ret) 8183 goto err; 8184 } 8185 8186 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 8187 goto map; 8188 8189 if (write && async_submit) { 8190 ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, 8191 btrfs_submit_bio_start_direct_io); 8192 goto err; 8193 } else if (write) { 8194 /* 8195 * If we aren't doing async submit, calculate the csum of the 8196 * bio now. 8197 */ 8198 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); 8199 if (ret) 8200 goto err; 8201 } else { 8202 u64 csum_offset; 8203 8204 csum_offset = file_offset - dip->logical_offset; 8205 csum_offset >>= fs_info->sectorsize_bits; 8206 csum_offset *= fs_info->csum_size; 8207 btrfs_io_bio(bio)->csum = dip->csums + csum_offset; 8208 } 8209 map: 8210 ret = btrfs_map_bio(fs_info, bio, 0); 8211 err: 8212 return ret; 8213 } 8214 8215 /* 8216 * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked 8217 * or ordered extents whether or not we submit any bios. 
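 *
 * For reads with checksums enabled the csum array is allocated inline at
 * the end of the structure. As an illustration, a 1MiB read with 4KiB
 * sectors and 4-byte crc32c checksums needs room for 256 sums, i.e. an
 * extra 1KiB on top of sizeof(struct btrfs_dio_private).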
8218 */ 8219 static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, 8220 struct inode *inode, 8221 loff_t file_offset) 8222 { 8223 const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 8224 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 8225 size_t dip_size; 8226 struct btrfs_dio_private *dip; 8227 8228 dip_size = sizeof(*dip); 8229 if (!write && csum) { 8230 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8231 size_t nblocks; 8232 8233 nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; 8234 dip_size += fs_info->csum_size * nblocks; 8235 } 8236 8237 dip = kzalloc(dip_size, GFP_NOFS); 8238 if (!dip) 8239 return NULL; 8240 8241 dip->inode = inode; 8242 dip->logical_offset = file_offset; 8243 dip->bytes = dio_bio->bi_iter.bi_size; 8244 dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; 8245 dip->dio_bio = dio_bio; 8246 refcount_set(&dip->refs, 1); 8247 return dip; 8248 } 8249 8250 static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, 8251 struct bio *dio_bio, loff_t file_offset) 8252 { 8253 struct inode *inode = iter->inode; 8254 const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 8255 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8256 const bool raid56 = (btrfs_data_alloc_profile(fs_info) & 8257 BTRFS_BLOCK_GROUP_RAID56_MASK); 8258 struct btrfs_dio_private *dip; 8259 struct bio *bio; 8260 u64 start_sector; 8261 int async_submit = 0; 8262 u64 submit_len; 8263 u64 clone_offset = 0; 8264 u64 clone_len; 8265 u64 logical; 8266 int ret; 8267 blk_status_t status; 8268 struct btrfs_io_geometry geom; 8269 struct btrfs_dio_data *dio_data = iter->iomap.private; 8270 struct extent_map *em = NULL; 8271 8272 dip = btrfs_create_dio_private(dio_bio, inode, file_offset); 8273 if (!dip) { 8274 if (!write) { 8275 unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 8276 file_offset + dio_bio->bi_iter.bi_size - 1); 8277 } 8278 dio_bio->bi_status = BLK_STS_RESOURCE; 8279 bio_endio(dio_bio); 8280 return BLK_QC_T_NONE; 8281 } 8282 8283 if (!write) { 8284 /* 8285 * Load the csums up front to reduce csum tree searches and 8286 * contention when submitting bios. 8287 * 8288 * If we have csums disabled this will do nothing. 8289 */ 8290 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); 8291 if (status != BLK_STS_OK) 8292 goto out_err; 8293 } 8294 8295 start_sector = dio_bio->bi_iter.bi_sector; 8296 submit_len = dio_bio->bi_iter.bi_size; 8297 8298 do { 8299 logical = start_sector << 9; 8300 em = btrfs_get_chunk_map(fs_info, logical, submit_len); 8301 if (IS_ERR(em)) { 8302 status = errno_to_blk_status(PTR_ERR(em)); 8303 em = NULL; 8304 goto out_err_em; 8305 } 8306 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), 8307 logical, &geom); 8308 if (ret) { 8309 status = errno_to_blk_status(ret); 8310 goto out_err_em; 8311 } 8312 8313 clone_len = min(submit_len, geom.len); 8314 ASSERT(clone_len <= UINT_MAX); 8315 8316 /* 8317 * This will never fail as it's passing GPF_NOFS and 8318 * the allocation is backed by btrfs_bioset. 
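 *
 * (That is GFP_NOFS: bioset allocations are mempool backed, so under memory
 * pressure the clone waits for a bio to become available instead of
 * returning NULL.)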
8319 */ 8320 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); 8321 bio->bi_private = dip; 8322 bio->bi_end_io = btrfs_end_dio_bio; 8323 btrfs_io_bio(bio)->logical = file_offset; 8324 8325 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 8326 status = extract_ordered_extent(BTRFS_I(inode), bio, 8327 file_offset); 8328 if (status) { 8329 bio_put(bio); 8330 goto out_err; 8331 } 8332 } 8333 8334 ASSERT(submit_len >= clone_len); 8335 submit_len -= clone_len; 8336 8337 /* 8338 * Increase the count before we submit the bio so we know 8339 * the end IO handler won't happen before we increase the 8340 * count. Otherwise, the dip might get freed before we're 8341 * done setting it up. 8342 * 8343 * We transfer the initial reference to the last bio, so we 8344 * don't need to increment the reference count for the last one. 8345 */ 8346 if (submit_len > 0) { 8347 refcount_inc(&dip->refs); 8348 /* 8349 * If we are submitting more than one bio, submit them 8350 * all asynchronously. The exception is RAID 5 or 6, as 8351 * asynchronous checksums make it difficult to collect 8352 * full stripe writes. 8353 */ 8354 if (!raid56) 8355 async_submit = 1; 8356 } 8357 8358 status = btrfs_submit_dio_bio(bio, inode, file_offset, 8359 async_submit); 8360 if (status) { 8361 bio_put(bio); 8362 if (submit_len > 0) 8363 refcount_dec(&dip->refs); 8364 goto out_err_em; 8365 } 8366 8367 dio_data->submitted += clone_len; 8368 clone_offset += clone_len; 8369 start_sector += clone_len >> 9; 8370 file_offset += clone_len; 8371 8372 free_extent_map(em); 8373 } while (submit_len > 0); 8374 return BLK_QC_T_NONE; 8375 8376 out_err_em: 8377 free_extent_map(em); 8378 out_err: 8379 dip->dio_bio->bi_status = status; 8380 btrfs_dio_private_put(dip); 8381 8382 return BLK_QC_T_NONE; 8383 } 8384 8385 const struct iomap_ops btrfs_dio_iomap_ops = { 8386 .iomap_begin = btrfs_dio_iomap_begin, 8387 .iomap_end = btrfs_dio_iomap_end, 8388 }; 8389 8390 const struct iomap_dio_ops btrfs_dio_ops = { 8391 .submit_io = btrfs_submit_direct, 8392 }; 8393 8394 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8395 u64 start, u64 len) 8396 { 8397 int ret; 8398 8399 ret = fiemap_prep(inode, fieinfo, start, &len, 0); 8400 if (ret) 8401 return ret; 8402 8403 return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); 8404 } 8405 8406 int btrfs_readpage(struct file *file, struct page *page) 8407 { 8408 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8409 u64 start = page_offset(page); 8410 u64 end = start + PAGE_SIZE - 1; 8411 struct btrfs_bio_ctrl bio_ctrl = { 0 }; 8412 int ret; 8413 8414 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8415 8416 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 8417 if (bio_ctrl.bio) 8418 ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8419 return ret; 8420 } 8421 8422 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 8423 { 8424 struct inode *inode = page->mapping->host; 8425 int ret; 8426 8427 if (current->flags & PF_MEMALLOC) { 8428 redirty_page_for_writepage(wbc, page); 8429 unlock_page(page); 8430 return 0; 8431 } 8432 8433 /* 8434 * If we are under memory pressure we will call this directly from the 8435 * VM, we need to make sure we have the inode referenced for the ordered 8436 * extent. If not just return like we didn't do anything. 
8437 */ 8438 if (!igrab(inode)) { 8439 redirty_page_for_writepage(wbc, page); 8440 return AOP_WRITEPAGE_ACTIVATE; 8441 } 8442 ret = extent_write_full_page(page, wbc); 8443 btrfs_add_delayed_iput(inode); 8444 return ret; 8445 } 8446 8447 static int btrfs_writepages(struct address_space *mapping, 8448 struct writeback_control *wbc) 8449 { 8450 return extent_writepages(mapping, wbc); 8451 } 8452 8453 static void btrfs_readahead(struct readahead_control *rac) 8454 { 8455 extent_readahead(rac); 8456 } 8457 8458 /* 8459 * For releasepage() and invalidatepage() we have a race window where 8460 * end_page_writeback() is called but the subpage spinlock is not yet released. 8461 * If we continue to release/invalidate the page, we could cause use-after-free 8462 * for subpage spinlock. So this function is to spin and wait for subpage 8463 * spinlock. 8464 */ 8465 static void wait_subpage_spinlock(struct page *page) 8466 { 8467 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 8468 struct btrfs_subpage *subpage; 8469 8470 if (fs_info->sectorsize == PAGE_SIZE) 8471 return; 8472 8473 ASSERT(PagePrivate(page) && page->private); 8474 subpage = (struct btrfs_subpage *)page->private; 8475 8476 /* 8477 * This may look insane as we just acquire the spinlock and release it, 8478 * without doing anything. But we just want to make sure no one is 8479 * still holding the subpage spinlock. 8480 * And since the page is not dirty nor writeback, and we have page 8481 * locked, the only possible way to hold a spinlock is from the endio 8482 * function to clear page writeback. 8483 * 8484 * Here we just acquire the spinlock so that all existing callers 8485 * should exit and we're safe to release/invalidate the page. 8486 */ 8487 spin_lock_irq(&subpage->lock); 8488 spin_unlock_irq(&subpage->lock); 8489 } 8490 8491 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8492 { 8493 int ret = try_release_extent_mapping(page, gfp_flags); 8494 8495 if (ret == 1) { 8496 wait_subpage_spinlock(page); 8497 clear_page_extent_mapped(page); 8498 } 8499 return ret; 8500 } 8501 8502 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8503 { 8504 if (PageWriteback(page) || PageDirty(page)) 8505 return 0; 8506 return __btrfs_releasepage(page, gfp_flags); 8507 } 8508 8509 #ifdef CONFIG_MIGRATION 8510 static int btrfs_migratepage(struct address_space *mapping, 8511 struct page *newpage, struct page *page, 8512 enum migrate_mode mode) 8513 { 8514 int ret; 8515 8516 ret = migrate_page_move_mapping(mapping, newpage, page, 0); 8517 if (ret != MIGRATEPAGE_SUCCESS) 8518 return ret; 8519 8520 if (page_has_private(page)) 8521 attach_page_private(newpage, detach_page_private(page)); 8522 8523 if (PageOrdered(page)) { 8524 ClearPageOrdered(page); 8525 SetPageOrdered(newpage); 8526 } 8527 8528 if (mode != MIGRATE_SYNC_NO_COPY) 8529 migrate_page_copy(newpage, page); 8530 else 8531 migrate_page_states(newpage, page); 8532 return MIGRATEPAGE_SUCCESS; 8533 } 8534 #endif 8535 8536 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 8537 unsigned int length) 8538 { 8539 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8540 struct btrfs_fs_info *fs_info = inode->root->fs_info; 8541 struct extent_io_tree *tree = &inode->io_tree; 8542 struct extent_state *cached_state = NULL; 8543 u64 page_start = page_offset(page); 8544 u64 page_end = page_start + PAGE_SIZE - 1; 8545 u64 cur; 8546 int inode_evicting = inode->vfs_inode.i_state & I_FREEING; 8547 8548 /* 8549 * We have page locked so 
no new ordered extent can be created on this 8550 * page, nor bio can be submitted for this page. 8551 * 8552 * But already submitted bio can still be finished on this page. 8553 * Furthermore, endio function won't skip page which has Ordered 8554 * (Private2) already cleared, so it's possible for endio and 8555 * invalidatepage to do the same ordered extent accounting twice 8556 * on one page. 8557 * 8558 * So here we wait for any submitted bios to finish, so that we won't 8559 * do double ordered extent accounting on the same page. 8560 */ 8561 wait_on_page_writeback(page); 8562 wait_subpage_spinlock(page); 8563 8564 /* 8565 * For subpage case, we have call sites like 8566 * btrfs_punch_hole_lock_range() which passes range not aligned to 8567 * sectorsize. 8568 * If the range doesn't cover the full page, we don't need to and 8569 * shouldn't clear page extent mapped, as page->private can still 8570 * record subpage dirty bits for other part of the range. 8571 * 8572 * For cases that can invalidate the full even the range doesn't 8573 * cover the full page, like invalidating the last page, we're 8574 * still safe to wait for ordered extent to finish. 8575 */ 8576 if (!(offset == 0 && length == PAGE_SIZE)) { 8577 btrfs_releasepage(page, GFP_NOFS); 8578 return; 8579 } 8580 8581 if (!inode_evicting) 8582 lock_extent_bits(tree, page_start, page_end, &cached_state); 8583 8584 cur = page_start; 8585 while (cur < page_end) { 8586 struct btrfs_ordered_extent *ordered; 8587 bool delete_states; 8588 u64 range_end; 8589 u32 range_len; 8590 8591 ordered = btrfs_lookup_first_ordered_range(inode, cur, 8592 page_end + 1 - cur); 8593 if (!ordered) { 8594 range_end = page_end; 8595 /* 8596 * No ordered extent covering this range, we are safe 8597 * to delete all extent states in the range. 8598 */ 8599 delete_states = true; 8600 goto next; 8601 } 8602 if (ordered->file_offset > cur) { 8603 /* 8604 * There is a range between [cur, oe->file_offset) not 8605 * covered by any ordered extent. 8606 * We are safe to delete all extent states, and handle 8607 * the ordered extent in the next iteration. 8608 */ 8609 range_end = ordered->file_offset - 1; 8610 delete_states = true; 8611 goto next; 8612 } 8613 8614 range_end = min(ordered->file_offset + ordered->num_bytes - 1, 8615 page_end); 8616 ASSERT(range_end + 1 - cur < U32_MAX); 8617 range_len = range_end + 1 - cur; 8618 if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { 8619 /* 8620 * If Ordered (Private2) is cleared, it means endio has 8621 * already been executed for the range. 8622 * We can't delete the extent states as 8623 * btrfs_finish_ordered_io() may still use some of them. 8624 */ 8625 delete_states = false; 8626 goto next; 8627 } 8628 btrfs_page_clear_ordered(fs_info, page, cur, range_len); 8629 8630 /* 8631 * IO on this page will never be started, so we need to account 8632 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 8633 * here, must leave that up for the ordered extent completion. 8634 * 8635 * This will also unlock the range for incoming 8636 * btrfs_finish_ordered_io(). 
8637 */ 8638 if (!inode_evicting) 8639 clear_extent_bit(tree, cur, range_end, 8640 EXTENT_DELALLOC | 8641 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8642 EXTENT_DEFRAG, 1, 0, &cached_state); 8643 8644 spin_lock_irq(&inode->ordered_tree.lock); 8645 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8646 ordered->truncated_len = min(ordered->truncated_len, 8647 cur - ordered->file_offset); 8648 spin_unlock_irq(&inode->ordered_tree.lock); 8649 8650 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8651 cur, range_end + 1 - cur)) { 8652 btrfs_finish_ordered_io(ordered); 8653 /* 8654 * The ordered extent has finished, now we're again 8655 * safe to delete all extent states of the range. 8656 */ 8657 delete_states = true; 8658 } else { 8659 /* 8660 * btrfs_finish_ordered_io() will get executed by endio 8661 * of other pages, thus we can't delete extent states 8662 * anymore 8663 */ 8664 delete_states = false; 8665 } 8666 next: 8667 if (ordered) 8668 btrfs_put_ordered_extent(ordered); 8669 /* 8670 * Qgroup reserved space handler 8671 * Sector(s) here will be either: 8672 * 8673 * 1) Already written to disk or bio already finished 8674 * Then its QGROUP_RESERVED bit in io_tree is already cleared. 8675 * Qgroup will be handled by its qgroup_record then. 8676 * btrfs_qgroup_free_data() call will do nothing here. 8677 * 8678 * 2) Not written to disk yet 8679 * Then btrfs_qgroup_free_data() call will clear the 8680 * QGROUP_RESERVED bit of its io_tree, and free the qgroup 8681 * reserved data space. 8682 * Since the IO will never happen for this page. 8683 */ 8684 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); 8685 if (!inode_evicting) { 8686 clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | 8687 EXTENT_DELALLOC | EXTENT_UPTODATE | 8688 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 8689 delete_states, &cached_state); 8690 } 8691 cur = range_end + 1; 8692 } 8693 /* 8694 * We have iterated through all ordered extents of the page, the page 8695 * should not have Ordered (Private2) anymore, or the above iteration 8696 * did something wrong. 8697 */ 8698 ASSERT(!PageOrdered(page)); 8699 if (!inode_evicting) 8700 __btrfs_releasepage(page, GFP_NOFS); 8701 ClearPageChecked(page); 8702 clear_page_extent_mapped(page); 8703 } 8704 8705 /* 8706 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 8707 * called from a page fault handler when a page is first dirtied. Hence we must 8708 * be careful to check for EOF conditions here. We set the page up correctly 8709 * for a written page which means we get ENOSPC checking when writing into 8710 * holes and correct delalloc and unwritten extent mapping on filesystems that 8711 * support these features. 8712 * 8713 * We are not allowed to take the i_mutex here so we have to play games to 8714 * protect against truncate races as the page could now be beyond EOF. Because 8715 * truncate_setsize() writes the inode size before removing pages, once we have 8716 * the page lock we can determine safely if the page is beyond EOF. If it is not 8717 * beyond EOF, then the page is guaranteed safe against truncation until we 8718 * unlock the page. 
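 *
 * Note the locking order used below: the inode's i_mmap_lock is taken
 * shared first, then the page lock, then the extent range lock. On success
 * the page is handed back to the VM still locked (VM_FAULT_LOCKED).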
8719 */ 8720 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 8721 { 8722 struct page *page = vmf->page; 8723 struct inode *inode = file_inode(vmf->vma->vm_file); 8724 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8725 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8726 struct btrfs_ordered_extent *ordered; 8727 struct extent_state *cached_state = NULL; 8728 struct extent_changeset *data_reserved = NULL; 8729 unsigned long zero_start; 8730 loff_t size; 8731 vm_fault_t ret; 8732 int ret2; 8733 int reserved = 0; 8734 u64 reserved_space; 8735 u64 page_start; 8736 u64 page_end; 8737 u64 end; 8738 8739 reserved_space = PAGE_SIZE; 8740 8741 sb_start_pagefault(inode->i_sb); 8742 page_start = page_offset(page); 8743 page_end = page_start + PAGE_SIZE - 1; 8744 end = page_end; 8745 8746 /* 8747 * Reserving delalloc space after obtaining the page lock can lead to 8748 * deadlock. For example, if a dirty page is locked by this function 8749 * and the call to btrfs_delalloc_reserve_space() ends up triggering 8750 * dirty page write out, then the btrfs_writepage() function could 8751 * end up waiting indefinitely to get a lock on the page currently 8752 * being processed by btrfs_page_mkwrite() function. 8753 */ 8754 ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 8755 page_start, reserved_space); 8756 if (!ret2) { 8757 ret2 = file_update_time(vmf->vma->vm_file); 8758 reserved = 1; 8759 } 8760 if (ret2) { 8761 ret = vmf_error(ret2); 8762 if (reserved) 8763 goto out; 8764 goto out_noreserve; 8765 } 8766 8767 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 8768 again: 8769 down_read(&BTRFS_I(inode)->i_mmap_lock); 8770 lock_page(page); 8771 size = i_size_read(inode); 8772 8773 if ((page->mapping != inode->i_mapping) || 8774 (page_start >= size)) { 8775 /* page got truncated out from underneath us */ 8776 goto out_unlock; 8777 } 8778 wait_on_page_writeback(page); 8779 8780 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 8781 ret2 = set_page_extent_mapped(page); 8782 if (ret2 < 0) { 8783 ret = vmf_error(ret2); 8784 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8785 goto out_unlock; 8786 } 8787 8788 /* 8789 * we can't set the delalloc bits if there are pending ordered 8790 * extents. Drop our locks and wait for them to finish 8791 */ 8792 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 8793 PAGE_SIZE); 8794 if (ordered) { 8795 unlock_extent_cached(io_tree, page_start, page_end, 8796 &cached_state); 8797 unlock_page(page); 8798 up_read(&BTRFS_I(inode)->i_mmap_lock); 8799 btrfs_start_ordered_extent(ordered, 1); 8800 btrfs_put_ordered_extent(ordered); 8801 goto again; 8802 } 8803 8804 if (page->index == ((size - 1) >> PAGE_SHIFT)) { 8805 reserved_space = round_up(size - page_start, 8806 fs_info->sectorsize); 8807 if (reserved_space < PAGE_SIZE) { 8808 end = page_start + reserved_space - 1; 8809 btrfs_delalloc_release_space(BTRFS_I(inode), 8810 data_reserved, page_start, 8811 PAGE_SIZE - reserved_space, true); 8812 } 8813 } 8814 8815 /* 8816 * page_mkwrite gets called when the page is firstly dirtied after it's 8817 * faulted in, but write(2) could also dirty a page and set delalloc 8818 * bits, thus in this case for space account reason, we still need to 8819 * clear any delalloc bits within this page range since we have to 8820 * reserve data&meta space before lock_page() (see above comments). 
8821 */ 8822 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 8823 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 8824 EXTENT_DEFRAG, 0, 0, &cached_state); 8825 8826 ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, 8827 &cached_state); 8828 if (ret2) { 8829 unlock_extent_cached(io_tree, page_start, page_end, 8830 &cached_state); 8831 ret = VM_FAULT_SIGBUS; 8832 goto out_unlock; 8833 } 8834 8835 /* page is wholly or partially inside EOF */ 8836 if (page_start + PAGE_SIZE > size) 8837 zero_start = offset_in_page(size); 8838 else 8839 zero_start = PAGE_SIZE; 8840 8841 if (zero_start != PAGE_SIZE) { 8842 memzero_page(page, zero_start, PAGE_SIZE - zero_start); 8843 flush_dcache_page(page); 8844 } 8845 ClearPageChecked(page); 8846 btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); 8847 btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); 8848 8849 btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); 8850 8851 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8852 up_read(&BTRFS_I(inode)->i_mmap_lock); 8853 8854 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8855 sb_end_pagefault(inode->i_sb); 8856 extent_changeset_free(data_reserved); 8857 return VM_FAULT_LOCKED; 8858 8859 out_unlock: 8860 unlock_page(page); 8861 up_read(&BTRFS_I(inode)->i_mmap_lock); 8862 out: 8863 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8864 btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, 8865 reserved_space, (ret != 0)); 8866 out_noreserve: 8867 sb_end_pagefault(inode->i_sb); 8868 extent_changeset_free(data_reserved); 8869 return ret; 8870 } 8871 8872 static int btrfs_truncate(struct inode *inode, bool skip_writeback) 8873 { 8874 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8875 struct btrfs_root *root = BTRFS_I(inode)->root; 8876 struct btrfs_block_rsv *rsv; 8877 int ret; 8878 struct btrfs_trans_handle *trans; 8879 u64 mask = fs_info->sectorsize - 1; 8880 u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 8881 u64 extents_found = 0; 8882 8883 if (!skip_writeback) { 8884 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), 8885 (u64)-1); 8886 if (ret) 8887 return ret; 8888 } 8889 8890 /* 8891 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 8892 * things going on here: 8893 * 8894 * 1) We need to reserve space to update our inode. 8895 * 8896 * 2) We need to have something to cache all the space that is going to 8897 * be free'd up by the truncate operation, but also have some slack 8898 * space reserved in case it uses space during the truncate (thank you 8899 * very much snapshotting). 8900 * 8901 * And we need these to be separate. The fact is we can use a lot of 8902 * space doing the truncate, and we have no earthly idea how much space 8903 * we will use, so we need the truncate reservation to be separate so it 8904 * doesn't end up using space reserved for updating the inode. We also 8905 * need to be able to stop the transaction and start a new one, which 8906 * means we need to be able to update the inode several times, and we 8907 * have no idea of knowing how many times that will be, so we can't just 8908 * reserve 1 item for the entirety of the operation, so that has to be 8909 * done separately as well. 8910 * 8911 * So that leaves us with 8912 * 8913 * 1) rsv - for the truncate reservation, which we will steal from the 8914 * transaction reservation. 
8915 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for 8916 * updating the inode. 8917 */ 8918 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 8919 if (!rsv) 8920 return -ENOMEM; 8921 rsv->size = min_size; 8922 rsv->failfast = 1; 8923 8924 /* 8925 * 1 for the truncate slack space 8926 * 1 for updating the inode. 8927 */ 8928 trans = btrfs_start_transaction(root, 2); 8929 if (IS_ERR(trans)) { 8930 ret = PTR_ERR(trans); 8931 goto out; 8932 } 8933 8934 /* Migrate the slack space for the truncate to our reserve */ 8935 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 8936 min_size, false); 8937 BUG_ON(ret); 8938 8939 trans->block_rsv = rsv; 8940 8941 while (1) { 8942 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 8943 inode->i_size, 8944 BTRFS_EXTENT_DATA_KEY, 8945 &extents_found); 8946 trans->block_rsv = &fs_info->trans_block_rsv; 8947 if (ret != -ENOSPC && ret != -EAGAIN) 8948 break; 8949 8950 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8951 if (ret) 8952 break; 8953 8954 btrfs_end_transaction(trans); 8955 btrfs_btree_balance_dirty(fs_info); 8956 8957 trans = btrfs_start_transaction(root, 2); 8958 if (IS_ERR(trans)) { 8959 ret = PTR_ERR(trans); 8960 trans = NULL; 8961 break; 8962 } 8963 8964 btrfs_block_rsv_release(fs_info, rsv, -1, NULL); 8965 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 8966 rsv, min_size, false); 8967 BUG_ON(ret); /* shouldn't happen */ 8968 trans->block_rsv = rsv; 8969 } 8970 8971 /* 8972 * We can't call btrfs_truncate_block inside a trans handle as we could 8973 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know 8974 * we've truncated everything except the last little bit, and can do 8975 * btrfs_truncate_block and then update the disk_i_size. 8976 */ 8977 if (ret == NEED_TRUNCATE_BLOCK) { 8978 btrfs_end_transaction(trans); 8979 btrfs_btree_balance_dirty(fs_info); 8980 8981 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); 8982 if (ret) 8983 goto out; 8984 trans = btrfs_start_transaction(root, 1); 8985 if (IS_ERR(trans)) { 8986 ret = PTR_ERR(trans); 8987 goto out; 8988 } 8989 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 8990 } 8991 8992 if (trans) { 8993 int ret2; 8994 8995 trans->block_rsv = &fs_info->trans_block_rsv; 8996 ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8997 if (ret2 && !ret) 8998 ret = ret2; 8999 9000 ret2 = btrfs_end_transaction(trans); 9001 if (ret2 && !ret) 9002 ret = ret2; 9003 btrfs_btree_balance_dirty(fs_info); 9004 } 9005 out: 9006 btrfs_free_block_rsv(fs_info, rsv); 9007 /* 9008 * So if we truncate and then write and fsync we normally would just 9009 * write the extents that changed, which is a problem if we need to 9010 * first truncate that entire inode. So set this flag so we write out 9011 * all of the extents in the inode to the sync log so we're completely 9012 * safe. 9013 * 9014 * If no extents were dropped or trimmed we don't need to force the next 9015 * fsync to truncate all the inode's items from the log and re-log them 9016 * all. This means the truncate operation did not change the file size, 9017 * or changed it to a smaller size but there was only an implicit hole 9018 * between the old i_size and the new i_size, and there were no prealloc 9019 * extents beyond i_size to drop. 9020 */ 9021 if (extents_found > 0) 9022 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 9023 9024 return ret; 9025 } 9026 9027 /* 9028 * create a new subvolume directory/inode (helper for the ioctl). 
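 * The new root gets a single empty directory inode that becomes the top
 * level of the subvolume; the transaction passed in is left for the caller
 * to commit or abort.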
9029 */ 9030 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 9031 struct btrfs_root *new_root, 9032 struct btrfs_root *parent_root, 9033 struct user_namespace *mnt_userns) 9034 { 9035 struct inode *inode; 9036 int err; 9037 u64 index = 0; 9038 u64 ino; 9039 9040 err = btrfs_get_free_objectid(new_root, &ino); 9041 if (err < 0) 9042 return err; 9043 9044 inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, 9045 ino, ino, 9046 S_IFDIR | (~current_umask() & S_IRWXUGO), 9047 &index); 9048 if (IS_ERR(inode)) 9049 return PTR_ERR(inode); 9050 inode->i_op = &btrfs_dir_inode_operations; 9051 inode->i_fop = &btrfs_dir_file_operations; 9052 9053 set_nlink(inode, 1); 9054 btrfs_i_size_write(BTRFS_I(inode), 0); 9055 unlock_new_inode(inode); 9056 9057 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 9058 if (err) 9059 btrfs_err(new_root->fs_info, 9060 "error inheriting subvolume %llu properties: %d", 9061 new_root->root_key.objectid, err); 9062 9063 err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); 9064 9065 iput(inode); 9066 return err; 9067 } 9068 9069 struct inode *btrfs_alloc_inode(struct super_block *sb) 9070 { 9071 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 9072 struct btrfs_inode *ei; 9073 struct inode *inode; 9074 9075 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); 9076 if (!ei) 9077 return NULL; 9078 9079 ei->root = NULL; 9080 ei->generation = 0; 9081 ei->last_trans = 0; 9082 ei->last_sub_trans = 0; 9083 ei->logged_trans = 0; 9084 ei->delalloc_bytes = 0; 9085 ei->new_delalloc_bytes = 0; 9086 ei->defrag_bytes = 0; 9087 ei->disk_i_size = 0; 9088 ei->flags = 0; 9089 ei->ro_flags = 0; 9090 ei->csum_bytes = 0; 9091 ei->index_cnt = (u64)-1; 9092 ei->dir_index = 0; 9093 ei->last_unlink_trans = 0; 9094 ei->last_reflink_trans = 0; 9095 ei->last_log_commit = 0; 9096 9097 spin_lock_init(&ei->lock); 9098 ei->outstanding_extents = 0; 9099 if (sb->s_magic != BTRFS_TEST_MAGIC) 9100 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 9101 BTRFS_BLOCK_RSV_DELALLOC); 9102 ei->runtime_flags = 0; 9103 ei->prop_compress = BTRFS_COMPRESS_NONE; 9104 ei->defrag_compress = BTRFS_COMPRESS_NONE; 9105 9106 ei->delayed_node = NULL; 9107 9108 ei->i_otime.tv_sec = 0; 9109 ei->i_otime.tv_nsec = 0; 9110 9111 inode = &ei->vfs_inode; 9112 extent_map_tree_init(&ei->extent_tree); 9113 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); 9114 extent_io_tree_init(fs_info, &ei->io_failure_tree, 9115 IO_TREE_INODE_IO_FAILURE, inode); 9116 extent_io_tree_init(fs_info, &ei->file_extent_tree, 9117 IO_TREE_INODE_FILE_EXTENT, inode); 9118 ei->io_tree.track_uptodate = true; 9119 ei->io_failure_tree.track_uptodate = true; 9120 atomic_set(&ei->sync_writers, 0); 9121 mutex_init(&ei->log_mutex); 9122 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 9123 INIT_LIST_HEAD(&ei->delalloc_inodes); 9124 INIT_LIST_HEAD(&ei->delayed_iput); 9125 RB_CLEAR_NODE(&ei->rb_node); 9126 init_rwsem(&ei->i_mmap_lock); 9127 9128 return inode; 9129 } 9130 9131 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 9132 void btrfs_test_destroy_inode(struct inode *inode) 9133 { 9134 btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); 9135 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9136 } 9137 #endif 9138 9139 void btrfs_free_inode(struct inode *inode) 9140 { 9141 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9142 } 9143 9144 void btrfs_destroy_inode(struct inode *vfs_inode) 9145 { 9146 struct btrfs_ordered_extent *ordered; 9147 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 9148 
	struct btrfs_root *root = inode->root;

	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
	WARN_ON(vfs_inode->i_data.nrpages);
	WARN_ON(inode->block_rsv.reserved);
	WARN_ON(inode->block_rsv.size);
	WARN_ON(inode->outstanding_extents);
	WARN_ON(inode->delalloc_bytes);
	WARN_ON(inode->new_delalloc_bytes);
	WARN_ON(inode->csum_bytes);
	WARN_ON(inode->defrag_bytes);

	/*
	 * This can happen when we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		return;

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			btrfs_err(root->fs_info,
				  "found ordered extent %llu %llu on inode cleanup",
				  ordered->file_offset, ordered->num_bytes);
			btrfs_remove_ordered_extent(inode, ordered);
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	btrfs_qgroup_check_reserved_leak(inode);
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
	btrfs_put_root(inode->root);
}

int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (root == NULL)
		return 1;

	/* The snapshot/subvolume tree is being deleted. */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

void __cold btrfs_destroy_cachep(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the caches.
9214 */ 9215 rcu_barrier(); 9216 kmem_cache_destroy(btrfs_inode_cachep); 9217 kmem_cache_destroy(btrfs_trans_handle_cachep); 9218 kmem_cache_destroy(btrfs_path_cachep); 9219 kmem_cache_destroy(btrfs_free_space_cachep); 9220 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 9221 } 9222 9223 int __init btrfs_init_cachep(void) 9224 { 9225 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 9226 sizeof(struct btrfs_inode), 0, 9227 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, 9228 init_once); 9229 if (!btrfs_inode_cachep) 9230 goto fail; 9231 9232 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 9233 sizeof(struct btrfs_trans_handle), 0, 9234 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 9235 if (!btrfs_trans_handle_cachep) 9236 goto fail; 9237 9238 btrfs_path_cachep = kmem_cache_create("btrfs_path", 9239 sizeof(struct btrfs_path), 0, 9240 SLAB_MEM_SPREAD, NULL); 9241 if (!btrfs_path_cachep) 9242 goto fail; 9243 9244 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 9245 sizeof(struct btrfs_free_space), 0, 9246 SLAB_MEM_SPREAD, NULL); 9247 if (!btrfs_free_space_cachep) 9248 goto fail; 9249 9250 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 9251 PAGE_SIZE, PAGE_SIZE, 9252 SLAB_MEM_SPREAD, NULL); 9253 if (!btrfs_free_space_bitmap_cachep) 9254 goto fail; 9255 9256 return 0; 9257 fail: 9258 btrfs_destroy_cachep(); 9259 return -ENOMEM; 9260 } 9261 9262 static int btrfs_getattr(struct user_namespace *mnt_userns, 9263 const struct path *path, struct kstat *stat, 9264 u32 request_mask, unsigned int flags) 9265 { 9266 u64 delalloc_bytes; 9267 u64 inode_bytes; 9268 struct inode *inode = d_inode(path->dentry); 9269 u32 blocksize = inode->i_sb->s_blocksize; 9270 u32 bi_flags = BTRFS_I(inode)->flags; 9271 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; 9272 9273 stat->result_mask |= STATX_BTIME; 9274 stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; 9275 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; 9276 if (bi_flags & BTRFS_INODE_APPEND) 9277 stat->attributes |= STATX_ATTR_APPEND; 9278 if (bi_flags & BTRFS_INODE_COMPRESS) 9279 stat->attributes |= STATX_ATTR_COMPRESSED; 9280 if (bi_flags & BTRFS_INODE_IMMUTABLE) 9281 stat->attributes |= STATX_ATTR_IMMUTABLE; 9282 if (bi_flags & BTRFS_INODE_NODUMP) 9283 stat->attributes |= STATX_ATTR_NODUMP; 9284 if (bi_ro_flags & BTRFS_INODE_RO_VERITY) 9285 stat->attributes |= STATX_ATTR_VERITY; 9286 9287 stat->attributes_mask |= (STATX_ATTR_APPEND | 9288 STATX_ATTR_COMPRESSED | 9289 STATX_ATTR_IMMUTABLE | 9290 STATX_ATTR_NODUMP); 9291 9292 generic_fillattr(mnt_userns, inode, stat); 9293 stat->dev = BTRFS_I(inode)->root->anon_dev; 9294 9295 spin_lock(&BTRFS_I(inode)->lock); 9296 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 9297 inode_bytes = inode_get_bytes(inode); 9298 spin_unlock(&BTRFS_I(inode)->lock); 9299 stat->blocks = (ALIGN(inode_bytes, blocksize) + 9300 ALIGN(delalloc_bytes, blocksize)) >> 9; 9301 return 0; 9302 } 9303 9304 static int btrfs_rename_exchange(struct inode *old_dir, 9305 struct dentry *old_dentry, 9306 struct inode *new_dir, 9307 struct dentry *new_dentry) 9308 { 9309 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9310 struct btrfs_trans_handle *trans; 9311 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9312 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9313 struct inode *new_inode = new_dentry->d_inode; 9314 struct inode *old_inode = old_dentry->d_inode; 9315 struct timespec64 ctime = current_time(old_inode); 9316 u64 old_ino = 
btrfs_ino(BTRFS_I(old_inode)); 9317 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 9318 u64 old_idx = 0; 9319 u64 new_idx = 0; 9320 int ret; 9321 int ret2; 9322 bool root_log_pinned = false; 9323 bool dest_log_pinned = false; 9324 bool need_abort = false; 9325 9326 /* 9327 * For non-subvolumes allow exchange only within one subvolume, in the 9328 * same inode namespace. Two subvolumes (represented as directories) can 9329 * be exchanged as they're a logical link and have a fixed inode number. 9330 */ 9331 if (root != dest && 9332 (old_ino != BTRFS_FIRST_FREE_OBJECTID || 9333 new_ino != BTRFS_FIRST_FREE_OBJECTID)) 9334 return -EXDEV; 9335 9336 /* close the race window with snapshot create/destroy ioctl */ 9337 if (old_ino == BTRFS_FIRST_FREE_OBJECTID || 9338 new_ino == BTRFS_FIRST_FREE_OBJECTID) 9339 down_read(&fs_info->subvol_sem); 9340 9341 /* 9342 * We want to reserve the absolute worst case amount of items. So if 9343 * both inodes are subvols and we need to unlink them then that would 9344 * require 4 item modifications, but if they are both normal inodes it 9345 * would require 5 item modifications, so we'll assume they are normal 9346 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items 9347 * should cover the worst case number of items we'll modify. 9348 */ 9349 trans = btrfs_start_transaction(root, 12); 9350 if (IS_ERR(trans)) { 9351 ret = PTR_ERR(trans); 9352 goto out_notrans; 9353 } 9354 9355 if (dest != root) { 9356 ret = btrfs_record_root_in_trans(trans, dest); 9357 if (ret) 9358 goto out_fail; 9359 } 9360 9361 /* 9362 * We need to find a free sequence number both in the source and 9363 * in the destination directory for the exchange. 9364 */ 9365 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 9366 if (ret) 9367 goto out_fail; 9368 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 9369 if (ret) 9370 goto out_fail; 9371 9372 BTRFS_I(old_inode)->dir_index = 0ULL; 9373 BTRFS_I(new_inode)->dir_index = 0ULL; 9374 9375 /* Reference for the source. */ 9376 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9377 /* force full log commit if subvolume involved. */ 9378 btrfs_set_log_full_commit(trans); 9379 } else { 9380 ret = btrfs_insert_inode_ref(trans, dest, 9381 new_dentry->d_name.name, 9382 new_dentry->d_name.len, 9383 old_ino, 9384 btrfs_ino(BTRFS_I(new_dir)), 9385 old_idx); 9386 if (ret) 9387 goto out_fail; 9388 need_abort = true; 9389 } 9390 9391 /* And now for the dest. */ 9392 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9393 /* force full log commit if subvolume involved. */ 9394 btrfs_set_log_full_commit(trans); 9395 } else { 9396 ret = btrfs_insert_inode_ref(trans, root, 9397 old_dentry->d_name.name, 9398 old_dentry->d_name.len, 9399 new_ino, 9400 btrfs_ino(BTRFS_I(old_dir)), 9401 new_idx); 9402 if (ret) { 9403 if (need_abort) 9404 btrfs_abort_transaction(trans, ret); 9405 goto out_fail; 9406 } 9407 } 9408 9409 /* Update inode version and ctime/mtime. */ 9410 inode_inc_iversion(old_dir); 9411 inode_inc_iversion(new_dir); 9412 inode_inc_iversion(old_inode); 9413 inode_inc_iversion(new_inode); 9414 old_dir->i_ctime = old_dir->i_mtime = ctime; 9415 new_dir->i_ctime = new_dir->i_mtime = ctime; 9416 old_inode->i_ctime = ctime; 9417 new_inode->i_ctime = ctime; 9418 9419 if (old_dentry->d_parent != new_dentry->d_parent) { 9420 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9421 BTRFS_I(old_inode), 1); 9422 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 9423 BTRFS_I(new_inode), 1); 9424 } 9425 9426 /* 9427 * Now pin the logs of the roots.
We do it to ensure that no other task 9428 * can sync the logs while we are in progress with the rename, because 9429 * that could result in an inconsistency in case any of the inodes that 9430 * are part of this rename operation were logged before. 9431 * 9432 * We pin the logs even if at this precise moment none of the inodes was 9433 * logged before. This is because right after we checked for that, some 9434 * other task fsyncing some other inode not involved with this rename 9435 * operation could log that one of our inodes exists. 9436 * 9437 * We don't need to pin the logs before the above calls to 9438 * btrfs_insert_inode_ref(), since those don't ever need to change a log. 9439 */ 9440 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 9441 btrfs_pin_log_trans(root); 9442 root_log_pinned = true; 9443 } 9444 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { 9445 btrfs_pin_log_trans(dest); 9446 dest_log_pinned = true; 9447 } 9448 9449 /* src is a subvolume */ 9450 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9451 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9452 } else { /* src is an inode */ 9453 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), 9454 BTRFS_I(old_dentry->d_inode), 9455 old_dentry->d_name.name, 9456 old_dentry->d_name.len); 9457 if (!ret) 9458 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9459 } 9460 if (ret) { 9461 btrfs_abort_transaction(trans, ret); 9462 goto out_fail; 9463 } 9464 9465 /* dest is a subvolume */ 9466 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9467 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 9468 } else { /* dest is an inode */ 9469 ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), 9470 BTRFS_I(new_dentry->d_inode), 9471 new_dentry->d_name.name, 9472 new_dentry->d_name.len); 9473 if (!ret) 9474 ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); 9475 } 9476 if (ret) { 9477 btrfs_abort_transaction(trans, ret); 9478 goto out_fail; 9479 } 9480 9481 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9482 new_dentry->d_name.name, 9483 new_dentry->d_name.len, 0, old_idx); 9484 if (ret) { 9485 btrfs_abort_transaction(trans, ret); 9486 goto out_fail; 9487 } 9488 9489 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 9490 old_dentry->d_name.name, 9491 old_dentry->d_name.len, 0, new_idx); 9492 if (ret) { 9493 btrfs_abort_transaction(trans, ret); 9494 goto out_fail; 9495 } 9496 9497 if (old_inode->i_nlink == 1) 9498 BTRFS_I(old_inode)->dir_index = old_idx; 9499 if (new_inode->i_nlink == 1) 9500 BTRFS_I(new_inode)->dir_index = new_idx; 9501 9502 if (root_log_pinned) { 9503 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9504 new_dentry->d_parent); 9505 btrfs_end_log_trans(root); 9506 root_log_pinned = false; 9507 } 9508 if (dest_log_pinned) { 9509 btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), 9510 old_dentry->d_parent); 9511 btrfs_end_log_trans(dest); 9512 dest_log_pinned = false; 9513 } 9514 out_fail: 9515 /* 9516 * If we have pinned a log and an error happened, we unpin tasks 9517 * trying to sync the log and force them to fallback to a transaction 9518 * commit if the log currently contains any of the inodes involved in 9519 * this rename operation (to ensure we do not persist a log with an 9520 * inconsistent state for any of these inodes or leading to any 9521 * inconsistencies when replayed). If the transaction was aborted, the 9522 * abortion reason is propagated to userspace when attempting to commit 9523 * the transaction. 
If the log does not contain any of these inodes, we 9524 * allow the tasks to sync it. 9525 */ 9526 if (ret && (root_log_pinned || dest_log_pinned)) { 9527 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9528 btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9529 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9530 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) 9531 btrfs_set_log_full_commit(trans); 9532 9533 if (root_log_pinned) { 9534 btrfs_end_log_trans(root); 9535 root_log_pinned = false; 9536 } 9537 if (dest_log_pinned) { 9538 btrfs_end_log_trans(dest); 9539 dest_log_pinned = false; 9540 } 9541 } 9542 ret2 = btrfs_end_transaction(trans); 9543 ret = ret ? ret : ret2; 9544 out_notrans: 9545 if (new_ino == BTRFS_FIRST_FREE_OBJECTID || 9546 old_ino == BTRFS_FIRST_FREE_OBJECTID) 9547 up_read(&fs_info->subvol_sem); 9548 9549 return ret; 9550 } 9551 9552 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, 9553 struct btrfs_root *root, 9554 struct user_namespace *mnt_userns, 9555 struct inode *dir, 9556 struct dentry *dentry) 9557 { 9558 int ret; 9559 struct inode *inode; 9560 u64 objectid; 9561 u64 index; 9562 9563 ret = btrfs_get_free_objectid(root, &objectid); 9564 if (ret) 9565 return ret; 9566 9567 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 9568 dentry->d_name.name, 9569 dentry->d_name.len, 9570 btrfs_ino(BTRFS_I(dir)), 9571 objectid, 9572 S_IFCHR | WHITEOUT_MODE, 9573 &index); 9574 9575 if (IS_ERR(inode)) { 9576 ret = PTR_ERR(inode); 9577 return ret; 9578 } 9579 9580 inode->i_op = &btrfs_special_inode_operations; 9581 init_special_inode(inode, inode->i_mode, 9582 WHITEOUT_DEV); 9583 9584 ret = btrfs_init_inode_security(trans, inode, dir, 9585 &dentry->d_name); 9586 if (ret) 9587 goto out; 9588 9589 ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9590 BTRFS_I(inode), 0, index); 9591 if (ret) 9592 goto out; 9593 9594 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9595 out: 9596 unlock_new_inode(inode); 9597 if (ret) 9598 inode_dec_link_count(inode); 9599 iput(inode); 9600 9601 return ret; 9602 } 9603 9604 static int btrfs_rename(struct user_namespace *mnt_userns, 9605 struct inode *old_dir, struct dentry *old_dentry, 9606 struct inode *new_dir, struct dentry *new_dentry, 9607 unsigned int flags) 9608 { 9609 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9610 struct btrfs_trans_handle *trans; 9611 unsigned int trans_num_items; 9612 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9613 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9614 struct inode *new_inode = d_inode(new_dentry); 9615 struct inode *old_inode = d_inode(old_dentry); 9616 u64 index = 0; 9617 int ret; 9618 int ret2; 9619 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9620 bool log_pinned = false; 9621 9622 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9623 return -EPERM; 9624 9625 /* we only allow rename subvolume link between subvolumes */ 9626 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 9627 return -EXDEV; 9628 9629 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 9630 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) 9631 return -ENOTEMPTY; 9632 9633 if (S_ISDIR(old_inode->i_mode) && new_inode && 9634 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 9635 return -ENOTEMPTY; 9636 9637 9638 /* check for collisions, even if the name isn't there */ 9639 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, 9640 new_dentry->d_name.name, 9641 
new_dentry->d_name.len); 9642 9643 if (ret) { 9644 if (ret == -EEXIST) { 9645 /* we shouldn't get 9646 * eexist without a new_inode */ 9647 if (WARN_ON(!new_inode)) { 9648 return ret; 9649 } 9650 } else { 9651 /* maybe -EOVERFLOW */ 9652 return ret; 9653 } 9654 } 9655 ret = 0; 9656 9657 /* 9658 * we're using rename to replace one file with another. Start IO on it 9659 * now so we don't add too much work to the end of the transaction 9660 */ 9661 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 9662 filemap_flush(old_inode->i_mapping); 9663 9664 /* close the race window with snapshot create/destroy ioctl */ 9665 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9666 down_read(&fs_info->subvol_sem); 9667 /* 9668 * We want to reserve the absolute worst case amount of items. So if 9669 * both inodes are subvols and we need to unlink them then that would 9670 * require 4 item modifications, but if they are both normal inodes it 9671 * would require 5 item modifications, so we'll assume they are normal 9672 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 9673 * should cover the worst case number of items we'll modify. 9674 * If our rename has the whiteout flag, we need 5 more units for the 9675 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item 9676 * when selinux is enabled). 9677 */ 9678 trans_num_items = 11; 9679 if (flags & RENAME_WHITEOUT) 9680 trans_num_items += 5; 9681 trans = btrfs_start_transaction(root, trans_num_items); 9682 if (IS_ERR(trans)) { 9683 ret = PTR_ERR(trans); 9684 goto out_notrans; 9685 } 9686 9687 if (dest != root) { 9688 ret = btrfs_record_root_in_trans(trans, dest); 9689 if (ret) 9690 goto out_fail; 9691 } 9692 9693 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); 9694 if (ret) 9695 goto out_fail; 9696 9697 BTRFS_I(old_inode)->dir_index = 0ULL; 9698 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9699 /* force full log commit if subvolume involved. */ 9700 btrfs_set_log_full_commit(trans); 9701 } else { 9702 ret = btrfs_insert_inode_ref(trans, dest, 9703 new_dentry->d_name.name, 9704 new_dentry->d_name.len, 9705 old_ino, 9706 btrfs_ino(BTRFS_I(new_dir)), index); 9707 if (ret) 9708 goto out_fail; 9709 } 9710 9711 inode_inc_iversion(old_dir); 9712 inode_inc_iversion(new_dir); 9713 inode_inc_iversion(old_inode); 9714 old_dir->i_ctime = old_dir->i_mtime = 9715 new_dir->i_ctime = new_dir->i_mtime = 9716 old_inode->i_ctime = current_time(old_dir); 9717 9718 if (old_dentry->d_parent != new_dentry->d_parent) 9719 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9720 BTRFS_I(old_inode), 1); 9721 9722 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9723 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9724 } else { 9725 /* 9726 * Now pin the log. We do it to ensure that no other task can 9727 * sync the log while we are in progress with the rename, as 9728 * that could result in an inconsistency in case any of the 9729 * inodes that are part of this rename operation were logged 9730 * before. 9731 * 9732 * We pin the log even if at this precise moment none of the 9733 * inodes was logged before. This is because right after we 9734 * checked for that, some other task fsyncing some other inode 9735 * not involved with this rename operation could log that one of 9736 * our inodes exists. 9737 * 9738 * We don't need to pin the logs before the above call to 9739 * btrfs_insert_inode_ref(), since that does not need to change 9740 * a log.
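*
* The pin is dropped again with btrfs_end_log_trans() once the new name
* has been logged via btrfs_log_new_name() further below, or in the
* error handling at out_fail.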
9741 */ 9742 btrfs_pin_log_trans(root); 9743 log_pinned = true; 9744 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), 9745 BTRFS_I(d_inode(old_dentry)), 9746 old_dentry->d_name.name, 9747 old_dentry->d_name.len); 9748 if (!ret) 9749 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9750 } 9751 if (ret) { 9752 btrfs_abort_transaction(trans, ret); 9753 goto out_fail; 9754 } 9755 9756 if (new_inode) { 9757 inode_inc_iversion(new_inode); 9758 new_inode->i_ctime = current_time(new_inode); 9759 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == 9760 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9761 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 9762 BUG_ON(new_inode->i_nlink == 0); 9763 } else { 9764 ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), 9765 BTRFS_I(d_inode(new_dentry)), 9766 new_dentry->d_name.name, 9767 new_dentry->d_name.len); 9768 } 9769 if (!ret && new_inode->i_nlink == 0) 9770 ret = btrfs_orphan_add(trans, 9771 BTRFS_I(d_inode(new_dentry))); 9772 if (ret) { 9773 btrfs_abort_transaction(trans, ret); 9774 goto out_fail; 9775 } 9776 } 9777 9778 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9779 new_dentry->d_name.name, 9780 new_dentry->d_name.len, 0, index); 9781 if (ret) { 9782 btrfs_abort_transaction(trans, ret); 9783 goto out_fail; 9784 } 9785 9786 if (old_inode->i_nlink == 1) 9787 BTRFS_I(old_inode)->dir_index = index; 9788 9789 if (log_pinned) { 9790 btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9791 new_dentry->d_parent); 9792 btrfs_end_log_trans(root); 9793 log_pinned = false; 9794 } 9795 9796 if (flags & RENAME_WHITEOUT) { 9797 ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, 9798 old_dir, old_dentry); 9799 9800 if (ret) { 9801 btrfs_abort_transaction(trans, ret); 9802 goto out_fail; 9803 } 9804 } 9805 out_fail: 9806 /* 9807 * If we have pinned the log and an error happened, we unpin tasks 9808 * trying to sync the log and force them to fallback to a transaction 9809 * commit if the log currently contains any of the inodes involved in 9810 * this rename operation (to ensure we do not persist a log with an 9811 * inconsistent state for any of these inodes or leading to any 9812 * inconsistencies when replayed). If the transaction was aborted, the 9813 * abortion reason is propagated to userspace when attempting to commit 9814 * the transaction. If the log does not contain any of these inodes, we 9815 * allow the tasks to sync it. 9816 */ 9817 if (ret && log_pinned) { 9818 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9819 btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9820 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9821 (new_inode && 9822 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) 9823 btrfs_set_log_full_commit(trans); 9824 9825 btrfs_end_log_trans(root); 9826 log_pinned = false; 9827 } 9828 ret2 = btrfs_end_transaction(trans); 9829 ret = ret ? 
ret : ret2; 9830 out_notrans: 9831 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9832 up_read(&fs_info->subvol_sem); 9833 9834 return ret; 9835 } 9836 9837 static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, 9838 struct dentry *old_dentry, struct inode *new_dir, 9839 struct dentry *new_dentry, unsigned int flags) 9840 { 9841 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 9842 return -EINVAL; 9843 9844 if (flags & RENAME_EXCHANGE) 9845 return btrfs_rename_exchange(old_dir, old_dentry, new_dir, 9846 new_dentry); 9847 9848 return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, 9849 new_dentry, flags); 9850 } 9851 9852 struct btrfs_delalloc_work { 9853 struct inode *inode; 9854 struct completion completion; 9855 struct list_head list; 9856 struct btrfs_work work; 9857 }; 9858 9859 static void btrfs_run_delalloc_work(struct btrfs_work *work) 9860 { 9861 struct btrfs_delalloc_work *delalloc_work; 9862 struct inode *inode; 9863 9864 delalloc_work = container_of(work, struct btrfs_delalloc_work, 9865 work); 9866 inode = delalloc_work->inode; 9867 filemap_flush(inode->i_mapping); 9868 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 9869 &BTRFS_I(inode)->runtime_flags)) 9870 filemap_flush(inode->i_mapping); 9871 9872 iput(inode); 9873 complete(&delalloc_work->completion); 9874 } 9875 9876 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) 9877 { 9878 struct btrfs_delalloc_work *work; 9879 9880 work = kmalloc(sizeof(*work), GFP_NOFS); 9881 if (!work) 9882 return NULL; 9883 9884 init_completion(&work->completion); 9885 INIT_LIST_HEAD(&work->list); 9886 work->inode = inode; 9887 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 9888 9889 return work; 9890 } 9891 9892 /* 9893 * some fairly slow code that needs optimization. This walks the list 9894 * of all the inodes with pending delalloc and forces them to disk. 
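*
* When wbc->nr_to_write is LONG_MAX every inode gets its own
* btrfs_delalloc_work item queued on the flush_workers workqueue;
* otherwise writeback is issued directly with filemap_fdatawrite_wbc()
* until the nr_to_write budget is used up.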
9895 */ 9896 static int start_delalloc_inodes(struct btrfs_root *root, 9897 struct writeback_control *wbc, bool snapshot, 9898 bool in_reclaim_context) 9899 { 9900 struct btrfs_inode *binode; 9901 struct inode *inode; 9902 struct btrfs_delalloc_work *work, *next; 9903 struct list_head works; 9904 struct list_head splice; 9905 int ret = 0; 9906 bool full_flush = wbc->nr_to_write == LONG_MAX; 9907 9908 INIT_LIST_HEAD(&works); 9909 INIT_LIST_HEAD(&splice); 9910 9911 mutex_lock(&root->delalloc_mutex); 9912 spin_lock(&root->delalloc_lock); 9913 list_splice_init(&root->delalloc_inodes, &splice); 9914 while (!list_empty(&splice)) { 9915 binode = list_entry(splice.next, struct btrfs_inode, 9916 delalloc_inodes); 9917 9918 list_move_tail(&binode->delalloc_inodes, 9919 &root->delalloc_inodes); 9920 9921 if (in_reclaim_context && 9922 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) 9923 continue; 9924 9925 inode = igrab(&binode->vfs_inode); 9926 if (!inode) { 9927 cond_resched_lock(&root->delalloc_lock); 9928 continue; 9929 } 9930 spin_unlock(&root->delalloc_lock); 9931 9932 if (snapshot) 9933 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 9934 &binode->runtime_flags); 9935 if (full_flush) { 9936 work = btrfs_alloc_delalloc_work(inode); 9937 if (!work) { 9938 iput(inode); 9939 ret = -ENOMEM; 9940 goto out; 9941 } 9942 list_add_tail(&work->list, &works); 9943 btrfs_queue_work(root->fs_info->flush_workers, 9944 &work->work); 9945 } else { 9946 ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); 9947 btrfs_add_delayed_iput(inode); 9948 if (ret || wbc->nr_to_write <= 0) 9949 goto out; 9950 } 9951 cond_resched(); 9952 spin_lock(&root->delalloc_lock); 9953 } 9954 spin_unlock(&root->delalloc_lock); 9955 9956 out: 9957 list_for_each_entry_safe(work, next, &works, list) { 9958 list_del_init(&work->list); 9959 wait_for_completion(&work->completion); 9960 kfree(work); 9961 } 9962 9963 if (!list_empty(&splice)) { 9964 spin_lock(&root->delalloc_lock); 9965 list_splice_tail(&splice, &root->delalloc_inodes); 9966 spin_unlock(&root->delalloc_lock); 9967 } 9968 mutex_unlock(&root->delalloc_mutex); 9969 return ret; 9970 } 9971 9972 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) 9973 { 9974 struct writeback_control wbc = { 9975 .nr_to_write = LONG_MAX, 9976 .sync_mode = WB_SYNC_NONE, 9977 .range_start = 0, 9978 .range_end = LLONG_MAX, 9979 }; 9980 struct btrfs_fs_info *fs_info = root->fs_info; 9981 9982 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 9983 return -EROFS; 9984 9985 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); 9986 } 9987 9988 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, 9989 bool in_reclaim_context) 9990 { 9991 struct writeback_control wbc = { 9992 .nr_to_write = nr, 9993 .sync_mode = WB_SYNC_NONE, 9994 .range_start = 0, 9995 .range_end = LLONG_MAX, 9996 }; 9997 struct btrfs_root *root; 9998 struct list_head splice; 9999 int ret; 10000 10001 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 10002 return -EROFS; 10003 10004 INIT_LIST_HEAD(&splice); 10005 10006 mutex_lock(&fs_info->delalloc_root_mutex); 10007 spin_lock(&fs_info->delalloc_root_lock); 10008 list_splice_init(&fs_info->delalloc_roots, &splice); 10009 while (!list_empty(&splice)) { 10010 /* 10011 * Reset nr_to_write here so we know that we're doing a full 10012 * flush. 
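*
* The same wbc is reused for every root on the list, so restore the
* LONG_MAX budget before each start_delalloc_inodes() call when a full
* flush was requested.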
10013 */ 10014 if (nr == LONG_MAX) 10015 wbc.nr_to_write = LONG_MAX; 10016 10017 root = list_first_entry(&splice, struct btrfs_root, 10018 delalloc_root); 10019 root = btrfs_grab_root(root); 10020 BUG_ON(!root); 10021 list_move_tail(&root->delalloc_root, 10022 &fs_info->delalloc_roots); 10023 spin_unlock(&fs_info->delalloc_root_lock); 10024 10025 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); 10026 btrfs_put_root(root); 10027 if (ret < 0 || wbc.nr_to_write <= 0) 10028 goto out; 10029 spin_lock(&fs_info->delalloc_root_lock); 10030 } 10031 spin_unlock(&fs_info->delalloc_root_lock); 10032 10033 ret = 0; 10034 out: 10035 if (!list_empty(&splice)) { 10036 spin_lock(&fs_info->delalloc_root_lock); 10037 list_splice_tail(&splice, &fs_info->delalloc_roots); 10038 spin_unlock(&fs_info->delalloc_root_lock); 10039 } 10040 mutex_unlock(&fs_info->delalloc_root_mutex); 10041 return ret; 10042 } 10043 10044 static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, 10045 struct dentry *dentry, const char *symname) 10046 { 10047 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 10048 struct btrfs_trans_handle *trans; 10049 struct btrfs_root *root = BTRFS_I(dir)->root; 10050 struct btrfs_path *path; 10051 struct btrfs_key key; 10052 struct inode *inode = NULL; 10053 int err; 10054 u64 objectid; 10055 u64 index = 0; 10056 int name_len; 10057 int datasize; 10058 unsigned long ptr; 10059 struct btrfs_file_extent_item *ei; 10060 struct extent_buffer *leaf; 10061 10062 name_len = strlen(symname); 10063 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 10064 return -ENAMETOOLONG; 10065 10066 /* 10067 * 2 items for inode item and ref 10068 * 2 items for dir items 10069 * 1 item for updating parent inode item 10070 * 1 item for the inline extent item 10071 * 1 item for xattr if selinux is on 10072 */ 10073 trans = btrfs_start_transaction(root, 7); 10074 if (IS_ERR(trans)) 10075 return PTR_ERR(trans); 10076 10077 err = btrfs_get_free_objectid(root, &objectid); 10078 if (err) 10079 goto out_unlock; 10080 10081 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 10082 dentry->d_name.name, dentry->d_name.len, 10083 btrfs_ino(BTRFS_I(dir)), objectid, 10084 S_IFLNK | S_IRWXUGO, &index); 10085 if (IS_ERR(inode)) { 10086 err = PTR_ERR(inode); 10087 inode = NULL; 10088 goto out_unlock; 10089 } 10090 10091 /* 10092 * If the active LSM wants to access the inode during 10093 * d_instantiate it needs these. Smack checks to see 10094 * if the filesystem supports xattrs by looking at the 10095 * ops vector. 
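*
* That is why i_fop, i_op and the address space ops are assigned right
* below, before btrfs_init_inode_security() and d_instantiate_new() are
* called.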
10096 */ 10097 inode->i_fop = &btrfs_file_operations; 10098 inode->i_op = &btrfs_file_inode_operations; 10099 inode->i_mapping->a_ops = &btrfs_aops; 10100 10101 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 10102 if (err) 10103 goto out_unlock; 10104 10105 path = btrfs_alloc_path(); 10106 if (!path) { 10107 err = -ENOMEM; 10108 goto out_unlock; 10109 } 10110 key.objectid = btrfs_ino(BTRFS_I(inode)); 10111 key.offset = 0; 10112 key.type = BTRFS_EXTENT_DATA_KEY; 10113 datasize = btrfs_file_extent_calc_inline_size(name_len); 10114 err = btrfs_insert_empty_item(trans, root, path, &key, 10115 datasize); 10116 if (err) { 10117 btrfs_free_path(path); 10118 goto out_unlock; 10119 } 10120 leaf = path->nodes[0]; 10121 ei = btrfs_item_ptr(leaf, path->slots[0], 10122 struct btrfs_file_extent_item); 10123 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 10124 btrfs_set_file_extent_type(leaf, ei, 10125 BTRFS_FILE_EXTENT_INLINE); 10126 btrfs_set_file_extent_encryption(leaf, ei, 0); 10127 btrfs_set_file_extent_compression(leaf, ei, 0); 10128 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 10129 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 10130 10131 ptr = btrfs_file_extent_inline_start(ei); 10132 write_extent_buffer(leaf, symname, ptr, name_len); 10133 btrfs_mark_buffer_dirty(leaf); 10134 btrfs_free_path(path); 10135 10136 inode->i_op = &btrfs_symlink_inode_operations; 10137 inode_nohighmem(inode); 10138 inode_set_bytes(inode, name_len); 10139 btrfs_i_size_write(BTRFS_I(inode), name_len); 10140 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10141 /* 10142 * Last step, add directory indexes for our symlink inode. This is the 10143 * last step to avoid extra cleanup of these indexes if an error happens 10144 * elsewhere above. 
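*
* If btrfs_add_nondir() itself fails, the out_unlock path below drops
* the link count and discards the half-built inode.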
10145 */ 10146 if (!err) 10147 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 10148 BTRFS_I(inode), 0, index); 10149 if (err) 10150 goto out_unlock; 10151 10152 d_instantiate_new(dentry, inode); 10153 10154 out_unlock: 10155 btrfs_end_transaction(trans); 10156 if (err && inode) { 10157 inode_dec_link_count(inode); 10158 discard_new_inode(inode); 10159 } 10160 btrfs_btree_balance_dirty(fs_info); 10161 return err; 10162 } 10163 10164 static struct btrfs_trans_handle *insert_prealloc_file_extent( 10165 struct btrfs_trans_handle *trans_in, 10166 struct btrfs_inode *inode, 10167 struct btrfs_key *ins, 10168 u64 file_offset) 10169 { 10170 struct btrfs_file_extent_item stack_fi; 10171 struct btrfs_replace_extent_info extent_info; 10172 struct btrfs_trans_handle *trans = trans_in; 10173 struct btrfs_path *path; 10174 u64 start = ins->objectid; 10175 u64 len = ins->offset; 10176 int qgroup_released; 10177 int ret; 10178 10179 memset(&stack_fi, 0, sizeof(stack_fi)); 10180 10181 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); 10182 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); 10183 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); 10184 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); 10185 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); 10186 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 10187 /* Encryption and other encoding is reserved and all 0 */ 10188 10189 qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); 10190 if (qgroup_released < 0) 10191 return ERR_PTR(qgroup_released); 10192 10193 if (trans) { 10194 ret = insert_reserved_file_extent(trans, inode, 10195 file_offset, &stack_fi, 10196 true, qgroup_released); 10197 if (ret) 10198 goto free_qgroup; 10199 return trans; 10200 } 10201 10202 extent_info.disk_offset = start; 10203 extent_info.disk_len = len; 10204 extent_info.data_offset = 0; 10205 extent_info.data_len = len; 10206 extent_info.file_offset = file_offset; 10207 extent_info.extent_buf = (char *)&stack_fi; 10208 extent_info.is_new_extent = true; 10209 extent_info.qgroup_reserved = qgroup_released; 10210 extent_info.insertions = 0; 10211 10212 path = btrfs_alloc_path(); 10213 if (!path) { 10214 ret = -ENOMEM; 10215 goto free_qgroup; 10216 } 10217 10218 ret = btrfs_replace_file_extents(inode, path, file_offset, 10219 file_offset + len - 1, &extent_info, 10220 &trans); 10221 btrfs_free_path(path); 10222 if (ret) 10223 goto free_qgroup; 10224 return trans; 10225 10226 free_qgroup: 10227 /* 10228 * We have released qgroup data range at the beginning of the function, 10229 * and normally qgroup_released bytes will be freed when committing 10230 * transaction. 10231 * But if we error out early, we have to free what we have released 10232 * or we leak qgroup data reservation. 
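*
* btrfs_qgroup_free_refroot() below drops exactly the qgroup_released
* bytes from the data reservation of this root.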
10233 */ 10234 btrfs_qgroup_free_refroot(inode->root->fs_info, 10235 inode->root->root_key.objectid, qgroup_released, 10236 BTRFS_QGROUP_RSV_DATA); 10237 return ERR_PTR(ret); 10238 } 10239 10240 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 10241 u64 start, u64 num_bytes, u64 min_size, 10242 loff_t actual_len, u64 *alloc_hint, 10243 struct btrfs_trans_handle *trans) 10244 { 10245 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 10246 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 10247 struct extent_map *em; 10248 struct btrfs_root *root = BTRFS_I(inode)->root; 10249 struct btrfs_key ins; 10250 u64 cur_offset = start; 10251 u64 clear_offset = start; 10252 u64 i_size; 10253 u64 cur_bytes; 10254 u64 last_alloc = (u64)-1; 10255 int ret = 0; 10256 bool own_trans = true; 10257 u64 end = start + num_bytes - 1; 10258 10259 if (trans) 10260 own_trans = false; 10261 while (num_bytes > 0) { 10262 cur_bytes = min_t(u64, num_bytes, SZ_256M); 10263 cur_bytes = max(cur_bytes, min_size); 10264 /* 10265 * If we are severely fragmented we could end up with really 10266 * small allocations, so if the allocator is returning small 10267 * chunks lets make its job easier by only searching for those 10268 * sized chunks. 10269 */ 10270 cur_bytes = min(cur_bytes, last_alloc); 10271 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, 10272 min_size, 0, *alloc_hint, &ins, 1, 0); 10273 if (ret) 10274 break; 10275 10276 /* 10277 * We've reserved this space, and thus converted it from 10278 * ->bytes_may_use to ->bytes_reserved. Any error that happens 10279 * from here on out we will only need to clear our reservation 10280 * for the remaining unreserved area, so advance our 10281 * clear_offset by our extent size. 10282 */ 10283 clear_offset += ins.offset; 10284 10285 last_alloc = ins.offset; 10286 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), 10287 &ins, cur_offset); 10288 /* 10289 * Now that we inserted the prealloc extent we can finally 10290 * decrement the number of reservations in the block group. 10291 * If we did it before, we could race with relocation and have 10292 * relocation miss the reserved extent, making it fail later. 
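*
* If insert_prealloc_file_extent() failed, the reserved extent is handed
* back with btrfs_free_reserved_extent() right below before bailing out.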
10293 */ 10294 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 10295 if (IS_ERR(trans)) { 10296 ret = PTR_ERR(trans); 10297 btrfs_free_reserved_extent(fs_info, ins.objectid, 10298 ins.offset, 0); 10299 break; 10300 } 10301 10302 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 10303 cur_offset + ins.offset -1, 0); 10304 10305 em = alloc_extent_map(); 10306 if (!em) { 10307 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 10308 &BTRFS_I(inode)->runtime_flags); 10309 goto next; 10310 } 10311 10312 em->start = cur_offset; 10313 em->orig_start = cur_offset; 10314 em->len = ins.offset; 10315 em->block_start = ins.objectid; 10316 em->block_len = ins.offset; 10317 em->orig_block_len = ins.offset; 10318 em->ram_bytes = ins.offset; 10319 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 10320 em->generation = trans->transid; 10321 10322 while (1) { 10323 write_lock(&em_tree->lock); 10324 ret = add_extent_mapping(em_tree, em, 1); 10325 write_unlock(&em_tree->lock); 10326 if (ret != -EEXIST) 10327 break; 10328 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 10329 cur_offset + ins.offset - 1, 10330 0); 10331 } 10332 free_extent_map(em); 10333 next: 10334 num_bytes -= ins.offset; 10335 cur_offset += ins.offset; 10336 *alloc_hint = ins.objectid + ins.offset; 10337 10338 inode_inc_iversion(inode); 10339 inode->i_ctime = current_time(inode); 10340 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10341 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10342 (actual_len > inode->i_size) && 10343 (cur_offset > inode->i_size)) { 10344 if (cur_offset > actual_len) 10345 i_size = actual_len; 10346 else 10347 i_size = cur_offset; 10348 i_size_write(inode, i_size); 10349 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 10350 } 10351 10352 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10353 10354 if (ret) { 10355 btrfs_abort_transaction(trans, ret); 10356 if (own_trans) 10357 btrfs_end_transaction(trans); 10358 break; 10359 } 10360 10361 if (own_trans) { 10362 btrfs_end_transaction(trans); 10363 trans = NULL; 10364 } 10365 } 10366 if (clear_offset < end) 10367 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, 10368 end - clear_offset + 1); 10369 return ret; 10370 } 10371 10372 int btrfs_prealloc_file_range(struct inode *inode, int mode, 10373 u64 start, u64 num_bytes, u64 min_size, 10374 loff_t actual_len, u64 *alloc_hint) 10375 { 10376 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10377 min_size, actual_len, alloc_hint, 10378 NULL); 10379 } 10380 10381 int btrfs_prealloc_file_range_trans(struct inode *inode, 10382 struct btrfs_trans_handle *trans, int mode, 10383 u64 start, u64 num_bytes, u64 min_size, 10384 loff_t actual_len, u64 *alloc_hint) 10385 { 10386 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10387 min_size, actual_len, alloc_hint, trans); 10388 } 10389 10390 static int btrfs_set_page_dirty(struct page *page) 10391 { 10392 return __set_page_dirty_nobuffers(page); 10393 } 10394 10395 static int btrfs_permission(struct user_namespace *mnt_userns, 10396 struct inode *inode, int mask) 10397 { 10398 struct btrfs_root *root = BTRFS_I(inode)->root; 10399 umode_t mode = inode->i_mode; 10400 10401 if (mask & MAY_WRITE && 10402 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 10403 if (btrfs_root_readonly(root)) 10404 return -EROFS; 10405 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 10406 return -EACCES; 10407 } 10408 return generic_permission(mnt_userns, inode, mask); 10409 } 10410 10411 static int btrfs_tmpfile(struct user_namespace *mnt_userns, 
struct inode *dir, 10412 struct dentry *dentry, umode_t mode) 10413 { 10414 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 10415 struct btrfs_trans_handle *trans; 10416 struct btrfs_root *root = BTRFS_I(dir)->root; 10417 struct inode *inode = NULL; 10418 u64 objectid; 10419 u64 index; 10420 int ret = 0; 10421 10422 /* 10423 * 5 units required for adding orphan entry 10424 */ 10425 trans = btrfs_start_transaction(root, 5); 10426 if (IS_ERR(trans)) 10427 return PTR_ERR(trans); 10428 10429 ret = btrfs_get_free_objectid(root, &objectid); 10430 if (ret) 10431 goto out; 10432 10433 inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, 10434 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 10435 if (IS_ERR(inode)) { 10436 ret = PTR_ERR(inode); 10437 inode = NULL; 10438 goto out; 10439 } 10440 10441 inode->i_fop = &btrfs_file_operations; 10442 inode->i_op = &btrfs_file_inode_operations; 10443 10444 inode->i_mapping->a_ops = &btrfs_aops; 10445 10446 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 10447 if (ret) 10448 goto out; 10449 10450 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10451 if (ret) 10452 goto out; 10453 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10454 if (ret) 10455 goto out; 10456 10457 /* 10458 * We set number of links to 0 in btrfs_new_inode(), and here we set 10459 * it to 1 because d_tmpfile() will issue a warning if the count is 0, 10460 * through: 10461 * 10462 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 10463 */ 10464 set_nlink(inode, 1); 10465 d_tmpfile(dentry, inode); 10466 unlock_new_inode(inode); 10467 mark_inode_dirty(inode); 10468 out: 10469 btrfs_end_transaction(trans); 10470 if (ret && inode) 10471 discard_new_inode(inode); 10472 btrfs_btree_balance_dirty(fs_info); 10473 return ret; 10474 } 10475 10476 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) 10477 { 10478 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10479 unsigned long index = start >> PAGE_SHIFT; 10480 unsigned long end_index = end >> PAGE_SHIFT; 10481 struct page *page; 10482 u32 len; 10483 10484 ASSERT(end + 1 - start <= U32_MAX); 10485 len = end + 1 - start; 10486 while (index <= end_index) { 10487 page = find_get_page(inode->vfs_inode.i_mapping, index); 10488 ASSERT(page); /* Pages should be in the extent_io_tree */ 10489 10490 btrfs_page_set_writeback(fs_info, page, start, len); 10491 put_page(page); 10492 index++; 10493 } 10494 } 10495 10496 #ifdef CONFIG_SWAP 10497 /* 10498 * Add an entry indicating a block group or device which is pinned by a 10499 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a 10500 * negative errno on failure. 
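*
* Entries live in the fs_info->swapfile_pins rbtree, ordered by (ptr,
* inode); for a block group that is already pinned only bg_extent_count
* is bumped.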
10501 */ 10502 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, 10503 bool is_block_group) 10504 { 10505 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10506 struct btrfs_swapfile_pin *sp, *entry; 10507 struct rb_node **p; 10508 struct rb_node *parent = NULL; 10509 10510 sp = kmalloc(sizeof(*sp), GFP_NOFS); 10511 if (!sp) 10512 return -ENOMEM; 10513 sp->ptr = ptr; 10514 sp->inode = inode; 10515 sp->is_block_group = is_block_group; 10516 sp->bg_extent_count = 1; 10517 10518 spin_lock(&fs_info->swapfile_pins_lock); 10519 p = &fs_info->swapfile_pins.rb_node; 10520 while (*p) { 10521 parent = *p; 10522 entry = rb_entry(parent, struct btrfs_swapfile_pin, node); 10523 if (sp->ptr < entry->ptr || 10524 (sp->ptr == entry->ptr && sp->inode < entry->inode)) { 10525 p = &(*p)->rb_left; 10526 } else if (sp->ptr > entry->ptr || 10527 (sp->ptr == entry->ptr && sp->inode > entry->inode)) { 10528 p = &(*p)->rb_right; 10529 } else { 10530 if (is_block_group) 10531 entry->bg_extent_count++; 10532 spin_unlock(&fs_info->swapfile_pins_lock); 10533 kfree(sp); 10534 return 1; 10535 } 10536 } 10537 rb_link_node(&sp->node, parent, p); 10538 rb_insert_color(&sp->node, &fs_info->swapfile_pins); 10539 spin_unlock(&fs_info->swapfile_pins_lock); 10540 return 0; 10541 } 10542 10543 /* Free all of the entries pinned by this swapfile. */ 10544 static void btrfs_free_swapfile_pins(struct inode *inode) 10545 { 10546 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10547 struct btrfs_swapfile_pin *sp; 10548 struct rb_node *node, *next; 10549 10550 spin_lock(&fs_info->swapfile_pins_lock); 10551 node = rb_first(&fs_info->swapfile_pins); 10552 while (node) { 10553 next = rb_next(node); 10554 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 10555 if (sp->inode == inode) { 10556 rb_erase(&sp->node, &fs_info->swapfile_pins); 10557 if (sp->is_block_group) { 10558 btrfs_dec_block_group_swap_extents(sp->ptr, 10559 sp->bg_extent_count); 10560 btrfs_put_block_group(sp->ptr); 10561 } 10562 kfree(sp); 10563 } 10564 node = next; 10565 } 10566 spin_unlock(&fs_info->swapfile_pins_lock); 10567 } 10568 10569 struct btrfs_swap_info { 10570 u64 start; 10571 u64 block_start; 10572 u64 block_len; 10573 u64 lowest_ppage; 10574 u64 highest_ppage; 10575 unsigned long nr_pages; 10576 int nr_extents; 10577 }; 10578 10579 static int btrfs_add_swap_extent(struct swap_info_struct *sis, 10580 struct btrfs_swap_info *bsi) 10581 { 10582 unsigned long nr_pages; 10583 u64 first_ppage, first_ppage_reported, next_ppage; 10584 int ret; 10585 10586 first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; 10587 next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, 10588 PAGE_SIZE) >> PAGE_SHIFT; 10589 10590 if (first_ppage >= next_ppage) 10591 return 0; 10592 nr_pages = next_ppage - first_ppage; 10593 10594 first_ppage_reported = first_ppage; 10595 if (bsi->start == 0) 10596 first_ppage_reported++; 10597 if (bsi->lowest_ppage > first_ppage_reported) 10598 bsi->lowest_ppage = first_ppage_reported; 10599 if (bsi->highest_ppage < (next_ppage - 1)) 10600 bsi->highest_ppage = next_ppage - 1; 10601 10602 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); 10603 if (ret < 0) 10604 return ret; 10605 bsi->nr_extents += ret; 10606 bsi->nr_pages += nr_pages; 10607 return 0; 10608 } 10609 10610 static void btrfs_swap_deactivate(struct file *file) 10611 { 10612 struct inode *inode = file_inode(file); 10613 10614 btrfs_free_swapfile_pins(inode); 10615 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); 
10616 } 10617 10618 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10619 sector_t *span) 10620 { 10621 struct inode *inode = file_inode(file); 10622 struct btrfs_root *root = BTRFS_I(inode)->root; 10623 struct btrfs_fs_info *fs_info = root->fs_info; 10624 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 10625 struct extent_state *cached_state = NULL; 10626 struct extent_map *em = NULL; 10627 struct btrfs_device *device = NULL; 10628 struct btrfs_swap_info bsi = { 10629 .lowest_ppage = (sector_t)-1ULL, 10630 }; 10631 int ret = 0; 10632 u64 isize; 10633 u64 start; 10634 10635 /* 10636 * If the swap file was just created, make sure delalloc is done. If the 10637 * file changes again after this, the user is doing something stupid and 10638 * we don't really care. 10639 */ 10640 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 10641 if (ret) 10642 return ret; 10643 10644 /* 10645 * The inode is locked, so these flags won't change after we check them. 10646 */ 10647 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 10648 btrfs_warn(fs_info, "swapfile must not be compressed"); 10649 return -EINVAL; 10650 } 10651 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 10652 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 10653 return -EINVAL; 10654 } 10655 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 10656 btrfs_warn(fs_info, "swapfile must not be checksummed"); 10657 return -EINVAL; 10658 } 10659 10660 /* 10661 * Balance or device remove/replace/resize can move stuff around from 10662 * under us. The exclop protection makes sure they aren't running/won't 10663 * run concurrently while we are mapping the swap extents, and 10664 * fs_info->swapfile_pins prevents them from running while the swap 10665 * file is active and moving the extents. Note that this also prevents 10666 * a concurrent device add which isn't actually necessary, but it's not 10667 * really worth the trouble to allow it. 10668 */ 10669 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 10670 btrfs_warn(fs_info, 10671 "cannot activate swapfile while exclusive operation is running"); 10672 return -EBUSY; 10673 } 10674 10675 /* 10676 * Prevent snapshot creation while we are activating the swap file. 10677 * We do not want to race with snapshot creation. If snapshot creation 10678 * already started before we bumped nr_swapfiles from 0 to 1 and 10679 * completes before the first write into the swap file after it is 10680 * activated, then that write would fall back to COW. 10681 */ 10682 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { 10683 btrfs_exclop_finish(fs_info); 10684 btrfs_warn(fs_info, 10685 "cannot activate swapfile because snapshot creation is in progress"); 10686 return -EINVAL; 10687 } 10688 /* 10689 * Snapshots can create extents which require COW even if NODATACOW is 10690 * set. We use this counter to prevent snapshots. We must increment it 10691 * before walking the extents because we don't want a concurrent 10692 * snapshot to run after we've already checked the extents.
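*
* The counter is decremented again in btrfs_swap_deactivate().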
10693 */ 10694 atomic_inc(&root->nr_swapfiles); 10695 10696 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); 10697 10698 lock_extent_bits(io_tree, 0, isize - 1, &cached_state); 10699 start = 0; 10700 while (start < isize) { 10701 u64 logical_block_start, physical_block_start; 10702 struct btrfs_block_group *bg; 10703 u64 len = isize - start; 10704 10705 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 10706 if (IS_ERR(em)) { 10707 ret = PTR_ERR(em); 10708 goto out; 10709 } 10710 10711 if (em->block_start == EXTENT_MAP_HOLE) { 10712 btrfs_warn(fs_info, "swapfile must not have holes"); 10713 ret = -EINVAL; 10714 goto out; 10715 } 10716 if (em->block_start == EXTENT_MAP_INLINE) { 10717 /* 10718 * It's unlikely we'll ever actually find ourselves 10719 * here, as a file small enough to fit inline won't be 10720 * big enough to store more than the swap header, but in 10721 * case something changes in the future, let's catch it 10722 * here rather than later. 10723 */ 10724 btrfs_warn(fs_info, "swapfile must not be inline"); 10725 ret = -EINVAL; 10726 goto out; 10727 } 10728 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 10729 btrfs_warn(fs_info, "swapfile must not be compressed"); 10730 ret = -EINVAL; 10731 goto out; 10732 } 10733 10734 logical_block_start = em->block_start + (start - em->start); 10735 len = min(len, em->len - (start - em->start)); 10736 free_extent_map(em); 10737 em = NULL; 10738 10739 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); 10740 if (ret < 0) { 10741 goto out; 10742 } else if (ret) { 10743 ret = 0; 10744 } else { 10745 btrfs_warn(fs_info, 10746 "swapfile must not be copy-on-write"); 10747 ret = -EINVAL; 10748 goto out; 10749 } 10750 10751 em = btrfs_get_chunk_map(fs_info, logical_block_start, len); 10752 if (IS_ERR(em)) { 10753 ret = PTR_ERR(em); 10754 goto out; 10755 } 10756 10757 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 10758 btrfs_warn(fs_info, 10759 "swapfile must have single data profile"); 10760 ret = -EINVAL; 10761 goto out; 10762 } 10763 10764 if (device == NULL) { 10765 device = em->map_lookup->stripes[0].dev; 10766 ret = btrfs_add_swapfile_pin(inode, device, false); 10767 if (ret == 1) 10768 ret = 0; 10769 else if (ret) 10770 goto out; 10771 } else if (device != em->map_lookup->stripes[0].dev) { 10772 btrfs_warn(fs_info, "swapfile must be on one device"); 10773 ret = -EINVAL; 10774 goto out; 10775 } 10776 10777 physical_block_start = (em->map_lookup->stripes[0].physical + 10778 (logical_block_start - em->start)); 10779 len = min(len, em->len - (logical_block_start - em->start)); 10780 free_extent_map(em); 10781 em = NULL; 10782 10783 bg = btrfs_lookup_block_group(fs_info, logical_block_start); 10784 if (!bg) { 10785 btrfs_warn(fs_info, 10786 "could not find block group containing swapfile"); 10787 ret = -EINVAL; 10788 goto out; 10789 } 10790 10791 if (!btrfs_inc_block_group_swap_extents(bg)) { 10792 btrfs_warn(fs_info, 10793 "block group for swapfile at %llu is read-only%s", 10794 bg->start, 10795 atomic_read(&fs_info->scrubs_running) ? 
10796 " (scrub running)" : ""); 10797 btrfs_put_block_group(bg); 10798 ret = -EINVAL; 10799 goto out; 10800 } 10801 10802 ret = btrfs_add_swapfile_pin(inode, bg, true); 10803 if (ret) { 10804 btrfs_put_block_group(bg); 10805 if (ret == 1) 10806 ret = 0; 10807 else 10808 goto out; 10809 } 10810 10811 if (bsi.block_len && 10812 bsi.block_start + bsi.block_len == physical_block_start) { 10813 bsi.block_len += len; 10814 } else { 10815 if (bsi.block_len) { 10816 ret = btrfs_add_swap_extent(sis, &bsi); 10817 if (ret) 10818 goto out; 10819 } 10820 bsi.start = start; 10821 bsi.block_start = physical_block_start; 10822 bsi.block_len = len; 10823 } 10824 10825 start += len; 10826 } 10827 10828 if (bsi.block_len) 10829 ret = btrfs_add_swap_extent(sis, &bsi); 10830 10831 out: 10832 if (!IS_ERR_OR_NULL(em)) 10833 free_extent_map(em); 10834 10835 unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); 10836 10837 if (ret) 10838 btrfs_swap_deactivate(file); 10839 10840 btrfs_drew_write_unlock(&root->snapshot_lock); 10841 10842 btrfs_exclop_finish(fs_info); 10843 10844 if (ret) 10845 return ret; 10846 10847 if (device) 10848 sis->bdev = device->bdev; 10849 *span = bsi.highest_ppage - bsi.lowest_ppage + 1; 10850 sis->max = bsi.nr_pages; 10851 sis->pages = bsi.nr_pages - 1; 10852 sis->highest_bit = bsi.nr_pages - 1; 10853 return bsi.nr_extents; 10854 } 10855 #else 10856 static void btrfs_swap_deactivate(struct file *file) 10857 { 10858 } 10859 10860 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 10861 sector_t *span) 10862 { 10863 return -EOPNOTSUPP; 10864 } 10865 #endif 10866 10867 /* 10868 * Update the number of bytes used in the VFS' inode. When we replace extents in 10869 * a range (clone, dedupe, fallocate's zero range), we must update the number of 10870 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls 10871 * always get a correct value. 
10872 */ 10873 void btrfs_update_inode_bytes(struct btrfs_inode *inode, 10874 const u64 add_bytes, 10875 const u64 del_bytes) 10876 { 10877 if (add_bytes == del_bytes) 10878 return; 10879 10880 spin_lock(&inode->lock); 10881 if (del_bytes > 0) 10882 inode_sub_bytes(&inode->vfs_inode, del_bytes); 10883 if (add_bytes > 0) 10884 inode_add_bytes(&inode->vfs_inode, add_bytes); 10885 spin_unlock(&inode->lock); 10886 } 10887 10888 static const struct inode_operations btrfs_dir_inode_operations = { 10889 .getattr = btrfs_getattr, 10890 .lookup = btrfs_lookup, 10891 .create = btrfs_create, 10892 .unlink = btrfs_unlink, 10893 .link = btrfs_link, 10894 .mkdir = btrfs_mkdir, 10895 .rmdir = btrfs_rmdir, 10896 .rename = btrfs_rename2, 10897 .symlink = btrfs_symlink, 10898 .setattr = btrfs_setattr, 10899 .mknod = btrfs_mknod, 10900 .listxattr = btrfs_listxattr, 10901 .permission = btrfs_permission, 10902 .get_acl = btrfs_get_acl, 10903 .set_acl = btrfs_set_acl, 10904 .update_time = btrfs_update_time, 10905 .tmpfile = btrfs_tmpfile, 10906 .fileattr_get = btrfs_fileattr_get, 10907 .fileattr_set = btrfs_fileattr_set, 10908 }; 10909 10910 static const struct file_operations btrfs_dir_file_operations = { 10911 .llseek = generic_file_llseek, 10912 .read = generic_read_dir, 10913 .iterate_shared = btrfs_real_readdir, 10914 .open = btrfs_opendir, 10915 .unlocked_ioctl = btrfs_ioctl, 10916 #ifdef CONFIG_COMPAT 10917 .compat_ioctl = btrfs_compat_ioctl, 10918 #endif 10919 .release = btrfs_release_file, 10920 .fsync = btrfs_sync_file, 10921 }; 10922 10923 /* 10924 * btrfs doesn't support the bmap operation because swapfiles 10925 * use bmap to make a mapping of extents in the file. They assume 10926 * these extents won't change over the life of the file and they 10927 * use the bmap result to do IO directly to the drive. 10928 * 10929 * the btrfs bmap call would return logical addresses that aren't 10930 * suitable for IO and they also will change frequently as COW 10931 * operations happen. So, swapfile + btrfs == corruption. 10932 * 10933 * For now we're avoiding this by dropping bmap. 
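*
* Swap files are instead supported through the swap_activate and
* swap_deactivate address space operations below, which map and pin the
* swap extents up front.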
10934 */ 10935 static const struct address_space_operations btrfs_aops = { 10936 .readpage = btrfs_readpage, 10937 .writepage = btrfs_writepage, 10938 .writepages = btrfs_writepages, 10939 .readahead = btrfs_readahead, 10940 .direct_IO = noop_direct_IO, 10941 .invalidatepage = btrfs_invalidatepage, 10942 .releasepage = btrfs_releasepage, 10943 #ifdef CONFIG_MIGRATION 10944 .migratepage = btrfs_migratepage, 10945 #endif 10946 .set_page_dirty = btrfs_set_page_dirty, 10947 .error_remove_page = generic_error_remove_page, 10948 .swap_activate = btrfs_swap_activate, 10949 .swap_deactivate = btrfs_swap_deactivate, 10950 }; 10951 10952 static const struct inode_operations btrfs_file_inode_operations = { 10953 .getattr = btrfs_getattr, 10954 .setattr = btrfs_setattr, 10955 .listxattr = btrfs_listxattr, 10956 .permission = btrfs_permission, 10957 .fiemap = btrfs_fiemap, 10958 .get_acl = btrfs_get_acl, 10959 .set_acl = btrfs_set_acl, 10960 .update_time = btrfs_update_time, 10961 .fileattr_get = btrfs_fileattr_get, 10962 .fileattr_set = btrfs_fileattr_set, 10963 }; 10964 static const struct inode_operations btrfs_special_inode_operations = { 10965 .getattr = btrfs_getattr, 10966 .setattr = btrfs_setattr, 10967 .permission = btrfs_permission, 10968 .listxattr = btrfs_listxattr, 10969 .get_acl = btrfs_get_acl, 10970 .set_acl = btrfs_set_acl, 10971 .update_time = btrfs_update_time, 10972 }; 10973 static const struct inode_operations btrfs_symlink_inode_operations = { 10974 .get_link = page_get_link, 10975 .getattr = btrfs_getattr, 10976 .setattr = btrfs_setattr, 10977 .permission = btrfs_permission, 10978 .listxattr = btrfs_listxattr, 10979 .update_time = btrfs_update_time, 10980 }; 10981 10982 const struct dentry_operations btrfs_dentry_operations = { 10983 .d_delete = btrfs_dentry_delete, 10984 }; 10985