// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
};

struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct btrfs_inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 *
 * ilock_flags can have the following bit set:
 *
 *  BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 *  BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 *		      return -EAGAIN
 *  BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&BTRFS_I(inode)->i_mmap_lock);
	return 0;
}

/*
 * btrfs_inode_unlock - unlock inode i_rwsem
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&BTRFS_I(inode)->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
}

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

	struct page *page;

	while (index <= end_index) {
		/*
		 * For locked page, we will call end_extent_writepage() on it
		 * in run_delalloc_range() for the error handling. That
		 * end_extent_writepage() function will call
		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
		 * run the ordered extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
			index++;
			continue;
		}
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		index++;
		if (!page)
			continue;

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then __endio_write_update_ordered() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
					       offset, bytes);
		put_page(page);
	}

	/* The locked page covers the full range, nothing needs to be done */
	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
		return;
	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
		offset = page_offset(locked_page) + PAGE_SIZE;
	}

	return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree. The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path,
				struct btrfs_inode *inode, bool extent_inserted,
				size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages,
				bool update_i_size)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	u64 i_size;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = 0;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->vfs_inode.i_mapping, 0);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		write_extent_buffer(leaf, kaddr, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity
	 * sake.
	 */
	ret = btrfs_inode_set_file_extent_range(inode, 0,
					ALIGN(size, root->fs_info->sectorsize));
	if (ret)
		goto fail;

	/*
	 * We're an inline extent, so nobody can extend the file past i_size
	 * without locking a page we already have locked.
	 *
	 * We must do any i_size and inode updates before we unlock the pages.
	 * Otherwise we could end up racing with unlink.
	 */
	i_size = i_size_read(&inode->vfs_inode);
	if (update_i_size && size > i_size) {
		i_size_write(&inode->vfs_inode, size);
		i_size = size;
	}
	inode->disk_i_size = i_size;

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file. This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
					  size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages,
					  bool update_i_size)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 data_len = (compressed_size ?: size);
	int ret;
	struct btrfs_path *path;

	/*
	 * We can create an inline extent if it ends at or beyond the current
	 * i_size, is no larger than a sector (decompressed), and the (possibly
	 * compressed) data fits in a leaf and the configured maximum inline
	 * size.
	 */
	if (size < i_size_read(&inode->vfs_inode) ||
	    size > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    data_len > fs_info->max_inline)
		return 1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	drop_args.path = path;
	drop_args.start = 0;
	drop_args.end = fs_info->sectorsize;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
				   size, compressed_size, compress_type,
				   compressed_pages, update_i_size);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, inode);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_set_inode_full_sync(inode);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	struct async_cow *async_cow;
};

struct async_cow {
	atomic_t num_chunks;
	struct async_chunk chunks[];
};

static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(inode));
		return 0;
	}
	/*
	 * Special check for subpage.
	 *
	 * We lock the full page then run each delalloc range in the page, thus
	 * for the following case, we will hit some subpage specific corner case:
	 *
	 * 0		32K		64K
	 * |	|///////|	|///////|
	 *		\- A		\- B
	 *
	 * In above case, both range A and range B will try to unlock the full
	 * page [0, 64K), causing the one finished later will have page
	 * unlocked already, triggering various page lock requirement BUG_ON()s.
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * be enabled if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the tailing partial page will be locked until the full compression
	 * finishes, delaying the write of other range.
	 *
	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
	 * first to prevent any submitted async extent to unlock the full page.
	 * By this, we can ensure for subpage case that only the last async_cow
	 * will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(end + 1, PAGE_SIZE))
			return 0;
	}

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode, small_write);
}

/*
 * we create compressed extents in two phases. The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus. The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes. This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int compressed_extents = 0;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it. This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time. So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * Skip compression for a small file range(<=blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/*
	 * For subpage case, we require full page alignment for the sector
	 * aligned range.
	 * Thus we must also check against @actual_end, not just @end.
	 */
	if (blocksize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
			goto cleanup_and_bail_uncompressed;
	}

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress. This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(BTRFS_I(inode), start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range. Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = offset_in_page(total_compressed);
			struct page *page = pages[nr_pages - 1];

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset)
				memzero_page(page, offset, PAGE_SIZE - offset);
			will_compress = 1;
		}
	}
cont:
	/*
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for subpage case.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		/* lets try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
						    0, BTRFS_COMPRESS_NONE,
						    NULL, false);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
						    total_compressed,
						    compress_type, pages,
						    false);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
						     NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_START_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);

			/*
			 * Ensure we only free the compressed pages if we have
			 * them allocated, as we can still reach here with
			 * inode_need_compress() == false.
			 */
			if (pages) {
				for (i = 0; i < nr_pages; i++) {
					WARN_ON(pages[i]->mapping);
					put_page(pages[i]);
				}
				kfree(pages);
			}
			return 0;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = round_up(total_in, fs_info->sectorsize);
		if (total_compressed + blocksize <= total_in) {
			compressed_extents++;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_chunk, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return compressed_extents;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far. redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page) <= end)) {
		__set_page_dirty_nobuffers(async_chunk->locked_page);
		/* unlocked later on in the async handlers */
	}

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	compressed_extents++;

	return compressed_extents;
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

static int submit_uncompressed_range(struct btrfs_inode *inode,
				     struct async_extent *async_extent,
				     struct page *locked_page)
{
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
	unsigned long nr_written = 0;
	int page_started = 0;
	int ret;

	/*
	 * Call cow_file_range() to run the delalloc range directly, since we
	 * won't go to NOCOW or async path again.
	 *
	 * Also we call cow_file_range() with @unlock_page == 0, so that we
	 * can directly submit them without interruption.
	 */
	ret = cow_file_range(inode, locked_page, start, end, &page_started,
			     &nr_written, 0);
	/* Inline extent inserted, page gets unlocked and everything is done */
	if (page_started) {
		ret = 0;
		goto out;
	}
	if (ret < 0) {
		if (locked_page)
			unlock_page(locked_page);
		goto out;
	}

	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
	/* All pages will be unlocked, including @locked_page */
out:
	kfree(async_extent);
	return ret;
}

static int submit_one_async_extent(struct btrfs_inode *inode,
				   struct async_chunk *async_chunk,
				   struct async_extent *async_extent,
				   u64 *alloc_hint)
{
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct page *locked_page = NULL;
	struct extent_map *em;
	int ret = 0;
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;

	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
	}
	lock_extent(io_tree, start, end);

	/* We have fallen back to uncompressed write */
	if (!async_extent->pages)
		return submit_uncompressed_range(inode, async_extent, locked_page);

	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		free_async_extent_pages(async_extent);
		/*
		 * Here we used to try again by going back to non-compressed
		 * path for ENOSPC. But we can't reserve space even for
		 * compressed size, how could it work for uncompressed size
		 * which requires larger size? So here we directly go error
		 * path.
		 */
		goto out_free;
	}

	/* Here we're doing allocation and writeback of the compressed pages */
	em = create_io_em(inode, start,
			  async_extent->ram_size,	/* len */
			  start,			/* orig_start */
			  ins.objectid,			/* block_start */
			  ins.offset,			/* block_len */
			  ins.offset,			/* orig_block_len */
			  async_extent->ram_size,	/* ram_bytes */
			  async_extent->compress_type,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);

	ret = btrfs_add_ordered_extent(inode, start,		/* file_offset */
				       async_extent->ram_size,	/* num_bytes */
				       async_extent->ram_size,	/* ram_bytes */
				       ins.objectid,		/* disk_bytenr */
				       ins.offset,		/* disk_num_bytes */
				       0,			/* offset */
				       1 << BTRFS_ORDERED_COMPRESSED,
				       async_extent->compress_type);
	if (ret) {
		btrfs_drop_extent_cache(inode, start, end, 0);
		goto out_free_reserve;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
			PAGE_UNLOCK | PAGE_START_WRITEBACK);
	if (btrfs_submit_compressed_write(inode, start,	/* file_offset */
			    async_extent->ram_size,	/* num_bytes */
			    ins.objectid,		/* disk_bytenr */
			    ins.offset,			/* compressed_len */
			    async_extent->pages,	/* compressed_pages */
			    async_extent->nr_pages,
			    async_chunk->write_flags,
			    async_chunk->blkcg_css, true)) {
		const u64 start = async_extent->start;
		const u64 end = start + async_extent->ram_size - 1;

		btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);

		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
					     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
		free_async_extent_pages(async_extent);
	}
	*alloc_hint = ins.objectid + ins.offset;
	kfree(async_extent);
	return ret;

out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	return ret;
}

/*
 * Phase two of compressed writeback. This is the ordered portion of the code,
 * which only gets called in the order the work was queued. We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	int ret = 0;

	while (!list_empty(&async_chunk->extents)) {
		u64 extent_start;
		u64 ram_size;

		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);
		extent_start = async_extent->start;
		ram_size = async_extent->ram_size;

		ret = submit_one_async_extent(inode, async_chunk, async_extent,
					      &alloc_hint);
		btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
			    inode->root->root_key.objectid,
			    btrfs_ino(inode), extent_start, ram_size, ret);
	}
}

static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint. If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code. The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it. It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 min_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of page, that means data writeback
	 * is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
				       end + 1);

		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, actual_end, 0,
					    BTRFS_COMPRESS_NONE, NULL, false);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't use page_started to determine if it's an
			 * inline extent or a compressed extent.
			 */
			unlock_page(locked_page);
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
	if (btrfs_is_data_reloc_root(root))
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   min_alloc_size, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
					       ins.objectid, cur_alloc_size, 0,
					       1 << BTRFS_ORDERED_REGULAR,
					       BTRFS_COMPRESS_NONE);
		if (ret)
			goto out_drop_extent_cache;

		if (btrfs_is_data_reloc_root(root)) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
		 *
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_ORDERED;

		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * work queue callback to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	int compressed_extents;

	async_chunk = container_of(work, struct async_chunk, work);

	compressed_extents = compress_file_range(async_chunk);
	if (compressed_extents == 0) {
		btrfs_add_delayed_iput(async_chunk->inode);
		async_chunk->inode = NULL;
	}
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						       work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	unsigned long nr_pages;

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * ->inode could be NULL if async_chunk_start has failed to compress,
	 * in which case we don't have anything to submit, yet we need to
	 * always adjust ->async_delalloc_pages as it's paired with the init
	 * happening in cow_file_range_async
	 */
	if (async_chunk->inode)
		submit_compressed_extents(async_chunk);

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	struct async_cow *async_cow;

	async_chunk = container_of(work, struct async_chunk, work);
	if (async_chunk->inode)
		btrfs_add_delayed_iput(async_chunk->inode);
	if (async_chunk->blkcg_css)
		css_put(async_chunk->blkcg_css);

	async_cow = async_chunk->async_cow;
	if (atomic_dec_and_test(&async_cow->num_chunks))
		kvfree(async_cow);
}

static int cow_file_range_async(struct btrfs_inode *inode,
				struct writeback_control *wbc,
				struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 cur_end;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	bool should_compress;
	unsigned nofs_flag;
	const unsigned int write_flags = wbc_to_write_flags(wbc);

	unlock_extent(&inode->io_tree, start, end);

	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
		num_chunks = 1;
		should_compress = false;
	} else {
		should_compress = true;
	}

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	if (!ctx) {
		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
			EXTENT_DO_ACCOUNTING;
		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
			PAGE_END_WRITEBACK | PAGE_SET_ERROR;

		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
		return -ENOMEM;
	}

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		if (should_compress)
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
		ihold(&inode->vfs_inode);
		async_chunk[i].async_cow = ctx;
		async_chunk[i].inode = &inode->vfs_inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and it's
		 * the original page we were actually given. As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to locked_page
		 *
		 * This way we don't need racey decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async. We want all of them to
			 * be accounted against wbc once. Let's do it here
			 * before the paths diverge. wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy. Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

		btrfs_init_work(&async_chunk[i].work, async_cow_start,
				async_cow_submit, async_cow_free);

		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
				       struct page *locked_page, u64 start,
				       u64 end, int *page_started,
				       unsigned long *nr_written)
{
	int ret;

	ret = cow_file_range(inode, locked_page, start, end, page_started,
			     nr_written, 0);
	if (ret)
		return ret;

	if (*page_started)
		return 0;

	__set_page_dirty_nobuffers(locked_page);
	account_page_redirty(locked_page);
	extent_write_locked_range(&inode->vfs_inode, start, end);
	*page_started = 1;

	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
	struct btrfs_ordered_sum *sums;
	int ret;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	if (ret < 0)
		return ret;
	return 1;
}

static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
			   const u64 start, const u64 end,
			   int *page_started, unsigned long *nr_written)
{
	const bool is_space_ino = btrfs_is_free_space_inode(inode);
	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
	const u64 range_bytes = end + 1 - start;
	struct extent_io_tree *io_tree = &inode->io_tree;
	u64 range_start = start;
	u64 count;

	/*
	 * If EXTENT_NORESERVE is set it means that when the buffered write was
	 * made we had not enough available data space and therefore we did not
	 * reserve data space for it, since we thought we could do NOCOW for the
	 * respective file range (either there is prealloc extent or the inode
	 * has the NOCOW bit set).
	 *
	 * However when we need to fallback to COW mode (because for example the
	 * block group for the corresponding extent was turned to RO mode by a
	 * scrub or relocation) we need to do the following:
	 *
	 * 1) We increment the bytes_may_use counter of the data space info.
	 *    If COW succeeds, it allocates a new data extent and after doing
	 *    that it decrements the space info's bytes_may_use counter and
	 *    increments its bytes_reserved counter by the same amount (we do
	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
	 *    bytes_may_use counter to compensate (when space is reserved at
	 *    buffered write time, the bytes_may_use counter is incremented);
	 *
	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
	 *    that if the COW path fails for any reason, it decrements (through
	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
	 *    data space info, which we incremented in the step above.
	 *
	 * If we need to fallback to cow and the inode corresponds to a free
	 * space cache inode or an inode of the data relocation tree, we must
	 * also increment bytes_may_use of the data space_info for the same
	 * reason. Space caches and relocated data extents always get a prealloc
	 * extent for them, however scrub or balance may have set the block
	 * group that contains that extent to RO mode and therefore force COW
	 * when starting writeback.
	 */
	count = count_range_bits(io_tree, &range_start, end, range_bytes,
				 EXTENT_NORESERVE, 0);
	if (count > 0 || is_space_ino || is_reloc_ino) {
		u64 bytes = count;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct btrfs_space_info *sinfo = fs_info->data_sinfo;

		if (is_space_ino || is_reloc_ino)
			bytes = range_bytes;

		spin_lock(&sinfo->lock);
		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
		spin_unlock(&sinfo->lock);

		if (count > 0)
			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
					 0, 0, NULL);
	}

	return cow_file_range(inode, locked_page, start, end, page_started,
			      nr_written, 1);
}

/*
 * Called for the nocow writeback path. This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as
 * required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end,
				       int *page_started,
				       unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
	const bool freespace_inode = btrfs_is_free_space_inode(inode);
	u64 ino = btrfs_ino(inode);
	bool nocow = false;
	u64 disk_bytenr = 0;
	const bool force = inode->flags & BTRFS_INODE_NODATACOW;

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	while (1) {
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		u64 extent_end;
		u64 extent_offset;
		u64 num_bytes = 0;
		u64 disk_num_bytes;
		u64 ram_bytes;
		int extent_type;

		nocow = false;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				if (cow_start != (u64)-1)
					cur_offset = cow_start;
				goto error;
			}
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			/*
			 * If the extent we got ends before our current offset,
			 * skip to the next extent.
			 */
			if (extent_end <= cur_offset) {
				path->slots[0]++;
				goto next_slot;
			}
			/* Skip holes */
			if (disk_bytenr == 0)
				goto out_check;
			/* Skip compressed/encrypted/encoded extents */
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			/*
			 * If extent is created before the last volume's snapshot
			 * this implies the extent is shared, hence we can't do
			 * nocow. This is the same check as in
			 * btrfs_cross_ref_exist but without calling
			 * btrfs_search_slot.
			 */
			if (!freespace_inode &&
			    btrfs_file_extent_generation(leaf, fi) <=
			    btrfs_root_last_snapshot(&root->root_item))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;

			/*
			 * The following checks can be expensive, as they need to
			 * take other locks and do btree or rbtree searches, so
			 * release the path to avoid blocking other tasks for too
			 * long.
			 */
			btrfs_release_path(path);

			ret = btrfs_cross_ref_exist(root, ino,
						    found_key.offset -
						    extent_offset, disk_bytenr, false);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}

				WARN_ON_ONCE(freespace_inode);
				goto out_check;
			}
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * If there are pending snapshots for this root, we
			 * fall into common COW way
			 */
			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
				goto out_check;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			ret = csum_exist_in_range(fs_info, disk_bytenr,
						  num_bytes);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
1827 */ 1828 if (ret < 0) { 1829 if (cow_start != (u64)-1) 1830 cur_offset = cow_start; 1831 goto error; 1832 } 1833 WARN_ON_ONCE(freespace_inode); 1834 goto out_check; 1835 } 1836 /* If the extent's block group is RO, we must COW */ 1837 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) 1838 goto out_check; 1839 nocow = true; 1840 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1841 extent_end = found_key.offset + ram_bytes; 1842 extent_end = ALIGN(extent_end, fs_info->sectorsize); 1843 /* Skip extents outside of our requested range */ 1844 if (extent_end <= start) { 1845 path->slots[0]++; 1846 goto next_slot; 1847 } 1848 } else { 1849 /* If this triggers then we have a memory corruption */ 1850 BUG(); 1851 } 1852 out_check: 1853 /* 1854 * If nocow is false then record the beginning of the range 1855 * that needs to be COWed 1856 */ 1857 if (!nocow) { 1858 if (cow_start == (u64)-1) 1859 cow_start = cur_offset; 1860 cur_offset = extent_end; 1861 if (cur_offset > end) 1862 break; 1863 if (!path->nodes[0]) 1864 continue; 1865 path->slots[0]++; 1866 goto next_slot; 1867 } 1868 1869 /* 1870 * COW range from cow_start to found_key.offset - 1. As the key 1871 * will contain the beginning of the first extent that can be 1872 * NOCOW, following one which needs to be COW'ed 1873 */ 1874 if (cow_start != (u64)-1) { 1875 ret = fallback_to_cow(inode, locked_page, 1876 cow_start, found_key.offset - 1, 1877 page_started, nr_written); 1878 if (ret) 1879 goto error; 1880 cow_start = (u64)-1; 1881 } 1882 1883 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1884 u64 orig_start = found_key.offset - extent_offset; 1885 struct extent_map *em; 1886 1887 em = create_io_em(inode, cur_offset, num_bytes, 1888 orig_start, 1889 disk_bytenr, /* block_start */ 1890 num_bytes, /* block_len */ 1891 disk_num_bytes, /* orig_block_len */ 1892 ram_bytes, BTRFS_COMPRESS_NONE, 1893 BTRFS_ORDERED_PREALLOC); 1894 if (IS_ERR(em)) { 1895 ret = PTR_ERR(em); 1896 goto error; 1897 } 1898 free_extent_map(em); 1899 ret = btrfs_add_ordered_extent(inode, 1900 cur_offset, num_bytes, num_bytes, 1901 disk_bytenr, num_bytes, 0, 1902 1 << BTRFS_ORDERED_PREALLOC, 1903 BTRFS_COMPRESS_NONE); 1904 if (ret) { 1905 btrfs_drop_extent_cache(inode, cur_offset, 1906 cur_offset + num_bytes - 1, 1907 0); 1908 goto error; 1909 } 1910 } else { 1911 ret = btrfs_add_ordered_extent(inode, cur_offset, 1912 num_bytes, num_bytes, 1913 disk_bytenr, num_bytes, 1914 0, 1915 1 << BTRFS_ORDERED_NOCOW, 1916 BTRFS_COMPRESS_NONE); 1917 if (ret) 1918 goto error; 1919 } 1920 1921 if (nocow) 1922 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1923 nocow = false; 1924 1925 if (btrfs_is_data_reloc_root(root)) 1926 /* 1927 * Error handled later, as we must prevent 1928 * extent_clear_unlock_delalloc() in error handler 1929 * from freeing metadata of created ordered extent. 1930 */ 1931 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1932 num_bytes); 1933 1934 extent_clear_unlock_delalloc(inode, cur_offset, 1935 cur_offset + num_bytes - 1, 1936 locked_page, EXTENT_LOCKED | 1937 EXTENT_DELALLOC | 1938 EXTENT_CLEAR_DATA_RESV, 1939 PAGE_UNLOCK | PAGE_SET_ORDERED); 1940 1941 cur_offset = extent_end; 1942 1943 /* 1944 * btrfs_reloc_clone_csums() error, now we're OK to call error 1945 * handler, as metadata for created ordered extent will only 1946 * be freed by btrfs_finish_ordered_io(). 
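 * This is also why the check of ret is deferred until after the
 * extent_clear_unlock_delalloc() call above.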
1947 */ 1948 if (ret) 1949 goto error; 1950 if (cur_offset > end) 1951 break; 1952 } 1953 btrfs_release_path(path); 1954 1955 if (cur_offset <= end && cow_start == (u64)-1) 1956 cow_start = cur_offset; 1957 1958 if (cow_start != (u64)-1) { 1959 cur_offset = end; 1960 ret = fallback_to_cow(inode, locked_page, cow_start, end, 1961 page_started, nr_written); 1962 if (ret) 1963 goto error; 1964 } 1965 1966 error: 1967 if (nocow) 1968 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1969 1970 if (ret && cur_offset < end) 1971 extent_clear_unlock_delalloc(inode, cur_offset, end, 1972 locked_page, EXTENT_LOCKED | 1973 EXTENT_DELALLOC | EXTENT_DEFRAG | 1974 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1975 PAGE_START_WRITEBACK | 1976 PAGE_END_WRITEBACK); 1977 btrfs_free_path(path); 1978 return ret; 1979 } 1980 1981 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) 1982 { 1983 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { 1984 if (inode->defrag_bytes && 1985 test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 1986 0, NULL)) 1987 return false; 1988 return true; 1989 } 1990 return false; 1991 } 1992 1993 /* 1994 * Function to process delayed allocation (create CoW) for ranges which are 1995 * being touched for the first time. 1996 */ 1997 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, 1998 u64 start, u64 end, int *page_started, unsigned long *nr_written, 1999 struct writeback_control *wbc) 2000 { 2001 int ret; 2002 const bool zoned = btrfs_is_zoned(inode->root->fs_info); 2003 2004 /* 2005 * The range must cover part of the @locked_page, or the returned 2006 * @page_started can confuse the caller. 2007 */ 2008 ASSERT(!(end <= page_offset(locked_page) || 2009 start >= page_offset(locked_page) + PAGE_SIZE)); 2010 2011 if (should_nocow(inode, start, end)) { 2012 /* 2013 * Normally on a zoned device we're only doing COW writes, but 2014 * in case of relocation on a zoned filesystem we have taken 2015 * precaution, that we're only writing sequentially. It's safe 2016 * to use run_delalloc_nocow() here, like for regular 2017 * preallocated inodes. 2018 */ 2019 ASSERT(!zoned || 2020 (zoned && btrfs_is_data_reloc_root(inode->root))); 2021 ret = run_delalloc_nocow(inode, locked_page, start, end, 2022 page_started, nr_written); 2023 } else if (!inode_can_compress(inode) || 2024 !inode_need_compress(inode, start, end)) { 2025 if (zoned) 2026 ret = run_delalloc_zoned(inode, locked_page, start, end, 2027 page_started, nr_written); 2028 else 2029 ret = cow_file_range(inode, locked_page, start, end, 2030 page_started, nr_written, 1); 2031 } else { 2032 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); 2033 ret = cow_file_range_async(inode, wbc, locked_page, start, end, 2034 page_started, nr_written); 2035 } 2036 ASSERT(ret <= 0); 2037 if (ret) 2038 btrfs_cleanup_ordered_extents(inode, locked_page, start, 2039 end - start + 1); 2040 return ret; 2041 } 2042 2043 void btrfs_split_delalloc_extent(struct inode *inode, 2044 struct extent_state *orig, u64 split) 2045 { 2046 u64 size; 2047 2048 /* not delalloc, ignore it */ 2049 if (!(orig->state & EXTENT_DELALLOC)) 2050 return; 2051 2052 size = orig->end - orig->start + 1; 2053 if (size > BTRFS_MAX_EXTENT_SIZE) { 2054 u32 num_extents; 2055 u64 new_size; 2056 2057 /* 2058 * See the explanation in btrfs_merge_delalloc_extent, the same 2059 * applies here, just in reverse. 
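 *
 * For example, assuming count_max_extents() rounds up to a
 * BTRFS_MAX_EXTENT_SIZE granularity: splitting a range of
 * 2 * BTRFS_MAX_EXTENT_SIZE exactly in the middle needs 1 extent
 * per half, 2 in total, the same as before the split, so nothing
 * extra is reserved.  Splitting it into BTRFS_MAX_EXTENT_SIZE + 4K
 * and BTRFS_MAX_EXTENT_SIZE - 4K pieces needs 2 + 1 = 3 extents,
 * so we fall through and add one outstanding extent below.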
2060 */ 2061 new_size = orig->end - split + 1; 2062 num_extents = count_max_extents(new_size); 2063 new_size = split - orig->start; 2064 num_extents += count_max_extents(new_size); 2065 if (count_max_extents(size) >= num_extents) 2066 return; 2067 } 2068 2069 spin_lock(&BTRFS_I(inode)->lock); 2070 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 2071 spin_unlock(&BTRFS_I(inode)->lock); 2072 } 2073 2074 /* 2075 * Handle merged delayed allocation extents so we can keep track of new extents 2076 * that are just merged onto old extents, such as when we are doing sequential 2077 * writes, so we can properly account for the metadata space we'll need. 2078 */ 2079 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, 2080 struct extent_state *other) 2081 { 2082 u64 new_size, old_size; 2083 u32 num_extents; 2084 2085 /* not delalloc, ignore it */ 2086 if (!(other->state & EXTENT_DELALLOC)) 2087 return; 2088 2089 if (new->start > other->start) 2090 new_size = new->end - other->start + 1; 2091 else 2092 new_size = other->end - new->start + 1; 2093 2094 /* we're not bigger than the max, unreserve the space and go */ 2095 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 2096 spin_lock(&BTRFS_I(inode)->lock); 2097 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 2098 spin_unlock(&BTRFS_I(inode)->lock); 2099 return; 2100 } 2101 2102 /* 2103 * We have to add up either side to figure out how many extents were 2104 * accounted for before we merged into one big extent. If the number of 2105 * extents we accounted for is <= the amount we need for the new range 2106 * then we can return, otherwise drop. Think of it like this 2107 * 2108 * [ 4k][MAX_SIZE] 2109 * 2110 * So we've grown the extent by a MAX_SIZE extent, this would mean we 2111 * need 2 outstanding extents, on one side we have 1 and the other side 2112 * we have 1 so they are == and we can return. But in this case 2113 * 2114 * [MAX_SIZE+4k][MAX_SIZE+4k] 2115 * 2116 * Each range on their own accounts for 2 extents, but merged together 2117 * they are only 3 extents worth of accounting, so we need to drop in 2118 * this case. 
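 *
 * In numbers: the two ranges were accounted as 2 + 2 = 4 extents,
 * while the merged range only needs
 * count_max_extents(2 * MAX_SIZE + 8k) == 3, so one outstanding
 * extent is dropped below.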
2119 */ 2120 old_size = other->end - other->start + 1; 2121 num_extents = count_max_extents(old_size); 2122 old_size = new->end - new->start + 1; 2123 num_extents += count_max_extents(old_size); 2124 if (count_max_extents(new_size) >= num_extents) 2125 return; 2126 2127 spin_lock(&BTRFS_I(inode)->lock); 2128 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 2129 spin_unlock(&BTRFS_I(inode)->lock); 2130 } 2131 2132 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 2133 struct inode *inode) 2134 { 2135 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2136 2137 spin_lock(&root->delalloc_lock); 2138 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 2139 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 2140 &root->delalloc_inodes); 2141 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2142 &BTRFS_I(inode)->runtime_flags); 2143 root->nr_delalloc_inodes++; 2144 if (root->nr_delalloc_inodes == 1) { 2145 spin_lock(&fs_info->delalloc_root_lock); 2146 BUG_ON(!list_empty(&root->delalloc_root)); 2147 list_add_tail(&root->delalloc_root, 2148 &fs_info->delalloc_roots); 2149 spin_unlock(&fs_info->delalloc_root_lock); 2150 } 2151 } 2152 spin_unlock(&root->delalloc_lock); 2153 } 2154 2155 2156 void __btrfs_del_delalloc_inode(struct btrfs_root *root, 2157 struct btrfs_inode *inode) 2158 { 2159 struct btrfs_fs_info *fs_info = root->fs_info; 2160 2161 if (!list_empty(&inode->delalloc_inodes)) { 2162 list_del_init(&inode->delalloc_inodes); 2163 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2164 &inode->runtime_flags); 2165 root->nr_delalloc_inodes--; 2166 if (!root->nr_delalloc_inodes) { 2167 ASSERT(list_empty(&root->delalloc_inodes)); 2168 spin_lock(&fs_info->delalloc_root_lock); 2169 BUG_ON(list_empty(&root->delalloc_root)); 2170 list_del_init(&root->delalloc_root); 2171 spin_unlock(&fs_info->delalloc_root_lock); 2172 } 2173 } 2174 } 2175 2176 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 2177 struct btrfs_inode *inode) 2178 { 2179 spin_lock(&root->delalloc_lock); 2180 __btrfs_del_delalloc_inode(root, inode); 2181 spin_unlock(&root->delalloc_lock); 2182 } 2183 2184 /* 2185 * Properly track delayed allocation bytes in the inode and to maintain the 2186 * list of inodes that have pending delalloc work to be done. 
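 * The *bits argument holds the bits being set on the range; only
 * the EXTENT_DELALLOC and EXTENT_DELALLOC_NEW transitions are of
 * interest below.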
2187 */ 2188 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, 2189 unsigned *bits) 2190 { 2191 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2192 2193 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 2194 WARN_ON(1); 2195 /* 2196 * set_bit and clear bit hooks normally require _irqsave/restore 2197 * but in this case, we are only testing for the DELALLOC 2198 * bit, which is only set or cleared with irqs on 2199 */ 2200 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2201 struct btrfs_root *root = BTRFS_I(inode)->root; 2202 u64 len = state->end + 1 - state->start; 2203 u32 num_extents = count_max_extents(len); 2204 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 2205 2206 spin_lock(&BTRFS_I(inode)->lock); 2207 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); 2208 spin_unlock(&BTRFS_I(inode)->lock); 2209 2210 /* For sanity tests */ 2211 if (btrfs_is_testing(fs_info)) 2212 return; 2213 2214 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 2215 fs_info->delalloc_batch); 2216 spin_lock(&BTRFS_I(inode)->lock); 2217 BTRFS_I(inode)->delalloc_bytes += len; 2218 if (*bits & EXTENT_DEFRAG) 2219 BTRFS_I(inode)->defrag_bytes += len; 2220 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2221 &BTRFS_I(inode)->runtime_flags)) 2222 btrfs_add_delalloc_inodes(root, inode); 2223 spin_unlock(&BTRFS_I(inode)->lock); 2224 } 2225 2226 if (!(state->state & EXTENT_DELALLOC_NEW) && 2227 (*bits & EXTENT_DELALLOC_NEW)) { 2228 spin_lock(&BTRFS_I(inode)->lock); 2229 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - 2230 state->start; 2231 spin_unlock(&BTRFS_I(inode)->lock); 2232 } 2233 } 2234 2235 /* 2236 * Once a range is no longer delalloc this function ensures that proper 2237 * accounting happens. 2238 */ 2239 void btrfs_clear_delalloc_extent(struct inode *vfs_inode, 2240 struct extent_state *state, unsigned *bits) 2241 { 2242 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 2243 struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); 2244 u64 len = state->end + 1 - state->start; 2245 u32 num_extents = count_max_extents(len); 2246 2247 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { 2248 spin_lock(&inode->lock); 2249 inode->defrag_bytes -= len; 2250 spin_unlock(&inode->lock); 2251 } 2252 2253 /* 2254 * set_bit and clear bit hooks normally require _irqsave/restore 2255 * but in this case, we are only testing for the DELALLOC 2256 * bit, which is only set or cleared with irqs on 2257 */ 2258 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 2259 struct btrfs_root *root = inode->root; 2260 bool do_list = !btrfs_is_free_space_inode(inode); 2261 2262 spin_lock(&inode->lock); 2263 btrfs_mod_outstanding_extents(inode, -num_extents); 2264 spin_unlock(&inode->lock); 2265 2266 /* 2267 * We don't reserve metadata space for space cache inodes so we 2268 * don't need to call delalloc_release_metadata if there is an 2269 * error. 2270 */ 2271 if (*bits & EXTENT_CLEAR_META_RESV && 2272 root != fs_info->tree_root) 2273 btrfs_delalloc_release_metadata(inode, len, false); 2274 2275 /* For sanity tests. 
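 * The self-tests run against a dummy fs_info, so skip the global
 * delalloc counter and per-root delalloc list updates below.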
*/ 2276 if (btrfs_is_testing(fs_info)) 2277 return; 2278 2279 if (!btrfs_is_data_reloc_root(root) && 2280 do_list && !(state->state & EXTENT_NORESERVE) && 2281 (*bits & EXTENT_CLEAR_DATA_RESV)) 2282 btrfs_free_reserved_data_space_noquota(fs_info, len); 2283 2284 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 2285 fs_info->delalloc_batch); 2286 spin_lock(&inode->lock); 2287 inode->delalloc_bytes -= len; 2288 if (do_list && inode->delalloc_bytes == 0 && 2289 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 2290 &inode->runtime_flags)) 2291 btrfs_del_delalloc_inode(root, inode); 2292 spin_unlock(&inode->lock); 2293 } 2294 2295 if ((state->state & EXTENT_DELALLOC_NEW) && 2296 (*bits & EXTENT_DELALLOC_NEW)) { 2297 spin_lock(&inode->lock); 2298 ASSERT(inode->new_delalloc_bytes >= len); 2299 inode->new_delalloc_bytes -= len; 2300 if (*bits & EXTENT_ADD_INODE_BYTES) 2301 inode_add_bytes(&inode->vfs_inode, len); 2302 spin_unlock(&inode->lock); 2303 } 2304 } 2305 2306 /* 2307 * in order to insert checksums into the metadata in large chunks, 2308 * we wait until bio submission time. All the pages in the bio are 2309 * checksummed and sums are attached onto the ordered extent record. 2310 * 2311 * At IO completion time the cums attached on the ordered extent record 2312 * are inserted into the btree 2313 */ 2314 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, 2315 u64 dio_file_offset) 2316 { 2317 return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); 2318 } 2319 2320 /* 2321 * Split an extent_map at [start, start + len] 2322 * 2323 * This function is intended to be used only for extract_ordered_extent(). 2324 */ 2325 static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, 2326 u64 pre, u64 post) 2327 { 2328 struct extent_map_tree *em_tree = &inode->extent_tree; 2329 struct extent_map *em; 2330 struct extent_map *split_pre = NULL; 2331 struct extent_map *split_mid = NULL; 2332 struct extent_map *split_post = NULL; 2333 int ret = 0; 2334 unsigned long flags; 2335 2336 /* Sanity check */ 2337 if (pre == 0 && post == 0) 2338 return 0; 2339 2340 split_pre = alloc_extent_map(); 2341 if (pre) 2342 split_mid = alloc_extent_map(); 2343 if (post) 2344 split_post = alloc_extent_map(); 2345 if (!split_pre || (pre && !split_mid) || (post && !split_post)) { 2346 ret = -ENOMEM; 2347 goto out; 2348 } 2349 2350 ASSERT(pre + post < len); 2351 2352 lock_extent(&inode->io_tree, start, start + len - 1); 2353 write_lock(&em_tree->lock); 2354 em = lookup_extent_mapping(em_tree, start, len); 2355 if (!em) { 2356 ret = -EIO; 2357 goto out_unlock; 2358 } 2359 2360 ASSERT(em->len == len); 2361 ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); 2362 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); 2363 ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 2364 ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); 2365 ASSERT(!list_empty(&em->list)); 2366 2367 flags = em->flags; 2368 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 2369 2370 /* First, replace the em with a new extent_map starting from * em->start */ 2371 split_pre->start = em->start; 2372 split_pre->len = (pre ? 
pre : em->len - post); 2373 split_pre->orig_start = split_pre->start; 2374 split_pre->block_start = em->block_start; 2375 split_pre->block_len = split_pre->len; 2376 split_pre->orig_block_len = split_pre->block_len; 2377 split_pre->ram_bytes = split_pre->len; 2378 split_pre->flags = flags; 2379 split_pre->compress_type = em->compress_type; 2380 split_pre->generation = em->generation; 2381 2382 replace_extent_mapping(em_tree, em, split_pre, 1); 2383 2384 /* 2385 * Now we only have an extent_map at: 2386 * [em->start, em->start + pre] if pre != 0 2387 * [em->start, em->start + em->len - post] if pre == 0 2388 */ 2389 2390 if (pre) { 2391 /* Insert the middle extent_map */ 2392 split_mid->start = em->start + pre; 2393 split_mid->len = em->len - pre - post; 2394 split_mid->orig_start = split_mid->start; 2395 split_mid->block_start = em->block_start + pre; 2396 split_mid->block_len = split_mid->len; 2397 split_mid->orig_block_len = split_mid->block_len; 2398 split_mid->ram_bytes = split_mid->len; 2399 split_mid->flags = flags; 2400 split_mid->compress_type = em->compress_type; 2401 split_mid->generation = em->generation; 2402 add_extent_mapping(em_tree, split_mid, 1); 2403 } 2404 2405 if (post) { 2406 split_post->start = em->start + em->len - post; 2407 split_post->len = post; 2408 split_post->orig_start = split_post->start; 2409 split_post->block_start = em->block_start + em->len - post; 2410 split_post->block_len = split_post->len; 2411 split_post->orig_block_len = split_post->block_len; 2412 split_post->ram_bytes = split_post->len; 2413 split_post->flags = flags; 2414 split_post->compress_type = em->compress_type; 2415 split_post->generation = em->generation; 2416 add_extent_mapping(em_tree, split_post, 1); 2417 } 2418 2419 /* Once for us */ 2420 free_extent_map(em); 2421 /* Once for the tree */ 2422 free_extent_map(em); 2423 2424 out_unlock: 2425 write_unlock(&em_tree->lock); 2426 unlock_extent(&inode->io_tree, start, start + len - 1); 2427 out: 2428 free_extent_map(split_pre); 2429 free_extent_map(split_mid); 2430 free_extent_map(split_post); 2431 2432 return ret; 2433 } 2434 2435 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, 2436 struct bio *bio, loff_t file_offset) 2437 { 2438 struct btrfs_ordered_extent *ordered; 2439 u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; 2440 u64 file_len; 2441 u64 len = bio->bi_iter.bi_size; 2442 u64 end = start + len; 2443 u64 ordered_end; 2444 u64 pre, post; 2445 int ret = 0; 2446 2447 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 2448 if (WARN_ON_ONCE(!ordered)) 2449 return BLK_STS_IOERR; 2450 2451 /* No need to split */ 2452 if (ordered->disk_num_bytes == len) 2453 goto out; 2454 2455 /* We cannot split once end_bio'd ordered extent */ 2456 if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { 2457 ret = -EINVAL; 2458 goto out; 2459 } 2460 2461 /* We cannot split a compressed ordered extent */ 2462 if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { 2463 ret = -EINVAL; 2464 goto out; 2465 } 2466 2467 ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; 2468 /* bio must be in one ordered extent */ 2469 if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { 2470 ret = -EINVAL; 2471 goto out; 2472 } 2473 2474 /* Checksum list should be empty */ 2475 if (WARN_ON_ONCE(!list_empty(&ordered->list))) { 2476 ret = -EINVAL; 2477 goto out; 2478 } 2479 2480 file_len = ordered->num_bytes; 2481 pre = start - ordered->disk_bytenr; 2482 post = ordered_end - end; 2483 2484 
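/*
 * "pre" and "post" are the byte counts of the ordered extent that
 * fall before and after this bio.  Split the ordered extent, and
 * then the pinned extent map, so that the bio ends up covering
 * exactly one ordered extent.
 */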
ret = btrfs_split_ordered_extent(ordered, pre, post); 2485 if (ret) 2486 goto out; 2487 ret = split_zoned_em(inode, file_offset, file_len, pre, post); 2488 2489 out: 2490 btrfs_put_ordered_extent(ordered); 2491 2492 return errno_to_blk_status(ret); 2493 } 2494 2495 /* 2496 * extent_io.c submission hook. This does the right thing for csum calculation 2497 * on write, or reading the csums from the tree before a read. 2498 * 2499 * Rules about async/sync submit, 2500 * a) read: sync submit 2501 * 2502 * b) write without checksum: sync submit 2503 * 2504 * c) write with checksum: 2505 * c-1) if bio is issued by fsync: sync submit 2506 * (sync_writers != 0) 2507 * 2508 * c-2) if root is reloc root: sync submit 2509 * (only in case of buffered IO) 2510 * 2511 * c-3) otherwise: async submit 2512 */ 2513 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 2514 int mirror_num, unsigned long bio_flags) 2515 2516 { 2517 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2518 struct btrfs_root *root = BTRFS_I(inode)->root; 2519 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 2520 blk_status_t ret = 0; 2521 int skip_sum; 2522 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 2523 2524 skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || 2525 test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); 2526 2527 if (btrfs_is_free_space_inode(BTRFS_I(inode))) 2528 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 2529 2530 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 2531 struct page *page = bio_first_bvec_all(bio)->bv_page; 2532 loff_t file_offset = page_offset(page); 2533 2534 ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); 2535 if (ret) 2536 goto out; 2537 } 2538 2539 if (btrfs_op(bio) != BTRFS_MAP_WRITE) { 2540 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); 2541 if (ret) 2542 goto out; 2543 2544 if (bio_flags & EXTENT_BIO_COMPRESSED) { 2545 /* 2546 * btrfs_submit_compressed_read will handle completing 2547 * the bio if there were any errors, so just return 2548 * here. 2549 */ 2550 ret = btrfs_submit_compressed_read(inode, bio, 2551 mirror_num, 2552 bio_flags); 2553 goto out_no_endio; 2554 } else { 2555 /* 2556 * Lookup bio sums does extra checks around whether we 2557 * need to csum or not, which is why we ignore skip_sum 2558 * here. 2559 */ 2560 ret = btrfs_lookup_bio_sums(inode, bio, NULL); 2561 if (ret) 2562 goto out; 2563 } 2564 goto mapit; 2565 } else if (async && !skip_sum) { 2566 /* csum items have already been cloned */ 2567 if (btrfs_is_data_reloc_root(root)) 2568 goto mapit; 2569 /* we're doing a write, do the async checksumming */ 2570 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, 2571 0, btrfs_submit_bio_start); 2572 goto out; 2573 } else if (!skip_sum) { 2574 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); 2575 if (ret) 2576 goto out; 2577 } 2578 2579 mapit: 2580 ret = btrfs_map_bio(fs_info, bio, mirror_num); 2581 2582 out: 2583 if (ret) { 2584 bio->bi_status = ret; 2585 bio_endio(bio); 2586 } 2587 out_no_endio: 2588 return ret; 2589 } 2590 2591 /* 2592 * given a list of ordered sums record them in the inode. This happens 2593 * at IO completion time based on sums calculated at bio submission time. 
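 * The csum root owning the first sum's bytenr is looked up once
 * and then reused for every entry in the list.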
2594 */ 2595 static int add_pending_csums(struct btrfs_trans_handle *trans, 2596 struct list_head *list) 2597 { 2598 struct btrfs_ordered_sum *sum; 2599 struct btrfs_root *csum_root = NULL; 2600 int ret; 2601 2602 list_for_each_entry(sum, list, list) { 2603 trans->adding_csums = true; 2604 if (!csum_root) 2605 csum_root = btrfs_csum_root(trans->fs_info, 2606 sum->bytenr); 2607 ret = btrfs_csum_file_blocks(trans, csum_root, sum); 2608 trans->adding_csums = false; 2609 if (ret) 2610 return ret; 2611 } 2612 return 0; 2613 } 2614 2615 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, 2616 const u64 start, 2617 const u64 len, 2618 struct extent_state **cached_state) 2619 { 2620 u64 search_start = start; 2621 const u64 end = start + len - 1; 2622 2623 while (search_start < end) { 2624 const u64 search_len = end - search_start + 1; 2625 struct extent_map *em; 2626 u64 em_len; 2627 int ret = 0; 2628 2629 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); 2630 if (IS_ERR(em)) 2631 return PTR_ERR(em); 2632 2633 if (em->block_start != EXTENT_MAP_HOLE) 2634 goto next; 2635 2636 em_len = em->len; 2637 if (em->start < search_start) 2638 em_len -= search_start - em->start; 2639 if (em_len > search_len) 2640 em_len = search_len; 2641 2642 ret = set_extent_bit(&inode->io_tree, search_start, 2643 search_start + em_len - 1, 2644 EXTENT_DELALLOC_NEW, 0, NULL, cached_state, 2645 GFP_NOFS, NULL); 2646 next: 2647 search_start = extent_map_end(em); 2648 free_extent_map(em); 2649 if (ret) 2650 return ret; 2651 } 2652 return 0; 2653 } 2654 2655 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2656 unsigned int extra_bits, 2657 struct extent_state **cached_state) 2658 { 2659 WARN_ON(PAGE_ALIGNED(end)); 2660 2661 if (start >= i_size_read(&inode->vfs_inode) && 2662 !(inode->flags & BTRFS_INODE_PREALLOC)) { 2663 /* 2664 * There can't be any extents following eof in this case so just 2665 * set the delalloc new bit for the range directly. 2666 */ 2667 extra_bits |= EXTENT_DELALLOC_NEW; 2668 } else { 2669 int ret; 2670 2671 ret = btrfs_find_new_delalloc_bytes(inode, start, 2672 end + 1 - start, 2673 cached_state); 2674 if (ret) 2675 return ret; 2676 } 2677 2678 return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, 2679 cached_state); 2680 } 2681 2682 /* see btrfs_writepage_start_hook for details on why this is required */ 2683 struct btrfs_writepage_fixup { 2684 struct page *page; 2685 struct inode *inode; 2686 struct btrfs_work work; 2687 }; 2688 2689 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2690 { 2691 struct btrfs_writepage_fixup *fixup; 2692 struct btrfs_ordered_extent *ordered; 2693 struct extent_state *cached_state = NULL; 2694 struct extent_changeset *data_reserved = NULL; 2695 struct page *page; 2696 struct btrfs_inode *inode; 2697 u64 page_start; 2698 u64 page_end; 2699 int ret = 0; 2700 bool free_delalloc_space = true; 2701 2702 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2703 page = fixup->page; 2704 inode = BTRFS_I(fixup->inode); 2705 page_start = page_offset(page); 2706 page_end = page_offset(page) + PAGE_SIZE - 1; 2707 2708 /* 2709 * This is similar to page_mkwrite, we need to reserve the space before 2710 * we take the page lock. 2711 */ 2712 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2713 PAGE_SIZE); 2714 again: 2715 lock_page(page); 2716 2717 /* 2718 * Before we queued this fixup, we took a reference on the page. 
2719 * page->mapping may go NULL, but it shouldn't be moved to a different 2720 * address space. 2721 */ 2722 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2723 /* 2724 * Unfortunately this is a little tricky, either 2725 * 2726 * 1) We got here and our page had already been dealt with and 2727 * we reserved our space, thus ret == 0, so we need to just 2728 * drop our space reservation and bail. This can happen the 2729 * first time we come into the fixup worker, or could happen 2730 * while waiting for the ordered extent. 2731 * 2) Our page was already dealt with, but we happened to get an 2732 * ENOSPC above from the btrfs_delalloc_reserve_space. In 2733 * this case we obviously don't have anything to release, but 2734 * because the page was already dealt with we don't want to 2735 * mark the page with an error, so make sure we're resetting 2736 * ret to 0. This is why we have this check _before_ the ret 2737 * check, because we do not want to have a surprise ENOSPC 2738 * when the page was already properly dealt with. 2739 */ 2740 if (!ret) { 2741 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2742 btrfs_delalloc_release_space(inode, data_reserved, 2743 page_start, PAGE_SIZE, 2744 true); 2745 } 2746 ret = 0; 2747 goto out_page; 2748 } 2749 2750 /* 2751 * We can't mess with the page state unless it is locked, so now that 2752 * it is locked bail if we failed to make our space reservation. 2753 */ 2754 if (ret) 2755 goto out_page; 2756 2757 lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); 2758 2759 /* already ordered? We're done */ 2760 if (PageOrdered(page)) 2761 goto out_reserved; 2762 2763 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); 2764 if (ordered) { 2765 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2766 &cached_state); 2767 unlock_page(page); 2768 btrfs_start_ordered_extent(ordered, 1); 2769 btrfs_put_ordered_extent(ordered); 2770 goto again; 2771 } 2772 2773 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2774 &cached_state); 2775 if (ret) 2776 goto out_reserved; 2777 2778 /* 2779 * Everything went as planned, we're now the owner of a dirty page with 2780 * delayed allocation bits set and space reserved for our COW 2781 * destination. 2782 * 2783 * The page was dirty when we started, nothing should have cleaned it. 2784 */ 2785 BUG_ON(!PageDirty(page)); 2786 free_delalloc_space = false; 2787 out_reserved: 2788 btrfs_delalloc_release_extents(inode, PAGE_SIZE); 2789 if (free_delalloc_space) 2790 btrfs_delalloc_release_space(inode, data_reserved, page_start, 2791 PAGE_SIZE, true); 2792 unlock_extent_cached(&inode->io_tree, page_start, page_end, 2793 &cached_state); 2794 out_page: 2795 if (ret) { 2796 /* 2797 * We hit ENOSPC or other errors. Update the mapping and page 2798 * to reflect the errors and clean the page. 2799 */ 2800 mapping_set_error(page->mapping, ret); 2801 end_extent_writepage(page, ret, page_start, page_end); 2802 clear_page_dirty_for_io(page); 2803 SetPageError(page); 2804 } 2805 btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); 2806 unlock_page(page); 2807 put_page(page); 2808 kfree(fixup); 2809 extent_changeset_free(data_reserved); 2810 /* 2811 * As a precaution, do a delayed iput in case it would be the last iput 2812 * that could need flushing space. Recursing back to fixup worker would 2813 * deadlock. 
2814 */ 2815 btrfs_add_delayed_iput(&inode->vfs_inode); 2816 } 2817 2818 /* 2819 * There are a few paths in the higher layers of the kernel that directly 2820 * set the page dirty bit without asking the filesystem if it is a 2821 * good idea. This causes problems because we want to make sure COW 2822 * properly happens and the data=ordered rules are followed. 2823 * 2824 * In our case any range that doesn't have the ORDERED bit set 2825 * hasn't been properly setup for IO. We kick off an async process 2826 * to fix it up. The async helper will wait for ordered extents, set 2827 * the delalloc bit and make it safe to write the page. 2828 */ 2829 int btrfs_writepage_cow_fixup(struct page *page) 2830 { 2831 struct inode *inode = page->mapping->host; 2832 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2833 struct btrfs_writepage_fixup *fixup; 2834 2835 /* This page has ordered extent covering it already */ 2836 if (PageOrdered(page)) 2837 return 0; 2838 2839 /* 2840 * PageChecked is set below when we create a fixup worker for this page, 2841 * don't try to create another one if we're already PageChecked() 2842 * 2843 * The extent_io writepage code will redirty the page if we send back 2844 * EAGAIN. 2845 */ 2846 if (PageChecked(page)) 2847 return -EAGAIN; 2848 2849 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2850 if (!fixup) 2851 return -EAGAIN; 2852 2853 /* 2854 * We are already holding a reference to this inode from 2855 * write_cache_pages. We need to hold it because the space reservation 2856 * takes place outside of the page lock, and we can't trust 2857 * page->mapping outside of the page lock. 2858 */ 2859 ihold(inode); 2860 btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); 2861 get_page(page); 2862 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 2863 fixup->page = page; 2864 fixup->inode = inode; 2865 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2866 2867 return -EAGAIN; 2868 } 2869 2870 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2871 struct btrfs_inode *inode, u64 file_pos, 2872 struct btrfs_file_extent_item *stack_fi, 2873 const bool update_inode_bytes, 2874 u64 qgroup_reserved) 2875 { 2876 struct btrfs_root *root = inode->root; 2877 const u64 sectorsize = root->fs_info->sectorsize; 2878 struct btrfs_path *path; 2879 struct extent_buffer *leaf; 2880 struct btrfs_key ins; 2881 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2882 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2883 u64 offset = btrfs_stack_file_extent_offset(stack_fi); 2884 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2885 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2886 struct btrfs_drop_extents_args drop_args = { 0 }; 2887 int ret; 2888 2889 path = btrfs_alloc_path(); 2890 if (!path) 2891 return -ENOMEM; 2892 2893 /* 2894 * we may be replacing one extent in the tree with another. 2895 * The new extent is pinned in the extent map, and we don't want 2896 * to drop it from the cache until it is completely in the btree. 2897 * 2898 * So, tell btrfs_drop_extents to leave this extent in the cache. 2899 * the caller is expected to unpin it and allow it to be merged 2900 * with the others. 
2901 */ 2902 drop_args.path = path; 2903 drop_args.start = file_pos; 2904 drop_args.end = file_pos + num_bytes; 2905 drop_args.replace_extent = true; 2906 drop_args.extent_item_size = sizeof(*stack_fi); 2907 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 2908 if (ret) 2909 goto out; 2910 2911 if (!drop_args.extent_inserted) { 2912 ins.objectid = btrfs_ino(inode); 2913 ins.offset = file_pos; 2914 ins.type = BTRFS_EXTENT_DATA_KEY; 2915 2916 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2917 sizeof(*stack_fi)); 2918 if (ret) 2919 goto out; 2920 } 2921 leaf = path->nodes[0]; 2922 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); 2923 write_extent_buffer(leaf, stack_fi, 2924 btrfs_item_ptr_offset(leaf, path->slots[0]), 2925 sizeof(struct btrfs_file_extent_item)); 2926 2927 btrfs_mark_buffer_dirty(leaf); 2928 btrfs_release_path(path); 2929 2930 /* 2931 * If we dropped an inline extent here, we know the range where it is 2932 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2933 * number of bytes only for that range containing the inline extent. 2934 * The remaining of the range will be processed when clearning the 2935 * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 2936 */ 2937 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { 2938 u64 inline_size = round_down(drop_args.bytes_found, sectorsize); 2939 2940 inline_size = drop_args.bytes_found - inline_size; 2941 btrfs_update_inode_bytes(inode, sectorsize, inline_size); 2942 drop_args.bytes_found -= inline_size; 2943 num_bytes -= sectorsize; 2944 } 2945 2946 if (update_inode_bytes) 2947 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); 2948 2949 ins.objectid = disk_bytenr; 2950 ins.offset = disk_num_bytes; 2951 ins.type = BTRFS_EXTENT_ITEM_KEY; 2952 2953 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 2954 if (ret) 2955 goto out; 2956 2957 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 2958 file_pos - offset, 2959 qgroup_reserved, &ins); 2960 out: 2961 btrfs_free_path(path); 2962 2963 return ret; 2964 } 2965 2966 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2967 u64 start, u64 len) 2968 { 2969 struct btrfs_block_group *cache; 2970 2971 cache = btrfs_lookup_block_group(fs_info, start); 2972 ASSERT(cache); 2973 2974 spin_lock(&cache->lock); 2975 cache->delalloc_bytes -= len; 2976 spin_unlock(&cache->lock); 2977 2978 btrfs_put_block_group(cache); 2979 } 2980 2981 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, 2982 struct btrfs_ordered_extent *oe) 2983 { 2984 struct btrfs_file_extent_item stack_fi; 2985 bool update_inode_bytes; 2986 u64 num_bytes = oe->num_bytes; 2987 u64 ram_bytes = oe->ram_bytes; 2988 2989 memset(&stack_fi, 0, sizeof(stack_fi)); 2990 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 2991 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 2992 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 2993 oe->disk_num_bytes); 2994 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); 2995 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 2996 num_bytes = ram_bytes = oe->truncated_len; 2997 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); 2998 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); 2999 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 3000 /* Encryption and other encoding is reserved and all 0 */ 3001 3002 /* 3003 * For delalloc, 
when completing an ordered extent we update the inode's 3004 * bytes when clearing the range in the inode's io tree, so pass false 3005 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), 3006 * except if the ordered extent was truncated. 3007 */ 3008 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 3009 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || 3010 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 3011 3012 return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), 3013 oe->file_offset, &stack_fi, 3014 update_inode_bytes, oe->qgroup_rsv); 3015 } 3016 3017 /* 3018 * As ordered data IO finishes, this gets called so we can finish 3019 * an ordered extent if the range of bytes in the file it covers are 3020 * fully written. 3021 */ 3022 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 3023 { 3024 struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); 3025 struct btrfs_root *root = inode->root; 3026 struct btrfs_fs_info *fs_info = root->fs_info; 3027 struct btrfs_trans_handle *trans = NULL; 3028 struct extent_io_tree *io_tree = &inode->io_tree; 3029 struct extent_state *cached_state = NULL; 3030 u64 start, end; 3031 int compress_type = 0; 3032 int ret = 0; 3033 u64 logical_len = ordered_extent->num_bytes; 3034 bool freespace_inode; 3035 bool truncated = false; 3036 bool clear_reserved_extent = true; 3037 unsigned int clear_bits = EXTENT_DEFRAG; 3038 3039 start = ordered_extent->file_offset; 3040 end = start + ordered_extent->num_bytes - 1; 3041 3042 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3043 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 3044 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && 3045 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) 3046 clear_bits |= EXTENT_DELALLOC_NEW; 3047 3048 freespace_inode = btrfs_is_free_space_inode(inode); 3049 3050 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 3051 ret = -EIO; 3052 goto out; 3053 } 3054 3055 /* A valid bdev implies a write on a sequential zone */ 3056 if (ordered_extent->bdev) { 3057 btrfs_rewrite_logical_zoned(ordered_extent); 3058 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, 3059 ordered_extent->disk_num_bytes); 3060 } 3061 3062 btrfs_free_io_failure_record(inode, start, end); 3063 3064 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 3065 truncated = true; 3066 logical_len = ordered_extent->truncated_len; 3067 /* Truncated the entire extent, don't bother adding */ 3068 if (!logical_len) 3069 goto out; 3070 } 3071 3072 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3073 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 3074 3075 btrfs_inode_safe_disk_i_size_write(inode, 0); 3076 if (freespace_inode) 3077 trans = btrfs_join_transaction_spacecache(root); 3078 else 3079 trans = btrfs_join_transaction(root); 3080 if (IS_ERR(trans)) { 3081 ret = PTR_ERR(trans); 3082 trans = NULL; 3083 goto out; 3084 } 3085 trans->block_rsv = &inode->block_rsv; 3086 ret = btrfs_update_inode_fallback(trans, root, inode); 3087 if (ret) /* -ENOMEM or corruption */ 3088 btrfs_abort_transaction(trans, ret); 3089 goto out; 3090 } 3091 3092 clear_bits |= EXTENT_LOCKED; 3093 lock_extent_bits(io_tree, start, end, &cached_state); 3094 3095 if (freespace_inode) 3096 trans = btrfs_join_transaction_spacecache(root); 3097 else 3098 trans = btrfs_join_transaction(root); 3099 if (IS_ERR(trans)) { 3100 ret = PTR_ERR(trans); 3101 trans = NULL; 3102 goto out; 3103 } 
3104 3105 trans->block_rsv = &inode->block_rsv; 3106 3107 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 3108 compress_type = ordered_extent->compress_type; 3109 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3110 BUG_ON(compress_type); 3111 ret = btrfs_mark_extent_written(trans, inode, 3112 ordered_extent->file_offset, 3113 ordered_extent->file_offset + 3114 logical_len); 3115 } else { 3116 BUG_ON(root == fs_info->tree_root); 3117 ret = insert_ordered_extent_file_extent(trans, ordered_extent); 3118 if (!ret) { 3119 clear_reserved_extent = false; 3120 btrfs_release_delalloc_bytes(fs_info, 3121 ordered_extent->disk_bytenr, 3122 ordered_extent->disk_num_bytes); 3123 } 3124 } 3125 unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, 3126 ordered_extent->num_bytes, trans->transid); 3127 if (ret < 0) { 3128 btrfs_abort_transaction(trans, ret); 3129 goto out; 3130 } 3131 3132 ret = add_pending_csums(trans, &ordered_extent->list); 3133 if (ret) { 3134 btrfs_abort_transaction(trans, ret); 3135 goto out; 3136 } 3137 3138 /* 3139 * If this is a new delalloc range, clear its new delalloc flag to 3140 * update the inode's number of bytes. This needs to be done first 3141 * before updating the inode item. 3142 */ 3143 if ((clear_bits & EXTENT_DELALLOC_NEW) && 3144 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) 3145 clear_extent_bit(&inode->io_tree, start, end, 3146 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 3147 0, 0, &cached_state); 3148 3149 btrfs_inode_safe_disk_i_size_write(inode, 0); 3150 ret = btrfs_update_inode_fallback(trans, root, inode); 3151 if (ret) { /* -ENOMEM or corruption */ 3152 btrfs_abort_transaction(trans, ret); 3153 goto out; 3154 } 3155 ret = 0; 3156 out: 3157 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 3158 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, 3159 &cached_state); 3160 3161 if (trans) 3162 btrfs_end_transaction(trans); 3163 3164 if (ret || truncated) { 3165 u64 unwritten_start = start; 3166 3167 /* 3168 * If we failed to finish this ordered extent for any reason we 3169 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered 3170 * extent, and mark the inode with the error if it wasn't 3171 * already set. Any error during writeback would have already 3172 * set the mapping error, so we need to set it if we're the ones 3173 * marking this ordered extent as failed. 3174 */ 3175 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, 3176 &ordered_extent->flags)) 3177 mapping_set_error(ordered_extent->inode->i_mapping, -EIO); 3178 3179 if (truncated) 3180 unwritten_start += logical_len; 3181 clear_extent_uptodate(io_tree, unwritten_start, end, NULL); 3182 3183 /* Drop the cache for the part of the extent we didn't write. */ 3184 btrfs_drop_extent_cache(inode, unwritten_start, end, 0); 3185 3186 /* 3187 * If the ordered extent had an IOERR or something else went 3188 * wrong we need to return the space for this ordered extent 3189 * back to the allocator. We only free the extent in the 3190 * truncated case if we didn't write out the extent at all. 3191 * 3192 * If we made it past insert_reserved_file_extent before we 3193 * errored out then we don't need to do this as the accounting 3194 * has already been done. 
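 * clear_reserved_extent tracks exactly that: it is set to false
 * once insert_ordered_extent_file_extent() has succeeded above.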
3195 */ 3196 if ((ret || !logical_len) && 3197 clear_reserved_extent && 3198 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3199 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3200 /* 3201 * Discard the range before returning it back to the 3202 * free space pool 3203 */ 3204 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) 3205 btrfs_discard_extent(fs_info, 3206 ordered_extent->disk_bytenr, 3207 ordered_extent->disk_num_bytes, 3208 NULL); 3209 btrfs_free_reserved_extent(fs_info, 3210 ordered_extent->disk_bytenr, 3211 ordered_extent->disk_num_bytes, 1); 3212 } 3213 } 3214 3215 /* 3216 * This needs to be done to make sure anybody waiting knows we are done 3217 * updating everything for this ordered extent. 3218 */ 3219 btrfs_remove_ordered_extent(inode, ordered_extent); 3220 3221 /* once for us */ 3222 btrfs_put_ordered_extent(ordered_extent); 3223 /* once for the tree */ 3224 btrfs_put_ordered_extent(ordered_extent); 3225 3226 return ret; 3227 } 3228 3229 static void finish_ordered_fn(struct btrfs_work *work) 3230 { 3231 struct btrfs_ordered_extent *ordered_extent; 3232 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 3233 btrfs_finish_ordered_io(ordered_extent); 3234 } 3235 3236 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, 3237 struct page *page, u64 start, 3238 u64 end, bool uptodate) 3239 { 3240 trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); 3241 3242 btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, 3243 finish_ordered_fn, uptodate); 3244 } 3245 3246 /* 3247 * check_data_csum - verify checksum of one sector of uncompressed data 3248 * @inode: inode 3249 * @io_bio: btrfs_io_bio which contains the csum 3250 * @bio_offset: offset to the beginning of the bio (in bytes) 3251 * @page: page where is the data to be verified 3252 * @pgoff: offset inside the page 3253 * @start: logical offset in the file 3254 * 3255 * The length of such check is always one sector size. 3256 */ 3257 static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, 3258 u32 bio_offset, struct page *page, u32 pgoff, 3259 u64 start) 3260 { 3261 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3262 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3263 char *kaddr; 3264 u32 len = fs_info->sectorsize; 3265 const u32 csum_size = fs_info->csum_size; 3266 unsigned int offset_sectors; 3267 u8 *csum_expected; 3268 u8 csum[BTRFS_CSUM_SIZE]; 3269 3270 ASSERT(pgoff + len <= PAGE_SIZE); 3271 3272 offset_sectors = bio_offset >> fs_info->sectorsize_bits; 3273 csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; 3274 3275 kaddr = kmap_atomic(page); 3276 shash->tfm = fs_info->csum_shash; 3277 3278 crypto_shash_digest(shash, kaddr + pgoff, len, csum); 3279 3280 if (memcmp(csum, csum_expected, csum_size)) 3281 goto zeroit; 3282 3283 kunmap_atomic(kaddr); 3284 return 0; 3285 zeroit: 3286 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, 3287 bbio->mirror_num); 3288 if (bbio->device) 3289 btrfs_dev_stat_inc_and_print(bbio->device, 3290 BTRFS_DEV_STAT_CORRUPTION_ERRS); 3291 memset(kaddr + pgoff, 1, len); 3292 flush_dcache_page(page); 3293 kunmap_atomic(kaddr); 3294 return -EIO; 3295 } 3296 3297 /* 3298 * When reads are done, we need to check csums to verify the data is correct. 3299 * if there's a match, we allow the bio to finish. If not, the code in 3300 * extent_io.c will try to find good copies for us. 
3301 * 3302 * @bio_offset: offset to the beginning of the bio (in bytes) 3303 * @start: file offset of the range start 3304 * @end: file offset of the range end (inclusive) 3305 * 3306 * Return a bitmap where bit set means a csum mismatch, and bit not set means 3307 * csum match. 3308 */ 3309 unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, 3310 u32 bio_offset, struct page *page, 3311 u64 start, u64 end) 3312 { 3313 struct inode *inode = page->mapping->host; 3314 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3315 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3316 struct btrfs_root *root = BTRFS_I(inode)->root; 3317 const u32 sectorsize = root->fs_info->sectorsize; 3318 u32 pg_off; 3319 unsigned int result = 0; 3320 3321 if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { 3322 btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); 3323 return 0; 3324 } 3325 3326 /* 3327 * This only happens for NODATASUM or compressed read. 3328 * Normally this should be covered by above check for compressed read 3329 * or the next check for NODATASUM. Just do a quicker exit here. 3330 */ 3331 if (bbio->csum == NULL) 3332 return 0; 3333 3334 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3335 return 0; 3336 3337 if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) 3338 return 0; 3339 3340 ASSERT(page_offset(page) <= start && 3341 end <= page_offset(page) + PAGE_SIZE - 1); 3342 for (pg_off = offset_in_page(start); 3343 pg_off < offset_in_page(end); 3344 pg_off += sectorsize, bio_offset += sectorsize) { 3345 u64 file_offset = pg_off + page_offset(page); 3346 int ret; 3347 3348 if (btrfs_is_data_reloc_root(root) && 3349 test_range_bit(io_tree, file_offset, 3350 file_offset + sectorsize - 1, 3351 EXTENT_NODATASUM, 1, NULL)) { 3352 /* Skip the range without csum for data reloc inode */ 3353 clear_extent_bits(io_tree, file_offset, 3354 file_offset + sectorsize - 1, 3355 EXTENT_NODATASUM); 3356 continue; 3357 } 3358 ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, 3359 page_offset(page) + pg_off); 3360 if (ret < 0) { 3361 const int nr_bit = (pg_off - offset_in_page(start)) >> 3362 root->fs_info->sectorsize_bits; 3363 3364 result |= (1U << nr_bit); 3365 } 3366 } 3367 return result; 3368 } 3369 3370 /* 3371 * btrfs_add_delayed_iput - perform a delayed iput on @inode 3372 * 3373 * @inode: The inode we want to perform iput on 3374 * 3375 * This function uses the generic vfs_inode::i_count to track whether we should 3376 * just decrement it (in case it's > 1) or if this is the last iput then link 3377 * the inode to the delayed iput machinery. Delayed iputs are processed at 3378 * transaction commit time/superblock commit/cleaner kthread. 
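 *
 * The atomic_add_unless() below only drops the reference while
 * i_count is greater than 1; the final reference is the one that
 * gets queued on fs_info->delayed_iputs and put later by the
 * cleaner kthread or the commit paths.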
3379 */ 3380 void btrfs_add_delayed_iput(struct inode *inode) 3381 { 3382 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3383 struct btrfs_inode *binode = BTRFS_I(inode); 3384 3385 if (atomic_add_unless(&inode->i_count, -1, 1)) 3386 return; 3387 3388 atomic_inc(&fs_info->nr_delayed_iputs); 3389 spin_lock(&fs_info->delayed_iput_lock); 3390 ASSERT(list_empty(&binode->delayed_iput)); 3391 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3392 spin_unlock(&fs_info->delayed_iput_lock); 3393 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3394 wake_up_process(fs_info->cleaner_kthread); 3395 } 3396 3397 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, 3398 struct btrfs_inode *inode) 3399 { 3400 list_del_init(&inode->delayed_iput); 3401 spin_unlock(&fs_info->delayed_iput_lock); 3402 iput(&inode->vfs_inode); 3403 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3404 wake_up(&fs_info->delayed_iputs_wait); 3405 spin_lock(&fs_info->delayed_iput_lock); 3406 } 3407 3408 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3409 struct btrfs_inode *inode) 3410 { 3411 if (!list_empty(&inode->delayed_iput)) { 3412 spin_lock(&fs_info->delayed_iput_lock); 3413 if (!list_empty(&inode->delayed_iput)) 3414 run_delayed_iput_locked(fs_info, inode); 3415 spin_unlock(&fs_info->delayed_iput_lock); 3416 } 3417 } 3418 3419 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3420 { 3421 3422 spin_lock(&fs_info->delayed_iput_lock); 3423 while (!list_empty(&fs_info->delayed_iputs)) { 3424 struct btrfs_inode *inode; 3425 3426 inode = list_first_entry(&fs_info->delayed_iputs, 3427 struct btrfs_inode, delayed_iput); 3428 run_delayed_iput_locked(fs_info, inode); 3429 cond_resched_lock(&fs_info->delayed_iput_lock); 3430 } 3431 spin_unlock(&fs_info->delayed_iput_lock); 3432 } 3433 3434 /** 3435 * Wait for flushing all delayed iputs 3436 * 3437 * @fs_info: the filesystem 3438 * 3439 * This will wait on any delayed iputs that are currently running with KILLABLE 3440 * set. Once they are all done running we will return, unless we are killed in 3441 * which case we return EINTR. This helps in user operations like fallocate etc 3442 * that might get blocked on the iputs. 3443 * 3444 * Return EINTR if we were killed, 0 if nothing's pending 3445 */ 3446 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) 3447 { 3448 int ret = wait_event_killable(fs_info->delayed_iputs_wait, 3449 atomic_read(&fs_info->nr_delayed_iputs) == 0); 3450 if (ret) 3451 return -EINTR; 3452 return 0; 3453 } 3454 3455 /* 3456 * This creates an orphan entry for the given inode in case something goes wrong 3457 * in the middle of an unlink. 3458 */ 3459 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3460 struct btrfs_inode *inode) 3461 { 3462 int ret; 3463 3464 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3465 if (ret && ret != -EEXIST) { 3466 btrfs_abort_transaction(trans, ret); 3467 return ret; 3468 } 3469 3470 return 0; 3471 } 3472 3473 /* 3474 * We have done the delete so we can go ahead and remove the orphan item for 3475 * this particular inode. 3476 */ 3477 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3478 struct btrfs_inode *inode) 3479 { 3480 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3481 } 3482 3483 /* 3484 * this cleans up any orphans that may be left on the list from the last use 3485 * of this root. 
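 *
 * Orphan items are keyed as (BTRFS_ORPHAN_OBJECTID,
 * BTRFS_ORPHAN_ITEM_KEY, inode number).  We walk them from the
 * highest offset downwards and either delete the stale item or
 * iput() the inode so its pending delete runs.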
3486 */ 3487 int btrfs_orphan_cleanup(struct btrfs_root *root) 3488 { 3489 struct btrfs_fs_info *fs_info = root->fs_info; 3490 struct btrfs_path *path; 3491 struct extent_buffer *leaf; 3492 struct btrfs_key key, found_key; 3493 struct btrfs_trans_handle *trans; 3494 struct inode *inode; 3495 u64 last_objectid = 0; 3496 int ret = 0, nr_unlink = 0; 3497 3498 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) 3499 return 0; 3500 3501 path = btrfs_alloc_path(); 3502 if (!path) { 3503 ret = -ENOMEM; 3504 goto out; 3505 } 3506 path->reada = READA_BACK; 3507 3508 key.objectid = BTRFS_ORPHAN_OBJECTID; 3509 key.type = BTRFS_ORPHAN_ITEM_KEY; 3510 key.offset = (u64)-1; 3511 3512 while (1) { 3513 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3514 if (ret < 0) 3515 goto out; 3516 3517 /* 3518 * if ret == 0 means we found what we were searching for, which 3519 * is weird, but possible, so only screw with path if we didn't 3520 * find the key and see if we have stuff that matches 3521 */ 3522 if (ret > 0) { 3523 ret = 0; 3524 if (path->slots[0] == 0) 3525 break; 3526 path->slots[0]--; 3527 } 3528 3529 /* pull out the item */ 3530 leaf = path->nodes[0]; 3531 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3532 3533 /* make sure the item matches what we want */ 3534 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3535 break; 3536 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3537 break; 3538 3539 /* release the path since we're done with it */ 3540 btrfs_release_path(path); 3541 3542 /* 3543 * this is where we are basically btrfs_lookup, without the 3544 * crossing root thing. we store the inode number in the 3545 * offset of the orphan item. 3546 */ 3547 3548 if (found_key.offset == last_objectid) { 3549 btrfs_err(fs_info, 3550 "Error removing orphan entry, stopping orphan cleanup"); 3551 ret = -EINVAL; 3552 goto out; 3553 } 3554 3555 last_objectid = found_key.offset; 3556 3557 found_key.objectid = found_key.offset; 3558 found_key.type = BTRFS_INODE_ITEM_KEY; 3559 found_key.offset = 0; 3560 inode = btrfs_iget(fs_info->sb, last_objectid, root); 3561 ret = PTR_ERR_OR_ZERO(inode); 3562 if (ret && ret != -ENOENT) 3563 goto out; 3564 3565 if (ret == -ENOENT && root == fs_info->tree_root) { 3566 struct btrfs_root *dead_root; 3567 int is_dead_root = 0; 3568 3569 /* 3570 * This is an orphan in the tree root. Currently these 3571 * could come from 2 sources: 3572 * a) a root (snapshot/subvolume) deletion in progress 3573 * b) a free space cache inode 3574 * We need to distinguish those two, as the orphan item 3575 * for a root must not get deleted before the deletion 3576 * of the snapshot/subvolume's tree completes. 3577 * 3578 * btrfs_find_orphan_roots() ran before us, which has 3579 * found all deleted roots and loaded them into 3580 * fs_info->fs_roots_radix. So here we can find if an 3581 * orphan item corresponds to a deleted root by looking 3582 * up the root from that radix tree. 3583 */ 3584 3585 spin_lock(&fs_info->fs_roots_radix_lock); 3586 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3587 (unsigned long)found_key.objectid); 3588 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3589 is_dead_root = 1; 3590 spin_unlock(&fs_info->fs_roots_radix_lock); 3591 3592 if (is_dead_root) { 3593 /* prevent this orphan from being found again */ 3594 key.offset = found_key.objectid - 1; 3595 continue; 3596 } 3597 3598 } 3599 3600 /* 3601 * If we have an inode with links, there are a couple of 3602 * possibilities: 3603 * 3604 * 1. 
We were halfway through creating fsverity metadata for the 3605 * file. In that case, the orphan item represents incomplete 3606 * fsverity metadata which must be cleaned up with 3607 * btrfs_drop_verity_items and deleting the orphan item. 3608 3609 * 2. Old kernels (before v3.12) used to create an 3610 * orphan item for truncate indicating that there were possibly 3611 * extent items past i_size that needed to be deleted. In v3.12, 3612 * truncate was changed to update i_size in sync with the extent 3613 * items, but the (useless) orphan item was still created. Since 3614 * v4.18, we don't create the orphan item for truncate at all. 3615 * 3616 * So, this item could mean that we need to do a truncate, but 3617 * only if this filesystem was last used on a pre-v3.12 kernel 3618 * and was not cleanly unmounted. The odds of that are quite 3619 * slim, and it's a pain to do the truncate now, so just delete 3620 * the orphan item. 3621 * 3622 * It's also possible that this orphan item was supposed to be 3623 * deleted but wasn't. The inode number may have been reused, 3624 * but either way, we can delete the orphan item. 3625 */ 3626 if (ret == -ENOENT || inode->i_nlink) { 3627 if (!ret) { 3628 ret = btrfs_drop_verity_items(BTRFS_I(inode)); 3629 iput(inode); 3630 if (ret) 3631 goto out; 3632 } 3633 trans = btrfs_start_transaction(root, 1); 3634 if (IS_ERR(trans)) { 3635 ret = PTR_ERR(trans); 3636 goto out; 3637 } 3638 btrfs_debug(fs_info, "auto deleting %Lu", 3639 found_key.objectid); 3640 ret = btrfs_del_orphan_item(trans, root, 3641 found_key.objectid); 3642 btrfs_end_transaction(trans); 3643 if (ret) 3644 goto out; 3645 continue; 3646 } 3647 3648 nr_unlink++; 3649 3650 /* this will do delete_inode and everything for us */ 3651 iput(inode); 3652 } 3653 /* release the path since we're done with it */ 3654 btrfs_release_path(path); 3655 3656 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3657 trans = btrfs_join_transaction(root); 3658 if (!IS_ERR(trans)) 3659 btrfs_end_transaction(trans); 3660 } 3661 3662 if (nr_unlink) 3663 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3664 3665 out: 3666 if (ret) 3667 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3668 btrfs_free_path(path); 3669 return ret; 3670 } 3671 3672 /* 3673 * very simple check to peek ahead in the leaf looking for xattrs. If we 3674 * don't find any xattrs, we know there can't be any acls. 
3675 * 3676 * slot is the slot the inode is in, objectid is the objectid of the inode 3677 */ 3678 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3679 int slot, u64 objectid, 3680 int *first_xattr_slot) 3681 { 3682 u32 nritems = btrfs_header_nritems(leaf); 3683 struct btrfs_key found_key; 3684 static u64 xattr_access = 0; 3685 static u64 xattr_default = 0; 3686 int scanned = 0; 3687 3688 if (!xattr_access) { 3689 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3690 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3691 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3692 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3693 } 3694 3695 slot++; 3696 *first_xattr_slot = -1; 3697 while (slot < nritems) { 3698 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3699 3700 /* we found a different objectid, there must not be acls */ 3701 if (found_key.objectid != objectid) 3702 return 0; 3703 3704 /* we found an xattr, assume we've got an acl */ 3705 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3706 if (*first_xattr_slot == -1) 3707 *first_xattr_slot = slot; 3708 if (found_key.offset == xattr_access || 3709 found_key.offset == xattr_default) 3710 return 1; 3711 } 3712 3713 /* 3714 * we found a key greater than an xattr key, there can't 3715 * be any acls later on 3716 */ 3717 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3718 return 0; 3719 3720 slot++; 3721 scanned++; 3722 3723 /* 3724 * it goes inode, inode backrefs, xattrs, extents, 3725 * so if there are a ton of hard links to an inode there can 3726 * be a lot of backrefs. Don't waste time searching too hard, 3727 * this is just an optimization 3728 */ 3729 if (scanned >= 8) 3730 break; 3731 } 3732 /* we hit the end of the leaf before we found an xattr or 3733 * something larger than an xattr. 
We have to assume the inode 3734 * has acls 3735 */ 3736 if (*first_xattr_slot == -1) 3737 *first_xattr_slot = slot; 3738 return 1; 3739 } 3740 3741 /* 3742 * read an inode from the btree into the in-memory inode 3743 */ 3744 static int btrfs_read_locked_inode(struct inode *inode, 3745 struct btrfs_path *in_path) 3746 { 3747 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3748 struct btrfs_path *path = in_path; 3749 struct extent_buffer *leaf; 3750 struct btrfs_inode_item *inode_item; 3751 struct btrfs_root *root = BTRFS_I(inode)->root; 3752 struct btrfs_key location; 3753 unsigned long ptr; 3754 int maybe_acls; 3755 u32 rdev; 3756 int ret; 3757 bool filled = false; 3758 int first_xattr_slot; 3759 3760 ret = btrfs_fill_inode(inode, &rdev); 3761 if (!ret) 3762 filled = true; 3763 3764 if (!path) { 3765 path = btrfs_alloc_path(); 3766 if (!path) 3767 return -ENOMEM; 3768 } 3769 3770 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3771 3772 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3773 if (ret) { 3774 if (path != in_path) 3775 btrfs_free_path(path); 3776 return ret; 3777 } 3778 3779 leaf = path->nodes[0]; 3780 3781 if (filled) 3782 goto cache_index; 3783 3784 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3785 struct btrfs_inode_item); 3786 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3787 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3788 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3789 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3790 btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); 3791 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, 3792 round_up(i_size_read(inode), fs_info->sectorsize)); 3793 3794 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3795 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3796 3797 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3798 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3799 3800 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3801 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3802 3803 BTRFS_I(inode)->i_otime.tv_sec = 3804 btrfs_timespec_sec(leaf, &inode_item->otime); 3805 BTRFS_I(inode)->i_otime.tv_nsec = 3806 btrfs_timespec_nsec(leaf, &inode_item->otime); 3807 3808 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3809 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3810 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3811 3812 inode_set_iversion_queried(inode, 3813 btrfs_inode_sequence(leaf, inode_item)); 3814 inode->i_generation = BTRFS_I(inode)->generation; 3815 inode->i_rdev = 0; 3816 rdev = btrfs_inode_rdev(leaf, inode_item); 3817 3818 BTRFS_I(inode)->index_cnt = (u64)-1; 3819 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), 3820 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); 3821 3822 cache_index: 3823 /* 3824 * If we were modified in the current generation and evicted from memory 3825 * and then re-read we need to do a full sync since we don't have any 3826 * idea about which extents were modified before we were evicted from 3827 * cache. 3828 * 3829 * This is required for both inode re-read from disk and delayed inode 3830 * in delayed_nodes_tree. 
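 *
 * A rough example of the sequence this guards against (sketch only):
 *
 *   write to the file                   # last_trans == current generation
 *   echo 2 > /proc/sys/vm/drop_caches   # inode evicted, modified extent
 *                                       # tracking is lost
 *   read the file's metadata back       # this function runs again
 *   xfs_io -c fsync file                # must fall back to a full sync,
 *                                       # we no longer know what changed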
3831 */ 3832 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3833 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3834 &BTRFS_I(inode)->runtime_flags); 3835 3836 /* 3837 * We don't persist the id of the transaction where an unlink operation 3838 * against the inode was last made. So here we assume the inode might 3839 * have been evicted, and therefore the exact value of last_unlink_trans 3840 * lost, and set it to last_trans to avoid metadata inconsistencies 3841 * between the inode and its parent if the inode is fsync'ed and the log 3842 * replayed. For example, in the scenario: 3843 * 3844 * touch mydir/foo 3845 * ln mydir/foo mydir/bar 3846 * sync 3847 * unlink mydir/bar 3848 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3849 * xfs_io -c fsync mydir/foo 3850 * <power failure> 3851 * mount fs, triggers fsync log replay 3852 * 3853 * We must make sure that when we fsync our inode foo we also log its 3854 * parent inode, otherwise after log replay the parent still has the 3855 * dentry with the "bar" name but our inode foo has a link count of 1 3856 * and doesn't have an inode ref with the name "bar" anymore. 3857 * 3858 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3859 * but it guarantees correctness at the expense of occasional full 3860 * transaction commits on fsync if our inode is a directory, or if our 3861 * inode is not a directory, logging its parent unnecessarily. 3862 */ 3863 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3864 3865 /* 3866 * Same logic as for last_unlink_trans. We don't persist the generation 3867 * of the last transaction where this inode was used for a reflink 3868 * operation, so after eviction and reloading the inode we must be 3869 * pessimistic and assume the last transaction that modified the inode. 
3870 */ 3871 BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; 3872 3873 path->slots[0]++; 3874 if (inode->i_nlink != 1 || 3875 path->slots[0] >= btrfs_header_nritems(leaf)) 3876 goto cache_acl; 3877 3878 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3879 if (location.objectid != btrfs_ino(BTRFS_I(inode))) 3880 goto cache_acl; 3881 3882 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3883 if (location.type == BTRFS_INODE_REF_KEY) { 3884 struct btrfs_inode_ref *ref; 3885 3886 ref = (struct btrfs_inode_ref *)ptr; 3887 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3888 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3889 struct btrfs_inode_extref *extref; 3890 3891 extref = (struct btrfs_inode_extref *)ptr; 3892 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3893 extref); 3894 } 3895 cache_acl: 3896 /* 3897 * try to precache a NULL acl entry for files that don't have 3898 * any xattrs or acls 3899 */ 3900 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3901 btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); 3902 if (first_xattr_slot != -1) { 3903 path->slots[0] = first_xattr_slot; 3904 ret = btrfs_load_inode_props(inode, path); 3905 if (ret) 3906 btrfs_err(fs_info, 3907 "error loading props for ino %llu (root %llu): %d", 3908 btrfs_ino(BTRFS_I(inode)), 3909 root->root_key.objectid, ret); 3910 } 3911 if (path != in_path) 3912 btrfs_free_path(path); 3913 3914 if (!maybe_acls) 3915 cache_no_acl(inode); 3916 3917 switch (inode->i_mode & S_IFMT) { 3918 case S_IFREG: 3919 inode->i_mapping->a_ops = &btrfs_aops; 3920 inode->i_fop = &btrfs_file_operations; 3921 inode->i_op = &btrfs_file_inode_operations; 3922 break; 3923 case S_IFDIR: 3924 inode->i_fop = &btrfs_dir_file_operations; 3925 inode->i_op = &btrfs_dir_inode_operations; 3926 break; 3927 case S_IFLNK: 3928 inode->i_op = &btrfs_symlink_inode_operations; 3929 inode_nohighmem(inode); 3930 inode->i_mapping->a_ops = &btrfs_aops; 3931 break; 3932 default: 3933 inode->i_op = &btrfs_special_inode_operations; 3934 init_special_inode(inode, inode->i_mode, rdev); 3935 break; 3936 } 3937 3938 btrfs_sync_inode_flags_to_i_flags(inode); 3939 return 0; 3940 } 3941 3942 /* 3943 * given a leaf and an inode, copy the inode fields into the leaf 3944 */ 3945 static void fill_inode_item(struct btrfs_trans_handle *trans, 3946 struct extent_buffer *leaf, 3947 struct btrfs_inode_item *item, 3948 struct inode *inode) 3949 { 3950 struct btrfs_map_token token; 3951 u64 flags; 3952 3953 btrfs_init_map_token(&token, leaf); 3954 3955 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3956 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3957 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); 3958 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3959 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3960 3961 btrfs_set_token_timespec_sec(&token, &item->atime, 3962 inode->i_atime.tv_sec); 3963 btrfs_set_token_timespec_nsec(&token, &item->atime, 3964 inode->i_atime.tv_nsec); 3965 3966 btrfs_set_token_timespec_sec(&token, &item->mtime, 3967 inode->i_mtime.tv_sec); 3968 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3969 inode->i_mtime.tv_nsec); 3970 3971 btrfs_set_token_timespec_sec(&token, &item->ctime, 3972 inode->i_ctime.tv_sec); 3973 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3974 inode->i_ctime.tv_nsec); 3975 3976 btrfs_set_token_timespec_sec(&token, &item->otime, 3977 BTRFS_I(inode)->i_otime.tv_sec); 3978 
btrfs_set_token_timespec_nsec(&token, &item->otime, 3979 BTRFS_I(inode)->i_otime.tv_nsec); 3980 3981 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); 3982 btrfs_set_token_inode_generation(&token, item, 3983 BTRFS_I(inode)->generation); 3984 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3985 btrfs_set_token_inode_transid(&token, item, trans->transid); 3986 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3987 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 3988 BTRFS_I(inode)->ro_flags); 3989 btrfs_set_token_inode_flags(&token, item, flags); 3990 btrfs_set_token_inode_block_group(&token, item, 0); 3991 } 3992 3993 /* 3994 * copy everything in the in-memory inode into the btree. 3995 */ 3996 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3997 struct btrfs_root *root, 3998 struct btrfs_inode *inode) 3999 { 4000 struct btrfs_inode_item *inode_item; 4001 struct btrfs_path *path; 4002 struct extent_buffer *leaf; 4003 int ret; 4004 4005 path = btrfs_alloc_path(); 4006 if (!path) 4007 return -ENOMEM; 4008 4009 ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); 4010 if (ret) { 4011 if (ret > 0) 4012 ret = -ENOENT; 4013 goto failed; 4014 } 4015 4016 leaf = path->nodes[0]; 4017 inode_item = btrfs_item_ptr(leaf, path->slots[0], 4018 struct btrfs_inode_item); 4019 4020 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); 4021 btrfs_mark_buffer_dirty(leaf); 4022 btrfs_set_inode_last_trans(trans, inode); 4023 ret = 0; 4024 failed: 4025 btrfs_free_path(path); 4026 return ret; 4027 } 4028 4029 /* 4030 * copy everything in the in-memory inode into the btree. 4031 */ 4032 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 4033 struct btrfs_root *root, 4034 struct btrfs_inode *inode) 4035 { 4036 struct btrfs_fs_info *fs_info = root->fs_info; 4037 int ret; 4038 4039 /* 4040 * If the inode is a free space inode, we can deadlock during commit 4041 * if we put it into the delayed code. 4042 * 4043 * The data relocation inode should also be directly updated 4044 * without delay 4045 */ 4046 if (!btrfs_is_free_space_inode(inode) 4047 && !btrfs_is_data_reloc_root(root) 4048 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 4049 btrfs_update_root_times(trans, root); 4050 4051 ret = btrfs_delayed_update_inode(trans, root, inode); 4052 if (!ret) 4053 btrfs_set_inode_last_trans(trans, inode); 4054 return ret; 4055 } 4056 4057 return btrfs_update_inode_item(trans, root, inode); 4058 } 4059 4060 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 4061 struct btrfs_root *root, struct btrfs_inode *inode) 4062 { 4063 int ret; 4064 4065 ret = btrfs_update_inode(trans, root, inode); 4066 if (ret == -ENOSPC) 4067 return btrfs_update_inode_item(trans, root, inode); 4068 return ret; 4069 } 4070 4071 /* 4072 * unlink helper that gets used here in inode.c and in the tree logging 4073 * recovery code. 
It remove a link in a directory with a given name, and 4074 * also drops the back refs in the inode to the directory 4075 */ 4076 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4077 struct btrfs_inode *dir, 4078 struct btrfs_inode *inode, 4079 const char *name, int name_len, 4080 struct btrfs_rename_ctx *rename_ctx) 4081 { 4082 struct btrfs_root *root = dir->root; 4083 struct btrfs_fs_info *fs_info = root->fs_info; 4084 struct btrfs_path *path; 4085 int ret = 0; 4086 struct btrfs_dir_item *di; 4087 u64 index; 4088 u64 ino = btrfs_ino(inode); 4089 u64 dir_ino = btrfs_ino(dir); 4090 4091 path = btrfs_alloc_path(); 4092 if (!path) { 4093 ret = -ENOMEM; 4094 goto out; 4095 } 4096 4097 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4098 name, name_len, -1); 4099 if (IS_ERR_OR_NULL(di)) { 4100 ret = di ? PTR_ERR(di) : -ENOENT; 4101 goto err; 4102 } 4103 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4104 if (ret) 4105 goto err; 4106 btrfs_release_path(path); 4107 4108 /* 4109 * If we don't have dir index, we have to get it by looking up 4110 * the inode ref, since we get the inode ref, remove it directly, 4111 * it is unnecessary to do delayed deletion. 4112 * 4113 * But if we have dir index, needn't search inode ref to get it. 4114 * Since the inode ref is close to the inode item, it is better 4115 * that we delay to delete it, and just do this deletion when 4116 * we update the inode item. 4117 */ 4118 if (inode->dir_index) { 4119 ret = btrfs_delayed_delete_inode_ref(inode); 4120 if (!ret) { 4121 index = inode->dir_index; 4122 goto skip_backref; 4123 } 4124 } 4125 4126 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 4127 dir_ino, &index); 4128 if (ret) { 4129 btrfs_info(fs_info, 4130 "failed to delete reference to %.*s, inode %llu parent %llu", 4131 name_len, name, ino, dir_ino); 4132 btrfs_abort_transaction(trans, ret); 4133 goto err; 4134 } 4135 skip_backref: 4136 if (rename_ctx) 4137 rename_ctx->index = index; 4138 4139 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4140 if (ret) { 4141 btrfs_abort_transaction(trans, ret); 4142 goto err; 4143 } 4144 4145 /* 4146 * If we are in a rename context, we don't need to update anything in the 4147 * log. That will be done later during the rename by btrfs_log_new_name(). 4148 * Besides that, doing it here would only cause extra unncessary btree 4149 * operations on the log tree, increasing latency for applications. 4150 */ 4151 if (!rename_ctx) { 4152 btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 4153 dir_ino); 4154 btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, 4155 index); 4156 } 4157 4158 /* 4159 * If we have a pending delayed iput we could end up with the final iput 4160 * being run in btrfs-cleaner context. If we have enough of these built 4161 * up we can end up burning a lot of time in btrfs-cleaner without any 4162 * way to throttle the unlinks. Since we're currently holding a ref on 4163 * the inode we can run the delayed iput here without any issues as the 4164 * final iput won't be done until after we drop the ref we're currently 4165 * holding. 
4166 */ 4167 btrfs_run_delayed_iput(fs_info, inode); 4168 err: 4169 btrfs_free_path(path); 4170 if (ret) 4171 goto out; 4172 4173 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); 4174 inode_inc_iversion(&inode->vfs_inode); 4175 inode_inc_iversion(&dir->vfs_inode); 4176 inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = 4177 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); 4178 ret = btrfs_update_inode(trans, root, dir); 4179 out: 4180 return ret; 4181 } 4182 4183 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4184 struct btrfs_inode *dir, struct btrfs_inode *inode, 4185 const char *name, int name_len) 4186 { 4187 int ret; 4188 ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); 4189 if (!ret) { 4190 drop_nlink(&inode->vfs_inode); 4191 ret = btrfs_update_inode(trans, inode->root, inode); 4192 } 4193 return ret; 4194 } 4195 4196 /* 4197 * helper to start transaction for unlink and rmdir. 4198 * 4199 * unlink and rmdir are special in btrfs, they do not always free space, so 4200 * if we cannot make our reservations the normal way try and see if there is 4201 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4202 * allow the unlink to occur. 4203 */ 4204 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 4205 { 4206 struct btrfs_root *root = BTRFS_I(dir)->root; 4207 4208 /* 4209 * 1 for the possible orphan item 4210 * 1 for the dir item 4211 * 1 for the dir index 4212 * 1 for the inode ref 4213 * 1 for the inode 4214 */ 4215 return btrfs_start_transaction_fallback_global_rsv(root, 5); 4216 } 4217 4218 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4219 { 4220 struct btrfs_trans_handle *trans; 4221 struct inode *inode = d_inode(dentry); 4222 int ret; 4223 4224 trans = __unlink_start_trans(dir); 4225 if (IS_ERR(trans)) 4226 return PTR_ERR(trans); 4227 4228 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4229 0); 4230 4231 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), 4232 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4233 dentry->d_name.len); 4234 if (ret) 4235 goto out; 4236 4237 if (inode->i_nlink == 0) { 4238 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4239 if (ret) 4240 goto out; 4241 } 4242 4243 out: 4244 btrfs_end_transaction(trans); 4245 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); 4246 return ret; 4247 } 4248 4249 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4250 struct inode *dir, struct dentry *dentry) 4251 { 4252 struct btrfs_root *root = BTRFS_I(dir)->root; 4253 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 4254 struct btrfs_path *path; 4255 struct extent_buffer *leaf; 4256 struct btrfs_dir_item *di; 4257 struct btrfs_key key; 4258 const char *name = dentry->d_name.name; 4259 int name_len = dentry->d_name.len; 4260 u64 index; 4261 int ret; 4262 u64 objectid; 4263 u64 dir_ino = btrfs_ino(BTRFS_I(dir)); 4264 4265 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { 4266 objectid = inode->root->root_key.objectid; 4267 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4268 objectid = inode->location.objectid; 4269 } else { 4270 WARN_ON(1); 4271 return -EINVAL; 4272 } 4273 4274 path = btrfs_alloc_path(); 4275 if (!path) 4276 return -ENOMEM; 4277 4278 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4279 name, name_len, -1); 4280 if (IS_ERR_OR_NULL(di)) { 4281 ret = di ? 
PTR_ERR(di) : -ENOENT; 4282 goto out; 4283 } 4284 4285 leaf = path->nodes[0]; 4286 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4287 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4288 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4289 if (ret) { 4290 btrfs_abort_transaction(trans, ret); 4291 goto out; 4292 } 4293 btrfs_release_path(path); 4294 4295 /* 4296 * This is a placeholder inode for a subvolume we didn't have a 4297 * reference to at the time of the snapshot creation. In the meantime 4298 * we could have renamed the real subvol link into our snapshot, so 4299 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 4300 * Instead simply lookup the dir_index_item for this entry so we can 4301 * remove it. Otherwise we know we have a ref to the root and we can 4302 * call btrfs_del_root_ref, and it _shouldn't_ fail. 4303 */ 4304 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { 4305 di = btrfs_search_dir_index_item(root, path, dir_ino, 4306 name, name_len); 4307 if (IS_ERR_OR_NULL(di)) { 4308 if (!di) 4309 ret = -ENOENT; 4310 else 4311 ret = PTR_ERR(di); 4312 btrfs_abort_transaction(trans, ret); 4313 goto out; 4314 } 4315 4316 leaf = path->nodes[0]; 4317 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4318 index = key.offset; 4319 btrfs_release_path(path); 4320 } else { 4321 ret = btrfs_del_root_ref(trans, objectid, 4322 root->root_key.objectid, dir_ino, 4323 &index, name, name_len); 4324 if (ret) { 4325 btrfs_abort_transaction(trans, ret); 4326 goto out; 4327 } 4328 } 4329 4330 ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); 4331 if (ret) { 4332 btrfs_abort_transaction(trans, ret); 4333 goto out; 4334 } 4335 4336 btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); 4337 inode_inc_iversion(dir); 4338 dir->i_mtime = dir->i_ctime = current_time(dir); 4339 ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); 4340 if (ret) 4341 btrfs_abort_transaction(trans, ret); 4342 out: 4343 btrfs_free_path(path); 4344 return ret; 4345 } 4346 4347 /* 4348 * Helper to check if the subvolume references other subvolumes or if it's 4349 * default. 
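 *
 * Concretely (matching the two lookups below): the default subvolume is
 * detected through the "default" dir item stored under
 * btrfs_super_root_dir() in the tree root, and child subvolumes are
 * detected by searching the tree root with a key of the form
 *
 *   (root->root_key.objectid, BTRFS_ROOT_REF_KEY, (u64)-1)
 *
 * and checking whether the item just before the insertion point is still a
 * ROOT_REF for this root.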
4350 */ 4351 static noinline int may_destroy_subvol(struct btrfs_root *root) 4352 { 4353 struct btrfs_fs_info *fs_info = root->fs_info; 4354 struct btrfs_path *path; 4355 struct btrfs_dir_item *di; 4356 struct btrfs_key key; 4357 u64 dir_id; 4358 int ret; 4359 4360 path = btrfs_alloc_path(); 4361 if (!path) 4362 return -ENOMEM; 4363 4364 /* Make sure this root isn't set as the default subvol */ 4365 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4366 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4367 dir_id, "default", 7, 0); 4368 if (di && !IS_ERR(di)) { 4369 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4370 if (key.objectid == root->root_key.objectid) { 4371 ret = -EPERM; 4372 btrfs_err(fs_info, 4373 "deleting default subvolume %llu is not allowed", 4374 key.objectid); 4375 goto out; 4376 } 4377 btrfs_release_path(path); 4378 } 4379 4380 key.objectid = root->root_key.objectid; 4381 key.type = BTRFS_ROOT_REF_KEY; 4382 key.offset = (u64)-1; 4383 4384 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4385 if (ret < 0) 4386 goto out; 4387 BUG_ON(ret == 0); 4388 4389 ret = 0; 4390 if (path->slots[0] > 0) { 4391 path->slots[0]--; 4392 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4393 if (key.objectid == root->root_key.objectid && 4394 key.type == BTRFS_ROOT_REF_KEY) 4395 ret = -ENOTEMPTY; 4396 } 4397 out: 4398 btrfs_free_path(path); 4399 return ret; 4400 } 4401 4402 /* Delete all dentries for inodes belonging to the root */ 4403 static void btrfs_prune_dentries(struct btrfs_root *root) 4404 { 4405 struct btrfs_fs_info *fs_info = root->fs_info; 4406 struct rb_node *node; 4407 struct rb_node *prev; 4408 struct btrfs_inode *entry; 4409 struct inode *inode; 4410 u64 objectid = 0; 4411 4412 if (!BTRFS_FS_ERROR(fs_info)) 4413 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4414 4415 spin_lock(&root->inode_lock); 4416 again: 4417 node = root->inode_tree.rb_node; 4418 prev = NULL; 4419 while (node) { 4420 prev = node; 4421 entry = rb_entry(node, struct btrfs_inode, rb_node); 4422 4423 if (objectid < btrfs_ino(entry)) 4424 node = node->rb_left; 4425 else if (objectid > btrfs_ino(entry)) 4426 node = node->rb_right; 4427 else 4428 break; 4429 } 4430 if (!node) { 4431 while (prev) { 4432 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4433 if (objectid <= btrfs_ino(entry)) { 4434 node = prev; 4435 break; 4436 } 4437 prev = rb_next(prev); 4438 } 4439 } 4440 while (node) { 4441 entry = rb_entry(node, struct btrfs_inode, rb_node); 4442 objectid = btrfs_ino(entry) + 1; 4443 inode = igrab(&entry->vfs_inode); 4444 if (inode) { 4445 spin_unlock(&root->inode_lock); 4446 if (atomic_read(&inode->i_count) > 1) 4447 d_prune_aliases(inode); 4448 /* 4449 * btrfs_drop_inode will have it removed from the inode 4450 * cache when its usage count hits zero. 
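 *
 * Note that root->inode_lock was dropped above, so after the iput()
 * below the rbtree may have changed under us; that is why the search is
 * restarted ("goto again") from the next objectid instead of continuing
 * with a possibly stale node.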
4451 */ 4452 iput(inode); 4453 cond_resched(); 4454 spin_lock(&root->inode_lock); 4455 goto again; 4456 } 4457 4458 if (cond_resched_lock(&root->inode_lock)) 4459 goto again; 4460 4461 node = rb_next(node); 4462 } 4463 spin_unlock(&root->inode_lock); 4464 } 4465 4466 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) 4467 { 4468 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 4469 struct btrfs_root *root = BTRFS_I(dir)->root; 4470 struct inode *inode = d_inode(dentry); 4471 struct btrfs_root *dest = BTRFS_I(inode)->root; 4472 struct btrfs_trans_handle *trans; 4473 struct btrfs_block_rsv block_rsv; 4474 u64 root_flags; 4475 int ret; 4476 4477 /* 4478 * Don't allow to delete a subvolume with send in progress. This is 4479 * inside the inode lock so the error handling that has to drop the bit 4480 * again is not run concurrently. 4481 */ 4482 spin_lock(&dest->root_item_lock); 4483 if (dest->send_in_progress) { 4484 spin_unlock(&dest->root_item_lock); 4485 btrfs_warn(fs_info, 4486 "attempt to delete subvolume %llu during send", 4487 dest->root_key.objectid); 4488 return -EPERM; 4489 } 4490 if (atomic_read(&dest->nr_swapfiles)) { 4491 spin_unlock(&dest->root_item_lock); 4492 btrfs_warn(fs_info, 4493 "attempt to delete subvolume %llu with active swapfile", 4494 root->root_key.objectid); 4495 return -EPERM; 4496 } 4497 root_flags = btrfs_root_flags(&dest->root_item); 4498 btrfs_set_root_flags(&dest->root_item, 4499 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4500 spin_unlock(&dest->root_item_lock); 4501 4502 down_write(&fs_info->subvol_sem); 4503 4504 ret = may_destroy_subvol(dest); 4505 if (ret) 4506 goto out_up_write; 4507 4508 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4509 /* 4510 * One for dir inode, 4511 * two for dir entries, 4512 * two for root ref/backref. 
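 * (5 items in total, which is the unit count passed to
 * btrfs_subvolume_reserve_metadata() right below.)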
4513 */ 4514 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4515 if (ret) 4516 goto out_up_write; 4517 4518 trans = btrfs_start_transaction(root, 0); 4519 if (IS_ERR(trans)) { 4520 ret = PTR_ERR(trans); 4521 goto out_release; 4522 } 4523 trans->block_rsv = &block_rsv; 4524 trans->bytes_reserved = block_rsv.size; 4525 4526 btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); 4527 4528 ret = btrfs_unlink_subvol(trans, dir, dentry); 4529 if (ret) { 4530 btrfs_abort_transaction(trans, ret); 4531 goto out_end_trans; 4532 } 4533 4534 ret = btrfs_record_root_in_trans(trans, dest); 4535 if (ret) { 4536 btrfs_abort_transaction(trans, ret); 4537 goto out_end_trans; 4538 } 4539 4540 memset(&dest->root_item.drop_progress, 0, 4541 sizeof(dest->root_item.drop_progress)); 4542 btrfs_set_root_drop_level(&dest->root_item, 0); 4543 btrfs_set_root_refs(&dest->root_item, 0); 4544 4545 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4546 ret = btrfs_insert_orphan_item(trans, 4547 fs_info->tree_root, 4548 dest->root_key.objectid); 4549 if (ret) { 4550 btrfs_abort_transaction(trans, ret); 4551 goto out_end_trans; 4552 } 4553 } 4554 4555 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4556 BTRFS_UUID_KEY_SUBVOL, 4557 dest->root_key.objectid); 4558 if (ret && ret != -ENOENT) { 4559 btrfs_abort_transaction(trans, ret); 4560 goto out_end_trans; 4561 } 4562 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4563 ret = btrfs_uuid_tree_remove(trans, 4564 dest->root_item.received_uuid, 4565 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4566 dest->root_key.objectid); 4567 if (ret && ret != -ENOENT) { 4568 btrfs_abort_transaction(trans, ret); 4569 goto out_end_trans; 4570 } 4571 } 4572 4573 free_anon_bdev(dest->anon_dev); 4574 dest->anon_dev = 0; 4575 out_end_trans: 4576 trans->block_rsv = NULL; 4577 trans->bytes_reserved = 0; 4578 ret = btrfs_end_transaction(trans); 4579 inode->i_flags |= S_DEAD; 4580 out_release: 4581 btrfs_subvolume_release_metadata(root, &block_rsv); 4582 out_up_write: 4583 up_write(&fs_info->subvol_sem); 4584 if (ret) { 4585 spin_lock(&dest->root_item_lock); 4586 root_flags = btrfs_root_flags(&dest->root_item); 4587 btrfs_set_root_flags(&dest->root_item, 4588 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4589 spin_unlock(&dest->root_item_lock); 4590 } else { 4591 d_invalidate(dentry); 4592 btrfs_prune_dentries(dest); 4593 ASSERT(dest->send_in_progress == 0); 4594 } 4595 4596 return ret; 4597 } 4598 4599 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4600 { 4601 struct inode *inode = d_inode(dentry); 4602 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 4603 int err = 0; 4604 struct btrfs_trans_handle *trans; 4605 u64 last_unlink_trans; 4606 4607 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4608 return -ENOTEMPTY; 4609 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { 4610 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { 4611 btrfs_err(fs_info, 4612 "extent tree v2 doesn't support snapshot deletion yet"); 4613 return -EOPNOTSUPP; 4614 } 4615 return btrfs_delete_subvolume(dir, dentry); 4616 } 4617 4618 trans = __unlink_start_trans(dir); 4619 if (IS_ERR(trans)) 4620 return PTR_ERR(trans); 4621 4622 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4623 err = btrfs_unlink_subvol(trans, dir, dentry); 4624 goto out; 4625 } 4626 4627 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4628 if (err) 4629 goto out; 4630 4631 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4632 4633 /* now 
the directory is empty */ 4634 err = btrfs_unlink_inode(trans, BTRFS_I(dir), 4635 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4636 dentry->d_name.len); 4637 if (!err) { 4638 btrfs_i_size_write(BTRFS_I(inode), 0); 4639 /* 4640 * Propagate the last_unlink_trans value of the deleted dir to 4641 * its parent directory. This is to prevent an unrecoverable 4642 * log tree in the case we do something like this: 4643 * 1) create dir foo 4644 * 2) create snapshot under dir foo 4645 * 3) delete the snapshot 4646 * 4) rmdir foo 4647 * 5) mkdir foo 4648 * 6) fsync foo or some file inside foo 4649 */ 4650 if (last_unlink_trans >= trans->transid) 4651 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4652 } 4653 out: 4654 btrfs_end_transaction(trans); 4655 btrfs_btree_balance_dirty(fs_info); 4656 4657 return err; 4658 } 4659 4660 /* 4661 * btrfs_truncate_block - read, zero a chunk and write a block 4662 * @inode - inode that we're zeroing 4663 * @from - the offset to start zeroing 4664 * @len - the length to zero, 0 to zero the entire range respective to the 4665 * offset 4666 * @front - zero up to the offset instead of from the offset on 4667 * 4668 * This will find the block for the "from" offset and cow the block and zero the 4669 * part we want to zero. This is used with truncate and hole punching. 4670 */ 4671 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, 4672 int front) 4673 { 4674 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4675 struct address_space *mapping = inode->vfs_inode.i_mapping; 4676 struct extent_io_tree *io_tree = &inode->io_tree; 4677 struct btrfs_ordered_extent *ordered; 4678 struct extent_state *cached_state = NULL; 4679 struct extent_changeset *data_reserved = NULL; 4680 bool only_release_metadata = false; 4681 u32 blocksize = fs_info->sectorsize; 4682 pgoff_t index = from >> PAGE_SHIFT; 4683 unsigned offset = from & (blocksize - 1); 4684 struct page *page; 4685 gfp_t mask = btrfs_alloc_write_mask(mapping); 4686 size_t write_bytes = blocksize; 4687 int ret = 0; 4688 u64 block_start; 4689 u64 block_end; 4690 4691 if (IS_ALIGNED(offset, blocksize) && 4692 (!len || IS_ALIGNED(len, blocksize))) 4693 goto out; 4694 4695 block_start = round_down(from, blocksize); 4696 block_end = block_start + blocksize - 1; 4697 4698 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, 4699 blocksize); 4700 if (ret < 0) { 4701 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { 4702 /* For nocow case, no need to reserve data space */ 4703 only_release_metadata = true; 4704 } else { 4705 goto out; 4706 } 4707 } 4708 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); 4709 if (ret < 0) { 4710 if (!only_release_metadata) 4711 btrfs_free_reserved_data_space(inode, data_reserved, 4712 block_start, blocksize); 4713 goto out; 4714 } 4715 again: 4716 page = find_or_create_page(mapping, index, mask); 4717 if (!page) { 4718 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4719 blocksize, true); 4720 btrfs_delalloc_release_extents(inode, blocksize); 4721 ret = -ENOMEM; 4722 goto out; 4723 } 4724 ret = set_page_extent_mapped(page); 4725 if (ret < 0) 4726 goto out_unlock; 4727 4728 if (!PageUptodate(page)) { 4729 ret = btrfs_readpage(NULL, page); 4730 lock_page(page); 4731 if (page->mapping != mapping) { 4732 unlock_page(page); 4733 put_page(page); 4734 goto again; 4735 } 4736 if (!PageUptodate(page)) { 4737 ret = -EIO; 4738 goto out_unlock; 4739 } 4740 } 4741 wait_on_page_writeback(page); 4742 4743 
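	/*
	 * With the page locked, up to date and no longer under writeback, the
	 * remaining work is done against the extent io tree: lock the block
	 * range, back off and retry if an ordered extent is still pending on
	 * it, clear any stale delalloc/accounting bits, re-mark the range as
	 * delalloc, and only then zero the requested part of the page and
	 * dirty it.
	 */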
lock_extent_bits(io_tree, block_start, block_end, &cached_state); 4744 4745 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4746 if (ordered) { 4747 unlock_extent_cached(io_tree, block_start, block_end, 4748 &cached_state); 4749 unlock_page(page); 4750 put_page(page); 4751 btrfs_start_ordered_extent(ordered, 1); 4752 btrfs_put_ordered_extent(ordered); 4753 goto again; 4754 } 4755 4756 clear_extent_bit(&inode->io_tree, block_start, block_end, 4757 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4758 0, 0, &cached_state); 4759 4760 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 4761 &cached_state); 4762 if (ret) { 4763 unlock_extent_cached(io_tree, block_start, block_end, 4764 &cached_state); 4765 goto out_unlock; 4766 } 4767 4768 if (offset != blocksize) { 4769 if (!len) 4770 len = blocksize - offset; 4771 if (front) 4772 memzero_page(page, (block_start - page_offset(page)), 4773 offset); 4774 else 4775 memzero_page(page, (block_start - page_offset(page)) + offset, 4776 len); 4777 flush_dcache_page(page); 4778 } 4779 btrfs_page_clear_checked(fs_info, page, block_start, 4780 block_end + 1 - block_start); 4781 btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); 4782 unlock_extent_cached(io_tree, block_start, block_end, &cached_state); 4783 4784 if (only_release_metadata) 4785 set_extent_bit(&inode->io_tree, block_start, block_end, 4786 EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); 4787 4788 out_unlock: 4789 if (ret) { 4790 if (only_release_metadata) 4791 btrfs_delalloc_release_metadata(inode, blocksize, true); 4792 else 4793 btrfs_delalloc_release_space(inode, data_reserved, 4794 block_start, blocksize, true); 4795 } 4796 btrfs_delalloc_release_extents(inode, blocksize); 4797 unlock_page(page); 4798 put_page(page); 4799 out: 4800 if (only_release_metadata) 4801 btrfs_check_nocow_unlock(inode); 4802 extent_changeset_free(data_reserved); 4803 return ret; 4804 } 4805 4806 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, 4807 u64 offset, u64 len) 4808 { 4809 struct btrfs_fs_info *fs_info = root->fs_info; 4810 struct btrfs_trans_handle *trans; 4811 struct btrfs_drop_extents_args drop_args = { 0 }; 4812 int ret; 4813 4814 /* 4815 * If NO_HOLES is enabled, we don't need to do anything. 4816 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() 4817 * or btrfs_update_inode() will be called, which guarantee that the next 4818 * fsync will know this inode was changed and needs to be logged. 4819 */ 4820 if (btrfs_fs_incompat(fs_info, NO_HOLES)) 4821 return 0; 4822 4823 /* 4824 * 1 - for the one we're dropping 4825 * 1 - for the one we're adding 4826 * 1 - for updating the inode. 
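 *
 * (Those three items are what the "3" passed to btrfs_start_transaction()
 * below accounts for.)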
4827 */ 4828 trans = btrfs_start_transaction(root, 3); 4829 if (IS_ERR(trans)) 4830 return PTR_ERR(trans); 4831 4832 drop_args.start = offset; 4833 drop_args.end = offset + len; 4834 drop_args.drop_cache = true; 4835 4836 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 4837 if (ret) { 4838 btrfs_abort_transaction(trans, ret); 4839 btrfs_end_transaction(trans); 4840 return ret; 4841 } 4842 4843 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), 4844 offset, 0, 0, len, 0, len, 0, 0, 0); 4845 if (ret) { 4846 btrfs_abort_transaction(trans, ret); 4847 } else { 4848 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); 4849 btrfs_update_inode(trans, root, inode); 4850 } 4851 btrfs_end_transaction(trans); 4852 return ret; 4853 } 4854 4855 /* 4856 * This function puts in dummy file extents for the area we're creating a hole 4857 * for. So if we are truncating this file to a larger size we need to insert 4858 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4859 * the range between oldsize and size 4860 */ 4861 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) 4862 { 4863 struct btrfs_root *root = inode->root; 4864 struct btrfs_fs_info *fs_info = root->fs_info; 4865 struct extent_io_tree *io_tree = &inode->io_tree; 4866 struct extent_map *em = NULL; 4867 struct extent_state *cached_state = NULL; 4868 struct extent_map_tree *em_tree = &inode->extent_tree; 4869 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 4870 u64 block_end = ALIGN(size, fs_info->sectorsize); 4871 u64 last_byte; 4872 u64 cur_offset; 4873 u64 hole_size; 4874 int err = 0; 4875 4876 /* 4877 * If our size started in the middle of a block we need to zero out the 4878 * rest of the block before we expand the i_size, otherwise we could 4879 * expose stale data. 
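 *
 * As a concrete sketch, assuming a 4K sector size: expanding a file whose
 * oldsize is 5000 zeroes bytes 5000..8191 via btrfs_truncate_block() below,
 * and hole extents are then inserted starting at
 * hole_start = ALIGN(5000, 4096) = 8192, up to the new aligned size.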
4880 */ 4881 err = btrfs_truncate_block(inode, oldsize, 0, 0); 4882 if (err) 4883 return err; 4884 4885 if (size <= hole_start) 4886 return 0; 4887 4888 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, 4889 &cached_state); 4890 cur_offset = hole_start; 4891 while (1) { 4892 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 4893 block_end - cur_offset); 4894 if (IS_ERR(em)) { 4895 err = PTR_ERR(em); 4896 em = NULL; 4897 break; 4898 } 4899 last_byte = min(extent_map_end(em), block_end); 4900 last_byte = ALIGN(last_byte, fs_info->sectorsize); 4901 hole_size = last_byte - cur_offset; 4902 4903 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4904 struct extent_map *hole_em; 4905 4906 err = maybe_insert_hole(root, inode, cur_offset, 4907 hole_size); 4908 if (err) 4909 break; 4910 4911 err = btrfs_inode_set_file_extent_range(inode, 4912 cur_offset, hole_size); 4913 if (err) 4914 break; 4915 4916 btrfs_drop_extent_cache(inode, cur_offset, 4917 cur_offset + hole_size - 1, 0); 4918 hole_em = alloc_extent_map(); 4919 if (!hole_em) { 4920 btrfs_set_inode_full_sync(inode); 4921 goto next; 4922 } 4923 hole_em->start = cur_offset; 4924 hole_em->len = hole_size; 4925 hole_em->orig_start = cur_offset; 4926 4927 hole_em->block_start = EXTENT_MAP_HOLE; 4928 hole_em->block_len = 0; 4929 hole_em->orig_block_len = 0; 4930 hole_em->ram_bytes = hole_size; 4931 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4932 hole_em->generation = fs_info->generation; 4933 4934 while (1) { 4935 write_lock(&em_tree->lock); 4936 err = add_extent_mapping(em_tree, hole_em, 1); 4937 write_unlock(&em_tree->lock); 4938 if (err != -EEXIST) 4939 break; 4940 btrfs_drop_extent_cache(inode, cur_offset, 4941 cur_offset + 4942 hole_size - 1, 0); 4943 } 4944 free_extent_map(hole_em); 4945 } else { 4946 err = btrfs_inode_set_file_extent_range(inode, 4947 cur_offset, hole_size); 4948 if (err) 4949 break; 4950 } 4951 next: 4952 free_extent_map(em); 4953 em = NULL; 4954 cur_offset = last_byte; 4955 if (cur_offset >= block_end) 4956 break; 4957 } 4958 free_extent_map(em); 4959 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); 4960 return err; 4961 } 4962 4963 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4964 { 4965 struct btrfs_root *root = BTRFS_I(inode)->root; 4966 struct btrfs_trans_handle *trans; 4967 loff_t oldsize = i_size_read(inode); 4968 loff_t newsize = attr->ia_size; 4969 int mask = attr->ia_valid; 4970 int ret; 4971 4972 /* 4973 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 4974 * special case where we need to update the times despite not having 4975 * these flags set. For all other operations the VFS set these flags 4976 * explicitly if it wants a timestamp update. 4977 */ 4978 if (newsize != oldsize) { 4979 inode_inc_iversion(inode); 4980 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 4981 inode->i_ctime = inode->i_mtime = 4982 current_time(inode); 4983 } 4984 4985 if (newsize > oldsize) { 4986 /* 4987 * Don't do an expanding truncate while snapshotting is ongoing. 4988 * This is to ensure the snapshot captures a fully consistent 4989 * state of this file - if the snapshot captures this expanding 4990 * truncation, it must capture all writes that happened before 4991 * this truncation. 
4992 */ 4993 btrfs_drew_write_lock(&root->snapshot_lock); 4994 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); 4995 if (ret) { 4996 btrfs_drew_write_unlock(&root->snapshot_lock); 4997 return ret; 4998 } 4999 5000 trans = btrfs_start_transaction(root, 1); 5001 if (IS_ERR(trans)) { 5002 btrfs_drew_write_unlock(&root->snapshot_lock); 5003 return PTR_ERR(trans); 5004 } 5005 5006 i_size_write(inode, newsize); 5007 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 5008 pagecache_isize_extended(inode, oldsize, newsize); 5009 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5010 btrfs_drew_write_unlock(&root->snapshot_lock); 5011 btrfs_end_transaction(trans); 5012 } else { 5013 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5014 5015 if (btrfs_is_zoned(fs_info)) { 5016 ret = btrfs_wait_ordered_range(inode, 5017 ALIGN(newsize, fs_info->sectorsize), 5018 (u64)-1); 5019 if (ret) 5020 return ret; 5021 } 5022 5023 /* 5024 * We're truncating a file that used to have good data down to 5025 * zero. Make sure any new writes to the file get on disk 5026 * on close. 5027 */ 5028 if (newsize == 0) 5029 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5030 &BTRFS_I(inode)->runtime_flags); 5031 5032 truncate_setsize(inode, newsize); 5033 5034 inode_dio_wait(inode); 5035 5036 ret = btrfs_truncate(inode, newsize == oldsize); 5037 if (ret && inode->i_nlink) { 5038 int err; 5039 5040 /* 5041 * Truncate failed, so fix up the in-memory size. We 5042 * adjusted disk_i_size down as we removed extents, so 5043 * wait for disk_i_size to be stable and then update the 5044 * in-memory size to match. 5045 */ 5046 err = btrfs_wait_ordered_range(inode, 0, (u64)-1); 5047 if (err) 5048 return err; 5049 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5050 } 5051 } 5052 5053 return ret; 5054 } 5055 5056 static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 5057 struct iattr *attr) 5058 { 5059 struct inode *inode = d_inode(dentry); 5060 struct btrfs_root *root = BTRFS_I(inode)->root; 5061 int err; 5062 5063 if (btrfs_root_readonly(root)) 5064 return -EROFS; 5065 5066 err = setattr_prepare(mnt_userns, dentry, attr); 5067 if (err) 5068 return err; 5069 5070 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5071 err = btrfs_setsize(inode, attr); 5072 if (err) 5073 return err; 5074 } 5075 5076 if (attr->ia_valid) { 5077 setattr_copy(mnt_userns, inode, attr); 5078 inode_inc_iversion(inode); 5079 err = btrfs_dirty_inode(inode); 5080 5081 if (!err && attr->ia_valid & ATTR_MODE) 5082 err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); 5083 } 5084 5085 return err; 5086 } 5087 5088 /* 5089 * While truncating the inode pages during eviction, we get the VFS 5090 * calling btrfs_invalidate_folio() against each folio of the inode. This 5091 * is slow because the calls to btrfs_invalidate_folio() result in a 5092 * huge amount of calls to lock_extent_bits() and clear_extent_bit(), 5093 * which keep merging and splitting extent_state structures over and over, 5094 * wasting lots of time. 5095 * 5096 * Therefore if the inode is being evicted, let btrfs_invalidate_folio() 5097 * skip all those expensive operations on a per folio basis and do only 5098 * the ordered io finishing, while we release here the extent_map and 5099 * extent_state structures, without the excessive merging and splitting. 
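 *
 * The function below therefore works in two passes: it first drops every
 * extent_map left in the inode's extent map tree, and then walks the
 * io_tree, returning qgroup reserved space for ranges still flagged
 * EXTENT_DELALLOC before clearing the remaining locked/delalloc state.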
5100 */ 5101 static void evict_inode_truncate_pages(struct inode *inode) 5102 { 5103 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5104 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5105 struct rb_node *node; 5106 5107 ASSERT(inode->i_state & I_FREEING); 5108 truncate_inode_pages_final(&inode->i_data); 5109 5110 write_lock(&map_tree->lock); 5111 while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { 5112 struct extent_map *em; 5113 5114 node = rb_first_cached(&map_tree->map); 5115 em = rb_entry(node, struct extent_map, rb_node); 5116 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5117 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5118 remove_extent_mapping(map_tree, em); 5119 free_extent_map(em); 5120 if (need_resched()) { 5121 write_unlock(&map_tree->lock); 5122 cond_resched(); 5123 write_lock(&map_tree->lock); 5124 } 5125 } 5126 write_unlock(&map_tree->lock); 5127 5128 /* 5129 * Keep looping until we have no more ranges in the io tree. 5130 * We can have ongoing bios started by readahead that have 5131 * their endio callback (extent_io.c:end_bio_extent_readpage) 5132 * still in progress (unlocked the pages in the bio but did not yet 5133 * unlocked the ranges in the io tree). Therefore this means some 5134 * ranges can still be locked and eviction started because before 5135 * submitting those bios, which are executed by a separate task (work 5136 * queue kthread), inode references (inode->i_count) were not taken 5137 * (which would be dropped in the end io callback of each bio). 5138 * Therefore here we effectively end up waiting for those bios and 5139 * anyone else holding locked ranges without having bumped the inode's 5140 * reference count - if we don't do it, when they access the inode's 5141 * io_tree to unlock a range it may be too late, leading to an 5142 * use-after-free issue. 5143 */ 5144 spin_lock(&io_tree->lock); 5145 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5146 struct extent_state *state; 5147 struct extent_state *cached_state = NULL; 5148 u64 start; 5149 u64 end; 5150 unsigned state_flags; 5151 5152 node = rb_first(&io_tree->state); 5153 state = rb_entry(node, struct extent_state, rb_node); 5154 start = state->start; 5155 end = state->end; 5156 state_flags = state->state; 5157 spin_unlock(&io_tree->lock); 5158 5159 lock_extent_bits(io_tree, start, end, &cached_state); 5160 5161 /* 5162 * If still has DELALLOC flag, the extent didn't reach disk, 5163 * and its reserved space won't be freed by delayed_ref. 5164 * So we need to free its reserved space here. 5165 * (Refer to comment in btrfs_invalidate_folio, case 2) 5166 * 5167 * Note, end is the bytenr of last byte, so we need + 1 here. 5168 */ 5169 if (state_flags & EXTENT_DELALLOC) 5170 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, 5171 end - start + 1); 5172 5173 clear_extent_bit(io_tree, start, end, 5174 EXTENT_LOCKED | EXTENT_DELALLOC | 5175 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, 5176 &cached_state); 5177 5178 cond_resched(); 5179 spin_lock(&io_tree->lock); 5180 } 5181 spin_unlock(&io_tree->lock); 5182 } 5183 5184 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5185 struct btrfs_block_rsv *rsv) 5186 { 5187 struct btrfs_fs_info *fs_info = root->fs_info; 5188 struct btrfs_trans_handle *trans; 5189 u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); 5190 int ret; 5191 5192 /* 5193 * Eviction should be taking place at some place safe because of our 5194 * delayed iputs. 
However the normal flushing code will run delayed 5195 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. 5196 * 5197 * We reserve the delayed_refs_extra here again because we can't use 5198 * btrfs_start_transaction(root, 0) for the same deadlocky reason as 5199 * above. We reserve our extra bit here because we generate a ton of 5200 * delayed refs activity by truncating. 5201 * 5202 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, 5203 * if we fail to make this reservation we can re-try without the 5204 * delayed_refs_extra so we can make some forward progress. 5205 */ 5206 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, 5207 BTRFS_RESERVE_FLUSH_EVICT); 5208 if (ret) { 5209 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, 5210 BTRFS_RESERVE_FLUSH_EVICT); 5211 if (ret) { 5212 btrfs_warn(fs_info, 5213 "could not allocate space for delete; will truncate on mount"); 5214 return ERR_PTR(-ENOSPC); 5215 } 5216 delayed_refs_extra = 0; 5217 } 5218 5219 trans = btrfs_join_transaction(root); 5220 if (IS_ERR(trans)) 5221 return trans; 5222 5223 if (delayed_refs_extra) { 5224 trans->block_rsv = &fs_info->trans_block_rsv; 5225 trans->bytes_reserved = delayed_refs_extra; 5226 btrfs_block_rsv_migrate(rsv, trans->block_rsv, 5227 delayed_refs_extra, 1); 5228 } 5229 return trans; 5230 } 5231 5232 void btrfs_evict_inode(struct inode *inode) 5233 { 5234 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5235 struct btrfs_trans_handle *trans; 5236 struct btrfs_root *root = BTRFS_I(inode)->root; 5237 struct btrfs_block_rsv *rsv; 5238 int ret; 5239 5240 trace_btrfs_inode_evict(inode); 5241 5242 if (!root) { 5243 fsverity_cleanup_inode(inode); 5244 clear_inode(inode); 5245 return; 5246 } 5247 5248 evict_inode_truncate_pages(inode); 5249 5250 if (inode->i_nlink && 5251 ((btrfs_root_refs(&root->root_item) != 0 && 5252 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5253 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5254 goto no_delete; 5255 5256 if (is_bad_inode(inode)) 5257 goto no_delete; 5258 5259 btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); 5260 5261 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5262 goto no_delete; 5263 5264 if (inode->i_nlink > 0) { 5265 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5266 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5267 goto no_delete; 5268 } 5269 5270 /* 5271 * This makes sure the inode item in tree is uptodate and the space for 5272 * the inode update is released. 5273 */ 5274 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5275 if (ret) 5276 goto no_delete; 5277 5278 /* 5279 * This drops any pending insert or delete operations we have for this 5280 * inode. We could have a delayed dir index deletion queued up, but 5281 * we're removing the inode completely so that'll be taken care of in 5282 * the truncate. 
5283 */ 5284 btrfs_kill_delayed_inode_items(BTRFS_I(inode)); 5285 5286 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5287 if (!rsv) 5288 goto no_delete; 5289 rsv->size = btrfs_calc_metadata_size(fs_info, 1); 5290 rsv->failfast = 1; 5291 5292 btrfs_i_size_write(BTRFS_I(inode), 0); 5293 5294 while (1) { 5295 struct btrfs_truncate_control control = { 5296 .inode = BTRFS_I(inode), 5297 .ino = btrfs_ino(BTRFS_I(inode)), 5298 .new_size = 0, 5299 .min_type = 0, 5300 }; 5301 5302 trans = evict_refill_and_join(root, rsv); 5303 if (IS_ERR(trans)) 5304 goto free_rsv; 5305 5306 trans->block_rsv = rsv; 5307 5308 ret = btrfs_truncate_inode_items(trans, root, &control); 5309 trans->block_rsv = &fs_info->trans_block_rsv; 5310 btrfs_end_transaction(trans); 5311 btrfs_btree_balance_dirty(fs_info); 5312 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5313 goto free_rsv; 5314 else if (!ret) 5315 break; 5316 } 5317 5318 /* 5319 * Errors here aren't a big deal, it just means we leave orphan items in 5320 * the tree. They will be cleaned up on the next mount. If the inode 5321 * number gets reused, cleanup deletes the orphan item without doing 5322 * anything, and unlink reuses the existing orphan item. 5323 * 5324 * If it turns out that we are dropping too many of these, we might want 5325 * to add a mechanism for retrying these after a commit. 5326 */ 5327 trans = evict_refill_and_join(root, rsv); 5328 if (!IS_ERR(trans)) { 5329 trans->block_rsv = rsv; 5330 btrfs_orphan_del(trans, BTRFS_I(inode)); 5331 trans->block_rsv = &fs_info->trans_block_rsv; 5332 btrfs_end_transaction(trans); 5333 } 5334 5335 free_rsv: 5336 btrfs_free_block_rsv(fs_info, rsv); 5337 no_delete: 5338 /* 5339 * If we didn't successfully delete, the orphan item will still be in 5340 * the tree and we'll retry on the next mount. Again, we might also want 5341 * to retry these periodically in the future. 5342 */ 5343 btrfs_remove_delayed_node(BTRFS_I(inode)); 5344 fsverity_cleanup_inode(inode); 5345 clear_inode(inode); 5346 } 5347 5348 /* 5349 * Return the key found in the dir entry in the location pointer, fill @type 5350 * with BTRFS_FT_*, and return 0. 5351 * 5352 * If no dir entries were found, returns -ENOENT. 5353 * If found a corrupted location in dir entry, returns -EUCLEAN. 5354 */ 5355 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5356 struct btrfs_key *location, u8 *type) 5357 { 5358 const char *name = dentry->d_name.name; 5359 int namelen = dentry->d_name.len; 5360 struct btrfs_dir_item *di; 5361 struct btrfs_path *path; 5362 struct btrfs_root *root = BTRFS_I(dir)->root; 5363 int ret = 0; 5364 5365 path = btrfs_alloc_path(); 5366 if (!path) 5367 return -ENOMEM; 5368 5369 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), 5370 name, namelen, 0); 5371 if (IS_ERR_OR_NULL(di)) { 5372 ret = di ? 
PTR_ERR(di) : -ENOENT; 5373 goto out; 5374 } 5375 5376 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5377 if (location->type != BTRFS_INODE_ITEM_KEY && 5378 location->type != BTRFS_ROOT_ITEM_KEY) { 5379 ret = -EUCLEAN; 5380 btrfs_warn(root->fs_info, 5381 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5382 __func__, name, btrfs_ino(BTRFS_I(dir)), 5383 location->objectid, location->type, location->offset); 5384 } 5385 if (!ret) 5386 *type = btrfs_dir_type(path->nodes[0], di); 5387 out: 5388 btrfs_free_path(path); 5389 return ret; 5390 } 5391 5392 /* 5393 * when we hit a tree root in a directory, the btrfs part of the inode 5394 * needs to be changed to reflect the root directory of the tree root. This 5395 * is kind of like crossing a mount point. 5396 */ 5397 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5398 struct inode *dir, 5399 struct dentry *dentry, 5400 struct btrfs_key *location, 5401 struct btrfs_root **sub_root) 5402 { 5403 struct btrfs_path *path; 5404 struct btrfs_root *new_root; 5405 struct btrfs_root_ref *ref; 5406 struct extent_buffer *leaf; 5407 struct btrfs_key key; 5408 int ret; 5409 int err = 0; 5410 5411 path = btrfs_alloc_path(); 5412 if (!path) { 5413 err = -ENOMEM; 5414 goto out; 5415 } 5416 5417 err = -ENOENT; 5418 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5419 key.type = BTRFS_ROOT_REF_KEY; 5420 key.offset = location->objectid; 5421 5422 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5423 if (ret) { 5424 if (ret < 0) 5425 err = ret; 5426 goto out; 5427 } 5428 5429 leaf = path->nodes[0]; 5430 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5431 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || 5432 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5433 goto out; 5434 5435 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5436 (unsigned long)(ref + 1), 5437 dentry->d_name.len); 5438 if (ret) 5439 goto out; 5440 5441 btrfs_release_path(path); 5442 5443 new_root = btrfs_get_fs_root(fs_info, location->objectid, true); 5444 if (IS_ERR(new_root)) { 5445 err = PTR_ERR(new_root); 5446 goto out; 5447 } 5448 5449 *sub_root = new_root; 5450 location->objectid = btrfs_root_dirid(&new_root->root_item); 5451 location->type = BTRFS_INODE_ITEM_KEY; 5452 location->offset = 0; 5453 err = 0; 5454 out: 5455 btrfs_free_path(path); 5456 return err; 5457 } 5458 5459 static void inode_tree_add(struct inode *inode) 5460 { 5461 struct btrfs_root *root = BTRFS_I(inode)->root; 5462 struct btrfs_inode *entry; 5463 struct rb_node **p; 5464 struct rb_node *parent; 5465 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5466 u64 ino = btrfs_ino(BTRFS_I(inode)); 5467 5468 if (inode_unhashed(inode)) 5469 return; 5470 parent = NULL; 5471 spin_lock(&root->inode_lock); 5472 p = &root->inode_tree.rb_node; 5473 while (*p) { 5474 parent = *p; 5475 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5476 5477 if (ino < btrfs_ino(entry)) 5478 p = &parent->rb_left; 5479 else if (ino > btrfs_ino(entry)) 5480 p = &parent->rb_right; 5481 else { 5482 WARN_ON(!(entry->vfs_inode.i_state & 5483 (I_WILL_FREE | I_FREEING))); 5484 rb_replace_node(parent, new, &root->inode_tree); 5485 RB_CLEAR_NODE(parent); 5486 spin_unlock(&root->inode_lock); 5487 return; 5488 } 5489 } 5490 rb_link_node(new, parent, p); 5491 rb_insert_color(new, &root->inode_tree); 5492 spin_unlock(&root->inode_lock); 5493 } 5494 5495 static void inode_tree_del(struct btrfs_inode *inode) 5496 { 
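	/*
	 * Drop this inode from the root's rbtree of in-memory inodes; if that
	 * leaves the tree empty and the root holds no references anymore, the
	 * root can be queued as a dead root for cleanup.
	 */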
5497 struct btrfs_root *root = inode->root; 5498 int empty = 0; 5499 5500 spin_lock(&root->inode_lock); 5501 if (!RB_EMPTY_NODE(&inode->rb_node)) { 5502 rb_erase(&inode->rb_node, &root->inode_tree); 5503 RB_CLEAR_NODE(&inode->rb_node); 5504 empty = RB_EMPTY_ROOT(&root->inode_tree); 5505 } 5506 spin_unlock(&root->inode_lock); 5507 5508 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5509 spin_lock(&root->inode_lock); 5510 empty = RB_EMPTY_ROOT(&root->inode_tree); 5511 spin_unlock(&root->inode_lock); 5512 if (empty) 5513 btrfs_add_dead_root(root); 5514 } 5515 } 5516 5517 5518 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5519 { 5520 struct btrfs_iget_args *args = p; 5521 5522 inode->i_ino = args->ino; 5523 BTRFS_I(inode)->location.objectid = args->ino; 5524 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 5525 BTRFS_I(inode)->location.offset = 0; 5526 BTRFS_I(inode)->root = btrfs_grab_root(args->root); 5527 BUG_ON(args->root && !BTRFS_I(inode)->root); 5528 return 0; 5529 } 5530 5531 static int btrfs_find_actor(struct inode *inode, void *opaque) 5532 { 5533 struct btrfs_iget_args *args = opaque; 5534 5535 return args->ino == BTRFS_I(inode)->location.objectid && 5536 args->root == BTRFS_I(inode)->root; 5537 } 5538 5539 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, 5540 struct btrfs_root *root) 5541 { 5542 struct inode *inode; 5543 struct btrfs_iget_args args; 5544 unsigned long hashval = btrfs_inode_hash(ino, root); 5545 5546 args.ino = ino; 5547 args.root = root; 5548 5549 inode = iget5_locked(s, hashval, btrfs_find_actor, 5550 btrfs_init_locked_inode, 5551 (void *)&args); 5552 return inode; 5553 } 5554 5555 /* 5556 * Get an inode object given its inode number and corresponding root. 5557 * Path can be preallocated to prevent recursing back to iget through 5558 * allocator. NULL is also valid but may require an additional allocation 5559 * later. 5560 */ 5561 struct inode *btrfs_iget_path(struct super_block *s, u64 ino, 5562 struct btrfs_root *root, struct btrfs_path *path) 5563 { 5564 struct inode *inode; 5565 5566 inode = btrfs_iget_locked(s, ino, root); 5567 if (!inode) 5568 return ERR_PTR(-ENOMEM); 5569 5570 if (inode->i_state & I_NEW) { 5571 int ret; 5572 5573 ret = btrfs_read_locked_inode(inode, path); 5574 if (!ret) { 5575 inode_tree_add(inode); 5576 unlock_new_inode(inode); 5577 } else { 5578 iget_failed(inode); 5579 /* 5580 * ret > 0 can come from btrfs_search_slot called by 5581 * btrfs_read_locked_inode, this means the inode item 5582 * was not found. 
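			 * (btrfs_search_slot() returns 1 when the exact key does
			 * not exist in the tree, which is why a positive return
			 * value is turned into -ENOENT just below.)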
5583 */ 5584 if (ret > 0) 5585 ret = -ENOENT; 5586 inode = ERR_PTR(ret); 5587 } 5588 } 5589 5590 return inode; 5591 } 5592 5593 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) 5594 { 5595 return btrfs_iget_path(s, ino, root, NULL); 5596 } 5597 5598 static struct inode *new_simple_dir(struct super_block *s, 5599 struct btrfs_key *key, 5600 struct btrfs_root *root) 5601 { 5602 struct inode *inode = new_inode(s); 5603 5604 if (!inode) 5605 return ERR_PTR(-ENOMEM); 5606 5607 BTRFS_I(inode)->root = btrfs_grab_root(root); 5608 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 5609 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5610 5611 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 5612 /* 5613 * We only need lookup, the rest is read-only and there's no inode 5614 * associated with the dentry 5615 */ 5616 inode->i_op = &simple_dir_inode_operations; 5617 inode->i_opflags &= ~IOP_XATTR; 5618 inode->i_fop = &simple_dir_operations; 5619 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5620 inode->i_mtime = current_time(inode); 5621 inode->i_atime = inode->i_mtime; 5622 inode->i_ctime = inode->i_mtime; 5623 BTRFS_I(inode)->i_otime = inode->i_mtime; 5624 5625 return inode; 5626 } 5627 5628 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); 5629 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); 5630 static_assert(BTRFS_FT_DIR == FT_DIR); 5631 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); 5632 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); 5633 static_assert(BTRFS_FT_FIFO == FT_FIFO); 5634 static_assert(BTRFS_FT_SOCK == FT_SOCK); 5635 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); 5636 5637 static inline u8 btrfs_inode_type(struct inode *inode) 5638 { 5639 return fs_umode_to_ftype(inode->i_mode); 5640 } 5641 5642 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5643 { 5644 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 5645 struct inode *inode; 5646 struct btrfs_root *root = BTRFS_I(dir)->root; 5647 struct btrfs_root *sub_root = root; 5648 struct btrfs_key location; 5649 u8 di_type = 0; 5650 int ret = 0; 5651 5652 if (dentry->d_name.len > BTRFS_NAME_LEN) 5653 return ERR_PTR(-ENAMETOOLONG); 5654 5655 ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); 5656 if (ret < 0) 5657 return ERR_PTR(ret); 5658 5659 if (location.type == BTRFS_INODE_ITEM_KEY) { 5660 inode = btrfs_iget(dir->i_sb, location.objectid, root); 5661 if (IS_ERR(inode)) 5662 return inode; 5663 5664 /* Do extra check against inode mode with di_type */ 5665 if (btrfs_inode_type(inode) != di_type) { 5666 btrfs_crit(fs_info, 5667 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", 5668 inode->i_mode, btrfs_inode_type(inode), 5669 di_type); 5670 iput(inode); 5671 return ERR_PTR(-EUCLEAN); 5672 } 5673 return inode; 5674 } 5675 5676 ret = fixup_tree_root_location(fs_info, dir, dentry, 5677 &location, &sub_root); 5678 if (ret < 0) { 5679 if (ret != -ENOENT) 5680 inode = ERR_PTR(ret); 5681 else 5682 inode = new_simple_dir(dir->i_sb, &location, sub_root); 5683 } else { 5684 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); 5685 } 5686 if (root != sub_root) 5687 btrfs_put_root(sub_root); 5688 5689 if (!IS_ERR(inode) && root != sub_root) { 5690 down_read(&fs_info->cleanup_work_sem); 5691 if (!sb_rdonly(inode->i_sb)) 5692 ret = btrfs_orphan_cleanup(sub_root); 5693 up_read(&fs_info->cleanup_work_sem); 5694 if (ret) { 5695 iput(inode); 5696 inode = ERR_PTR(ret); 5697 } 5698 } 5699 5700 return inode; 5701 } 5702 5703 static int 
btrfs_dentry_delete(const struct dentry *dentry) 5704 { 5705 struct btrfs_root *root; 5706 struct inode *inode = d_inode(dentry); 5707 5708 if (!inode && !IS_ROOT(dentry)) 5709 inode = d_inode(dentry->d_parent); 5710 5711 if (inode) { 5712 root = BTRFS_I(inode)->root; 5713 if (btrfs_root_refs(&root->root_item) == 0) 5714 return 1; 5715 5716 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5717 return 1; 5718 } 5719 return 0; 5720 } 5721 5722 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5723 unsigned int flags) 5724 { 5725 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 5726 5727 if (inode == ERR_PTR(-ENOENT)) 5728 inode = NULL; 5729 return d_splice_alias(inode, dentry); 5730 } 5731 5732 /* 5733 * All this infrastructure exists because dir_emit can fault, and we are holding 5734 * the tree lock when doing readdir. For now just allocate a buffer and copy 5735 * our information into that, and then dir_emit from the buffer. This is 5736 * similar to what NFS does, only we don't keep the buffer around in pagecache 5737 * because I'm afraid I'll mess that up. Long term we need to make filldir do 5738 * copy_to_user_inatomic so we don't have to worry about page faulting under the 5739 * tree lock. 5740 */ 5741 static int btrfs_opendir(struct inode *inode, struct file *file) 5742 { 5743 struct btrfs_file_private *private; 5744 5745 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 5746 if (!private) 5747 return -ENOMEM; 5748 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 5749 if (!private->filldir_buf) { 5750 kfree(private); 5751 return -ENOMEM; 5752 } 5753 file->private_data = private; 5754 return 0; 5755 } 5756 5757 struct dir_entry { 5758 u64 ino; 5759 u64 offset; 5760 unsigned type; 5761 int name_len; 5762 }; 5763 5764 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 5765 { 5766 while (entries--) { 5767 struct dir_entry *entry = addr; 5768 char *name = (char *)(entry + 1); 5769 5770 ctx->pos = get_unaligned(&entry->offset); 5771 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 5772 get_unaligned(&entry->ino), 5773 get_unaligned(&entry->type))) 5774 return 1; 5775 addr += sizeof(struct dir_entry) + 5776 get_unaligned(&entry->name_len); 5777 ctx->pos++; 5778 } 5779 return 0; 5780 } 5781 5782 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 5783 { 5784 struct inode *inode = file_inode(file); 5785 struct btrfs_root *root = BTRFS_I(inode)->root; 5786 struct btrfs_file_private *private = file->private_data; 5787 struct btrfs_dir_item *di; 5788 struct btrfs_key key; 5789 struct btrfs_key found_key; 5790 struct btrfs_path *path; 5791 void *addr; 5792 struct list_head ins_list; 5793 struct list_head del_list; 5794 int ret; 5795 struct extent_buffer *leaf; 5796 int slot; 5797 char *name_ptr; 5798 int name_len; 5799 int entries = 0; 5800 int total_len = 0; 5801 bool put = false; 5802 struct btrfs_key location; 5803 5804 if (!dir_emit_dots(file, ctx)) 5805 return 0; 5806 5807 path = btrfs_alloc_path(); 5808 if (!path) 5809 return -ENOMEM; 5810 5811 addr = private->filldir_buf; 5812 path->reada = READA_FORWARD; 5813 5814 INIT_LIST_HEAD(&ins_list); 5815 INIT_LIST_HEAD(&del_list); 5816 put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); 5817 5818 again: 5819 key.type = BTRFS_DIR_INDEX_KEY; 5820 key.offset = ctx->pos; 5821 key.objectid = btrfs_ino(BTRFS_I(inode)); 5822 5823 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5824 if (ret < 0) 5825 goto 
err; 5826 5827 while (1) { 5828 struct dir_entry *entry; 5829 5830 leaf = path->nodes[0]; 5831 slot = path->slots[0]; 5832 if (slot >= btrfs_header_nritems(leaf)) { 5833 ret = btrfs_next_leaf(root, path); 5834 if (ret < 0) 5835 goto err; 5836 else if (ret > 0) 5837 break; 5838 continue; 5839 } 5840 5841 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5842 5843 if (found_key.objectid != key.objectid) 5844 break; 5845 if (found_key.type != BTRFS_DIR_INDEX_KEY) 5846 break; 5847 if (found_key.offset < ctx->pos) 5848 goto next; 5849 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 5850 goto next; 5851 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5852 name_len = btrfs_dir_name_len(leaf, di); 5853 if ((total_len + sizeof(struct dir_entry) + name_len) >= 5854 PAGE_SIZE) { 5855 btrfs_release_path(path); 5856 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5857 if (ret) 5858 goto nopos; 5859 addr = private->filldir_buf; 5860 entries = 0; 5861 total_len = 0; 5862 goto again; 5863 } 5864 5865 entry = addr; 5866 put_unaligned(name_len, &entry->name_len); 5867 name_ptr = (char *)(entry + 1); 5868 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), 5869 name_len); 5870 put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), 5871 &entry->type); 5872 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5873 put_unaligned(location.objectid, &entry->ino); 5874 put_unaligned(found_key.offset, &entry->offset); 5875 entries++; 5876 addr += sizeof(struct dir_entry) + name_len; 5877 total_len += sizeof(struct dir_entry) + name_len; 5878 next: 5879 path->slots[0]++; 5880 } 5881 btrfs_release_path(path); 5882 5883 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5884 if (ret) 5885 goto nopos; 5886 5887 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 5888 if (ret) 5889 goto nopos; 5890 5891 /* 5892 * Stop new entries from being returned after we return the last 5893 * entry. 5894 * 5895 * New directory entries are assigned a strictly increasing 5896 * offset. This means that new entries created during readdir 5897 * are *guaranteed* to be seen in the future by that readdir. 5898 * This has broken buggy programs which operate on names as 5899 * they're returned by readdir. Until we re-use freed offsets 5900 * we have this hack to stop new entries from being returned 5901 * under the assumption that they'll never reach this huge 5902 * offset. 5903 * 5904 * This is being careful not to overflow 32bit loff_t unless the 5905 * last entry requires it because doing so has broken 32bit apps 5906 * in the past. 5907 */ 5908 if (ctx->pos >= INT_MAX) 5909 ctx->pos = LLONG_MAX; 5910 else 5911 ctx->pos = INT_MAX; 5912 nopos: 5913 ret = 0; 5914 err: 5915 if (put) 5916 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); 5917 btrfs_free_path(path); 5918 return ret; 5919 } 5920 5921 /* 5922 * This is somewhat expensive, updating the tree every time the 5923 * inode changes. But, it is most likely to find the inode in cache. 5924 * FIXME, needs more benchmarking...there are no reasons other than performance 5925 * to keep or drop this code. 
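 *
 * The helper below joins the currently running transaction and calls
 * btrfs_update_inode(); if that fails with -ENOSPC or -EDQUOT it retries
 * with a full btrfs_start_transaction() so that space can be reserved.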
5926 */ 5927 static int btrfs_dirty_inode(struct inode *inode) 5928 { 5929 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5930 struct btrfs_root *root = BTRFS_I(inode)->root; 5931 struct btrfs_trans_handle *trans; 5932 int ret; 5933 5934 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5935 return 0; 5936 5937 trans = btrfs_join_transaction(root); 5938 if (IS_ERR(trans)) 5939 return PTR_ERR(trans); 5940 5941 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5942 if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { 5943 /* whoops, lets try again with the full transaction */ 5944 btrfs_end_transaction(trans); 5945 trans = btrfs_start_transaction(root, 1); 5946 if (IS_ERR(trans)) 5947 return PTR_ERR(trans); 5948 5949 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 5950 } 5951 btrfs_end_transaction(trans); 5952 if (BTRFS_I(inode)->delayed_node) 5953 btrfs_balance_delayed_items(fs_info); 5954 5955 return ret; 5956 } 5957 5958 /* 5959 * This is a copy of file_update_time. We need this so we can return error on 5960 * ENOSPC for updating the inode in the case of file write and mmap writes. 5961 */ 5962 static int btrfs_update_time(struct inode *inode, struct timespec64 *now, 5963 int flags) 5964 { 5965 struct btrfs_root *root = BTRFS_I(inode)->root; 5966 bool dirty = flags & ~S_VERSION; 5967 5968 if (btrfs_root_readonly(root)) 5969 return -EROFS; 5970 5971 if (flags & S_VERSION) 5972 dirty |= inode_maybe_inc_iversion(inode, dirty); 5973 if (flags & S_CTIME) 5974 inode->i_ctime = *now; 5975 if (flags & S_MTIME) 5976 inode->i_mtime = *now; 5977 if (flags & S_ATIME) 5978 inode->i_atime = *now; 5979 return dirty ? btrfs_dirty_inode(inode) : 0; 5980 } 5981 5982 /* 5983 * find the highest existing sequence number in a directory 5984 * and then set the in-memory index_cnt variable to reflect 5985 * free sequence numbers 5986 */ 5987 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 5988 { 5989 struct btrfs_root *root = inode->root; 5990 struct btrfs_key key, found_key; 5991 struct btrfs_path *path; 5992 struct extent_buffer *leaf; 5993 int ret; 5994 5995 key.objectid = btrfs_ino(inode); 5996 key.type = BTRFS_DIR_INDEX_KEY; 5997 key.offset = (u64)-1; 5998 5999 path = btrfs_alloc_path(); 6000 if (!path) 6001 return -ENOMEM; 6002 6003 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6004 if (ret < 0) 6005 goto out; 6006 /* FIXME: we should be able to handle this */ 6007 if (ret == 0) 6008 goto out; 6009 ret = 0; 6010 6011 if (path->slots[0] == 0) { 6012 inode->index_cnt = BTRFS_DIR_START_INDEX; 6013 goto out; 6014 } 6015 6016 path->slots[0]--; 6017 6018 leaf = path->nodes[0]; 6019 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6020 6021 if (found_key.objectid != btrfs_ino(inode) || 6022 found_key.type != BTRFS_DIR_INDEX_KEY) { 6023 inode->index_cnt = BTRFS_DIR_START_INDEX; 6024 goto out; 6025 } 6026 6027 inode->index_cnt = found_key.offset + 1; 6028 out: 6029 btrfs_free_path(path); 6030 return ret; 6031 } 6032 6033 /* 6034 * helper to find a free sequence number in a given directory. 
This current 6035 * code is very simple, later versions will do smarter things in the btree 6036 */ 6037 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6038 { 6039 int ret = 0; 6040 6041 if (dir->index_cnt == (u64)-1) { 6042 ret = btrfs_inode_delayed_dir_index_count(dir); 6043 if (ret) { 6044 ret = btrfs_set_inode_index_count(dir); 6045 if (ret) 6046 return ret; 6047 } 6048 } 6049 6050 *index = dir->index_cnt; 6051 dir->index_cnt++; 6052 6053 return ret; 6054 } 6055 6056 static int btrfs_insert_inode_locked(struct inode *inode) 6057 { 6058 struct btrfs_iget_args args; 6059 6060 args.ino = BTRFS_I(inode)->location.objectid; 6061 args.root = BTRFS_I(inode)->root; 6062 6063 return insert_inode_locked4(inode, 6064 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6065 btrfs_find_actor, &args); 6066 } 6067 6068 /* 6069 * Inherit flags from the parent inode. 6070 * 6071 * Currently only the compression flags and the cow flags are inherited. 6072 */ 6073 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 6074 { 6075 unsigned int flags; 6076 6077 if (!dir) 6078 return; 6079 6080 flags = BTRFS_I(dir)->flags; 6081 6082 if (flags & BTRFS_INODE_NOCOMPRESS) { 6083 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 6084 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 6085 } else if (flags & BTRFS_INODE_COMPRESS) { 6086 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; 6087 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 6088 } 6089 6090 if (flags & BTRFS_INODE_NODATACOW) { 6091 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 6092 if (S_ISREG(inode->i_mode)) 6093 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6094 } 6095 6096 btrfs_sync_inode_flags_to_i_flags(inode); 6097 } 6098 6099 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6100 struct btrfs_root *root, 6101 struct user_namespace *mnt_userns, 6102 struct inode *dir, 6103 const char *name, int name_len, 6104 u64 ref_objectid, u64 objectid, 6105 umode_t mode, u64 *index) 6106 { 6107 struct btrfs_fs_info *fs_info = root->fs_info; 6108 struct inode *inode; 6109 struct btrfs_inode_item *inode_item; 6110 struct btrfs_key *location; 6111 struct btrfs_path *path; 6112 struct btrfs_inode_ref *ref; 6113 struct btrfs_key key[2]; 6114 u32 sizes[2]; 6115 struct btrfs_item_batch batch; 6116 unsigned long ptr; 6117 unsigned int nofs_flag; 6118 int ret; 6119 6120 path = btrfs_alloc_path(); 6121 if (!path) 6122 return ERR_PTR(-ENOMEM); 6123 6124 nofs_flag = memalloc_nofs_save(); 6125 inode = new_inode(fs_info->sb); 6126 memalloc_nofs_restore(nofs_flag); 6127 if (!inode) { 6128 btrfs_free_path(path); 6129 return ERR_PTR(-ENOMEM); 6130 } 6131 6132 /* 6133 * O_TMPFILE, set link count to 0, so that after this point, 6134 * we fill in an inode item with the correct link count. 6135 */ 6136 if (!name) 6137 set_nlink(inode, 0); 6138 6139 /* 6140 * we have to initialize this early, so we can reclaim the inode 6141 * number if we fail afterwards in this function. 
6142 */ 6143 inode->i_ino = objectid; 6144 6145 if (dir && name) { 6146 trace_btrfs_inode_request(dir); 6147 6148 ret = btrfs_set_inode_index(BTRFS_I(dir), index); 6149 if (ret) { 6150 btrfs_free_path(path); 6151 iput(inode); 6152 return ERR_PTR(ret); 6153 } 6154 } else if (dir) { 6155 *index = 0; 6156 } 6157 /* 6158 * index_cnt is ignored for everything but a dir, 6159 * btrfs_set_inode_index_count has an explanation for the magic 6160 * number 6161 */ 6162 BTRFS_I(inode)->index_cnt = 2; 6163 BTRFS_I(inode)->dir_index = *index; 6164 BTRFS_I(inode)->root = btrfs_grab_root(root); 6165 BTRFS_I(inode)->generation = trans->transid; 6166 inode->i_generation = BTRFS_I(inode)->generation; 6167 6168 /* 6169 * We could have gotten an inode number from somebody who was fsynced 6170 * and then removed in this same transaction, so let's just set full 6171 * sync since it will be a full sync anyway and this will blow away the 6172 * old info in the log. 6173 */ 6174 btrfs_set_inode_full_sync(BTRFS_I(inode)); 6175 6176 key[0].objectid = objectid; 6177 key[0].type = BTRFS_INODE_ITEM_KEY; 6178 key[0].offset = 0; 6179 6180 sizes[0] = sizeof(struct btrfs_inode_item); 6181 6182 if (name) { 6183 /* 6184 * Start new inodes with an inode_ref. This is slightly more 6185 * efficient for small numbers of hard links since they will 6186 * be packed into one item. Extended refs will kick in if we 6187 * add more hard links than can fit in the ref item. 6188 */ 6189 key[1].objectid = objectid; 6190 key[1].type = BTRFS_INODE_REF_KEY; 6191 key[1].offset = ref_objectid; 6192 6193 sizes[1] = name_len + sizeof(*ref); 6194 } 6195 6196 location = &BTRFS_I(inode)->location; 6197 location->objectid = objectid; 6198 location->offset = 0; 6199 location->type = BTRFS_INODE_ITEM_KEY; 6200 6201 ret = btrfs_insert_inode_locked(inode); 6202 if (ret < 0) { 6203 iput(inode); 6204 goto fail; 6205 } 6206 6207 batch.keys = &key[0]; 6208 batch.data_sizes = &sizes[0]; 6209 batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); 6210 batch.nr = name ? 
2 : 1; 6211 ret = btrfs_insert_empty_items(trans, root, path, &batch); 6212 if (ret != 0) 6213 goto fail_unlock; 6214 6215 inode_init_owner(mnt_userns, inode, dir, mode); 6216 inode_set_bytes(inode, 0); 6217 6218 inode->i_mtime = current_time(inode); 6219 inode->i_atime = inode->i_mtime; 6220 inode->i_ctime = inode->i_mtime; 6221 BTRFS_I(inode)->i_otime = inode->i_mtime; 6222 6223 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6224 struct btrfs_inode_item); 6225 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, 6226 sizeof(*inode_item)); 6227 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6228 6229 if (name) { 6230 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6231 struct btrfs_inode_ref); 6232 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6233 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 6234 ptr = (unsigned long)(ref + 1); 6235 write_extent_buffer(path->nodes[0], name, ptr, name_len); 6236 } 6237 6238 btrfs_mark_buffer_dirty(path->nodes[0]); 6239 btrfs_free_path(path); 6240 6241 btrfs_inherit_iflags(inode, dir); 6242 6243 if (S_ISREG(mode)) { 6244 if (btrfs_test_opt(fs_info, NODATASUM)) 6245 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6246 if (btrfs_test_opt(fs_info, NODATACOW)) 6247 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6248 BTRFS_INODE_NODATASUM; 6249 } 6250 6251 inode_tree_add(inode); 6252 6253 trace_btrfs_inode_new(inode); 6254 btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); 6255 6256 btrfs_update_root_times(trans, root); 6257 6258 ret = btrfs_inode_inherit_props(trans, inode, dir); 6259 if (ret) 6260 btrfs_err(fs_info, 6261 "error inheriting props for ino %llu (root %llu): %d", 6262 btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); 6263 6264 return inode; 6265 6266 fail_unlock: 6267 discard_new_inode(inode); 6268 fail: 6269 if (dir && name) 6270 BTRFS_I(dir)->index_cnt--; 6271 btrfs_free_path(path); 6272 return ERR_PTR(ret); 6273 } 6274 6275 /* 6276 * Utility function to add 'inode' into 'parent_inode' with 6277 * a given name and a given sequence number. 6278 * If 'add_backref' is true, also insert a backref from the 6279 * inode to the parent directory.
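 *
 * If 'inode' is the root directory of a subvolume (its ino is
 * BTRFS_FIRST_FREE_OBJECTID), a root ref item is inserted in the tree of
 * tree roots instead of an inode ref.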
6280 */ 6281 int btrfs_add_link(struct btrfs_trans_handle *trans, 6282 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6283 const char *name, int name_len, int add_backref, u64 index) 6284 { 6285 int ret = 0; 6286 struct btrfs_key key; 6287 struct btrfs_root *root = parent_inode->root; 6288 u64 ino = btrfs_ino(inode); 6289 u64 parent_ino = btrfs_ino(parent_inode); 6290 6291 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6292 memcpy(&key, &inode->root->root_key, sizeof(key)); 6293 } else { 6294 key.objectid = ino; 6295 key.type = BTRFS_INODE_ITEM_KEY; 6296 key.offset = 0; 6297 } 6298 6299 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6300 ret = btrfs_add_root_ref(trans, key.objectid, 6301 root->root_key.objectid, parent_ino, 6302 index, name, name_len); 6303 } else if (add_backref) { 6304 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 6305 parent_ino, index); 6306 } 6307 6308 /* Nothing to clean up yet */ 6309 if (ret) 6310 return ret; 6311 6312 ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, 6313 btrfs_inode_type(&inode->vfs_inode), index); 6314 if (ret == -EEXIST || ret == -EOVERFLOW) 6315 goto fail_dir_item; 6316 else if (ret) { 6317 btrfs_abort_transaction(trans, ret); 6318 return ret; 6319 } 6320 6321 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6322 name_len * 2); 6323 inode_inc_iversion(&parent_inode->vfs_inode); 6324 /* 6325 * If we are replaying a log tree, we do not want to update the mtime 6326 * and ctime of the parent directory with the current time, since the 6327 * log replay procedure is responsible for setting them to their correct 6328 * values (the ones it had when the fsync was done). 6329 */ 6330 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { 6331 struct timespec64 now = current_time(&parent_inode->vfs_inode); 6332 6333 parent_inode->vfs_inode.i_mtime = now; 6334 parent_inode->vfs_inode.i_ctime = now; 6335 } 6336 ret = btrfs_update_inode(trans, root, parent_inode); 6337 if (ret) 6338 btrfs_abort_transaction(trans, ret); 6339 return ret; 6340 6341 fail_dir_item: 6342 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6343 u64 local_index; 6344 int err; 6345 err = btrfs_del_root_ref(trans, key.objectid, 6346 root->root_key.objectid, parent_ino, 6347 &local_index, name, name_len); 6348 if (err) 6349 btrfs_abort_transaction(trans, err); 6350 } else if (add_backref) { 6351 u64 local_index; 6352 int err; 6353 6354 err = btrfs_del_inode_ref(trans, root, name, name_len, 6355 ino, parent_ino, &local_index); 6356 if (err) 6357 btrfs_abort_transaction(trans, err); 6358 } 6359 6360 /* Return the original error code */ 6361 return ret; 6362 } 6363 6364 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6365 struct btrfs_inode *dir, struct dentry *dentry, 6366 struct btrfs_inode *inode, int backref, u64 index) 6367 { 6368 int err = btrfs_add_link(trans, dir, inode, 6369 dentry->d_name.name, dentry->d_name.len, 6370 backref, index); 6371 if (err > 0) 6372 err = -EEXIST; 6373 return err; 6374 } 6375 6376 static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, 6377 struct dentry *dentry, umode_t mode, dev_t rdev) 6378 { 6379 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6380 struct btrfs_trans_handle *trans; 6381 struct btrfs_root *root = BTRFS_I(dir)->root; 6382 struct inode *inode = NULL; 6383 int err; 6384 u64 objectid; 6385 u64 index = 0; 6386 6387 /* 6388 * 2 for inode item and ref 6389 * 2 for dir items 6390 * 1 for xattr if selinux is on 6391 */ 
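	/*
	 * Note (addition): the num_items value of 5 passed to
	 * btrfs_start_transaction() below is simply the sum of the
	 * reservation units listed above (2 + 2 + 1).
	 */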
6392 trans = btrfs_start_transaction(root, 5); 6393 if (IS_ERR(trans)) 6394 return PTR_ERR(trans); 6395 6396 err = btrfs_get_free_objectid(root, &objectid); 6397 if (err) 6398 goto out_unlock; 6399 6400 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6401 dentry->d_name.name, dentry->d_name.len, 6402 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6403 if (IS_ERR(inode)) { 6404 err = PTR_ERR(inode); 6405 inode = NULL; 6406 goto out_unlock; 6407 } 6408 6409 /* 6410 * If the active LSM wants to access the inode during 6411 * d_instantiate it needs these. Smack checks to see 6412 * if the filesystem supports xattrs by looking at the 6413 * ops vector. 6414 */ 6415 inode->i_op = &btrfs_special_inode_operations; 6416 init_special_inode(inode, inode->i_mode, rdev); 6417 6418 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6419 if (err) 6420 goto out_unlock; 6421 6422 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6423 0, index); 6424 if (err) 6425 goto out_unlock; 6426 6427 btrfs_update_inode(trans, root, BTRFS_I(inode)); 6428 d_instantiate_new(dentry, inode); 6429 6430 out_unlock: 6431 btrfs_end_transaction(trans); 6432 btrfs_btree_balance_dirty(fs_info); 6433 if (err && inode) { 6434 inode_dec_link_count(inode); 6435 discard_new_inode(inode); 6436 } 6437 return err; 6438 } 6439 6440 static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, 6441 struct dentry *dentry, umode_t mode, bool excl) 6442 { 6443 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6444 struct btrfs_trans_handle *trans; 6445 struct btrfs_root *root = BTRFS_I(dir)->root; 6446 struct inode *inode = NULL; 6447 int err; 6448 u64 objectid; 6449 u64 index = 0; 6450 6451 /* 6452 * 2 for inode item and ref 6453 * 2 for dir items 6454 * 1 for xattr if selinux is on 6455 */ 6456 trans = btrfs_start_transaction(root, 5); 6457 if (IS_ERR(trans)) 6458 return PTR_ERR(trans); 6459 6460 err = btrfs_get_free_objectid(root, &objectid); 6461 if (err) 6462 goto out_unlock; 6463 6464 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6465 dentry->d_name.name, dentry->d_name.len, 6466 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6467 if (IS_ERR(inode)) { 6468 err = PTR_ERR(inode); 6469 inode = NULL; 6470 goto out_unlock; 6471 } 6472 /* 6473 * If the active LSM wants to access the inode during 6474 * d_instantiate it needs these. Smack checks to see 6475 * if the filesystem supports xattrs by looking at the 6476 * ops vector. 
6477 */ 6478 inode->i_fop = &btrfs_file_operations; 6479 inode->i_op = &btrfs_file_inode_operations; 6480 inode->i_mapping->a_ops = &btrfs_aops; 6481 6482 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6483 if (err) 6484 goto out_unlock; 6485 6486 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6487 if (err) 6488 goto out_unlock; 6489 6490 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6491 0, index); 6492 if (err) 6493 goto out_unlock; 6494 6495 d_instantiate_new(dentry, inode); 6496 6497 out_unlock: 6498 btrfs_end_transaction(trans); 6499 if (err && inode) { 6500 inode_dec_link_count(inode); 6501 discard_new_inode(inode); 6502 } 6503 btrfs_btree_balance_dirty(fs_info); 6504 return err; 6505 } 6506 6507 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6508 struct dentry *dentry) 6509 { 6510 struct btrfs_trans_handle *trans = NULL; 6511 struct btrfs_root *root = BTRFS_I(dir)->root; 6512 struct inode *inode = d_inode(old_dentry); 6513 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6514 u64 index; 6515 int err; 6516 int drop_inode = 0; 6517 6518 /* do not allow sys_link's with other subvols of the same device */ 6519 if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) 6520 return -EXDEV; 6521 6522 if (inode->i_nlink >= BTRFS_LINK_MAX) 6523 return -EMLINK; 6524 6525 err = btrfs_set_inode_index(BTRFS_I(dir), &index); 6526 if (err) 6527 goto fail; 6528 6529 /* 6530 * 2 items for inode and inode ref 6531 * 2 items for dir items 6532 * 1 item for parent inode 6533 * 1 item for orphan item deletion if O_TMPFILE 6534 */ 6535 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6536 if (IS_ERR(trans)) { 6537 err = PTR_ERR(trans); 6538 trans = NULL; 6539 goto fail; 6540 } 6541 6542 /* There are several dir indexes for this inode, clear the cache. */ 6543 BTRFS_I(inode)->dir_index = 0ULL; 6544 inc_nlink(inode); 6545 inode_inc_iversion(inode); 6546 inode->i_ctime = current_time(inode); 6547 ihold(inode); 6548 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6549 6550 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6551 1, index); 6552 6553 if (err) { 6554 drop_inode = 1; 6555 } else { 6556 struct dentry *parent = dentry->d_parent; 6557 6558 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6559 if (err) 6560 goto fail; 6561 if (inode->i_nlink == 1) { 6562 /* 6563 * If new hard link count is 1, it's a file created 6564 * with open(2) O_TMPFILE flag. 
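		 * Such an inode was given an orphan item when it was created,
		 * so that it would be cleaned up if it was never linked; now
		 * that it has gained a name, delete that orphan item.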
6565 */ 6566 err = btrfs_orphan_del(trans, BTRFS_I(inode)); 6567 if (err) 6568 goto fail; 6569 } 6570 d_instantiate(dentry, inode); 6571 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); 6572 } 6573 6574 fail: 6575 if (trans) 6576 btrfs_end_transaction(trans); 6577 if (drop_inode) { 6578 inode_dec_link_count(inode); 6579 iput(inode); 6580 } 6581 btrfs_btree_balance_dirty(fs_info); 6582 return err; 6583 } 6584 6585 static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, 6586 struct dentry *dentry, umode_t mode) 6587 { 6588 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6589 struct inode *inode = NULL; 6590 struct btrfs_trans_handle *trans; 6591 struct btrfs_root *root = BTRFS_I(dir)->root; 6592 int err = 0; 6593 u64 objectid = 0; 6594 u64 index = 0; 6595 6596 /* 6597 * 2 items for inode and ref 6598 * 2 items for dir items 6599 * 1 for xattr if selinux is on 6600 */ 6601 trans = btrfs_start_transaction(root, 5); 6602 if (IS_ERR(trans)) 6603 return PTR_ERR(trans); 6604 6605 err = btrfs_get_free_objectid(root, &objectid); 6606 if (err) 6607 goto out_fail; 6608 6609 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6610 dentry->d_name.name, dentry->d_name.len, 6611 btrfs_ino(BTRFS_I(dir)), objectid, 6612 S_IFDIR | mode, &index); 6613 if (IS_ERR(inode)) { 6614 err = PTR_ERR(inode); 6615 inode = NULL; 6616 goto out_fail; 6617 } 6618 6619 /* these must be set before we unlock the inode */ 6620 inode->i_op = &btrfs_dir_inode_operations; 6621 inode->i_fop = &btrfs_dir_file_operations; 6622 6623 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6624 if (err) 6625 goto out_fail; 6626 6627 btrfs_i_size_write(BTRFS_I(inode), 0); 6628 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6629 if (err) 6630 goto out_fail; 6631 6632 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6633 dentry->d_name.name, 6634 dentry->d_name.len, 0, index); 6635 if (err) 6636 goto out_fail; 6637 6638 d_instantiate_new(dentry, inode); 6639 6640 out_fail: 6641 btrfs_end_transaction(trans); 6642 if (err && inode) { 6643 inode_dec_link_count(inode); 6644 discard_new_inode(inode); 6645 } 6646 btrfs_btree_balance_dirty(fs_info); 6647 return err; 6648 } 6649 6650 static noinline int uncompress_inline(struct btrfs_path *path, 6651 struct page *page, 6652 size_t pg_offset, u64 extent_offset, 6653 struct btrfs_file_extent_item *item) 6654 { 6655 int ret; 6656 struct extent_buffer *leaf = path->nodes[0]; 6657 char *tmp; 6658 size_t max_size; 6659 unsigned long inline_size; 6660 unsigned long ptr; 6661 int compress_type; 6662 6663 WARN_ON(pg_offset != 0); 6664 compress_type = btrfs_file_extent_compression(leaf, item); 6665 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6666 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); 6667 tmp = kmalloc(inline_size, GFP_NOFS); 6668 if (!tmp) 6669 return -ENOMEM; 6670 ptr = btrfs_file_extent_inline_start(item); 6671 6672 read_extent_buffer(leaf, tmp, ptr, inline_size); 6673 6674 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6675 ret = btrfs_decompress(compress_type, tmp, page, 6676 extent_offset, inline_size, max_size); 6677 6678 /* 6679 * decompression code contains a memset to fill in any space between the end 6680 * of the uncompressed data and the end of max_size in case the decompressed 6681 * data ends up shorter than ram_bytes. That doesn't cover the hole between 6682 * the end of an inline extent and the beginning of the next block, so we 6683 * cover that region here. 
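 *
 * For example (illustrative numbers only): with a 4K page, pg_offset == 0
 * and an inline extent whose ram_bytes is 1000, the memzero_page() below
 * clears page bytes 1000..4095.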
6684 */ 6685 6686 if (max_size + pg_offset < PAGE_SIZE) 6687 memzero_page(page, pg_offset + max_size, 6688 PAGE_SIZE - max_size - pg_offset); 6689 kfree(tmp); 6690 return ret; 6691 } 6692 6693 /** 6694 * btrfs_get_extent - Lookup the first extent overlapping a range in a file. 6695 * @inode: file to search in 6696 * @page: page to read extent data into if the extent is inline 6697 * @pg_offset: offset into @page to copy to 6698 * @start: file offset 6699 * @len: length of range starting at @start 6700 * 6701 * This returns the first &struct extent_map which overlaps with the given 6702 * range, reading it from the B-tree and caching it if necessary. Note that 6703 * there may be more extents which overlap the given range after the returned 6704 * extent_map. 6705 * 6706 * If @page is not NULL and the extent is inline, this also reads the extent 6707 * data directly into the page and marks the extent up to date in the io_tree. 6708 * 6709 * Return: ERR_PTR on error, non-NULL extent_map on success. 6710 */ 6711 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 6712 struct page *page, size_t pg_offset, 6713 u64 start, u64 len) 6714 { 6715 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6716 int ret = 0; 6717 u64 extent_start = 0; 6718 u64 extent_end = 0; 6719 u64 objectid = btrfs_ino(inode); 6720 int extent_type = -1; 6721 struct btrfs_path *path = NULL; 6722 struct btrfs_root *root = inode->root; 6723 struct btrfs_file_extent_item *item; 6724 struct extent_buffer *leaf; 6725 struct btrfs_key found_key; 6726 struct extent_map *em = NULL; 6727 struct extent_map_tree *em_tree = &inode->extent_tree; 6728 struct extent_io_tree *io_tree = &inode->io_tree; 6729 6730 read_lock(&em_tree->lock); 6731 em = lookup_extent_mapping(em_tree, start, len); 6732 read_unlock(&em_tree->lock); 6733 6734 if (em) { 6735 if (em->start > start || em->start + em->len <= start) 6736 free_extent_map(em); 6737 else if (em->block_start == EXTENT_MAP_INLINE && page) 6738 free_extent_map(em); 6739 else 6740 goto out; 6741 } 6742 em = alloc_extent_map(); 6743 if (!em) { 6744 ret = -ENOMEM; 6745 goto out; 6746 } 6747 em->start = EXTENT_MAP_HOLE; 6748 em->orig_start = EXTENT_MAP_HOLE; 6749 em->len = (u64)-1; 6750 em->block_len = (u64)-1; 6751 6752 path = btrfs_alloc_path(); 6753 if (!path) { 6754 ret = -ENOMEM; 6755 goto out; 6756 } 6757 6758 /* Chances are we'll be called again, so go ahead and do readahead */ 6759 path->reada = READA_FORWARD; 6760 6761 /* 6762 * The same explanation in load_free_space_cache applies here as well, 6763 * we only read when we're loading the free space cache, and at that 6764 * point the commit_root has everything we need. 6765 */ 6766 if (btrfs_is_free_space_inode(inode)) { 6767 path->search_commit_root = 1; 6768 path->skip_locking = 1; 6769 } 6770 6771 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 6772 if (ret < 0) { 6773 goto out; 6774 } else if (ret > 0) { 6775 if (path->slots[0] == 0) 6776 goto not_found; 6777 path->slots[0]--; 6778 ret = 0; 6779 } 6780 6781 leaf = path->nodes[0]; 6782 item = btrfs_item_ptr(leaf, path->slots[0], 6783 struct btrfs_file_extent_item); 6784 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6785 if (found_key.objectid != objectid || 6786 found_key.type != BTRFS_EXTENT_DATA_KEY) { 6787 /* 6788 * If we backup past the first extent we want to move forward 6789 * and see if there is an extent in front of us, otherwise we'll 6790 * say there is a hole for our whole search range which can 6791 * cause problems. 
6792 */ 6793 extent_end = start; 6794 goto next; 6795 } 6796 6797 extent_type = btrfs_file_extent_type(leaf, item); 6798 extent_start = found_key.offset; 6799 extent_end = btrfs_file_extent_end(path); 6800 if (extent_type == BTRFS_FILE_EXTENT_REG || 6801 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6802 /* Only regular file could have regular/prealloc extent */ 6803 if (!S_ISREG(inode->vfs_inode.i_mode)) { 6804 ret = -EUCLEAN; 6805 btrfs_crit(fs_info, 6806 "regular/prealloc extent found for non-regular inode %llu", 6807 btrfs_ino(inode)); 6808 goto out; 6809 } 6810 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 6811 extent_start); 6812 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6813 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 6814 path->slots[0], 6815 extent_start); 6816 } 6817 next: 6818 if (start >= extent_end) { 6819 path->slots[0]++; 6820 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6821 ret = btrfs_next_leaf(root, path); 6822 if (ret < 0) 6823 goto out; 6824 else if (ret > 0) 6825 goto not_found; 6826 6827 leaf = path->nodes[0]; 6828 } 6829 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6830 if (found_key.objectid != objectid || 6831 found_key.type != BTRFS_EXTENT_DATA_KEY) 6832 goto not_found; 6833 if (start + len <= found_key.offset) 6834 goto not_found; 6835 if (start > found_key.offset) 6836 goto next; 6837 6838 /* New extent overlaps with existing one */ 6839 em->start = start; 6840 em->orig_start = start; 6841 em->len = found_key.offset - start; 6842 em->block_start = EXTENT_MAP_HOLE; 6843 goto insert; 6844 } 6845 6846 btrfs_extent_item_to_extent_map(inode, path, item, !page, em); 6847 6848 if (extent_type == BTRFS_FILE_EXTENT_REG || 6849 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 6850 goto insert; 6851 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 6852 unsigned long ptr; 6853 char *map; 6854 size_t size; 6855 size_t extent_offset; 6856 size_t copy_size; 6857 6858 if (!page) 6859 goto out; 6860 6861 size = btrfs_file_extent_ram_bytes(leaf, item); 6862 extent_offset = page_offset(page) + pg_offset - extent_start; 6863 copy_size = min_t(u64, PAGE_SIZE - pg_offset, 6864 size - extent_offset); 6865 em->start = extent_start + extent_offset; 6866 em->len = ALIGN(copy_size, fs_info->sectorsize); 6867 em->orig_block_len = em->len; 6868 em->orig_start = em->start; 6869 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6870 6871 if (!PageUptodate(page)) { 6872 if (btrfs_file_extent_compression(leaf, item) != 6873 BTRFS_COMPRESS_NONE) { 6874 ret = uncompress_inline(path, page, pg_offset, 6875 extent_offset, item); 6876 if (ret) 6877 goto out; 6878 } else { 6879 map = kmap_local_page(page); 6880 read_extent_buffer(leaf, map + pg_offset, ptr, 6881 copy_size); 6882 if (pg_offset + copy_size < PAGE_SIZE) { 6883 memset(map + pg_offset + copy_size, 0, 6884 PAGE_SIZE - pg_offset - 6885 copy_size); 6886 } 6887 kunmap_local(map); 6888 } 6889 flush_dcache_page(page); 6890 } 6891 set_extent_uptodate(io_tree, em->start, 6892 extent_map_end(em) - 1, NULL, GFP_NOFS); 6893 goto insert; 6894 } 6895 not_found: 6896 em->start = start; 6897 em->orig_start = start; 6898 em->len = len; 6899 em->block_start = EXTENT_MAP_HOLE; 6900 insert: 6901 ret = 0; 6902 btrfs_release_path(path); 6903 if (em->start > start || extent_map_end(em) <= start) { 6904 btrfs_err(fs_info, 6905 "bad extent! 
em: [%llu %llu] passed [%llu %llu]", 6906 em->start, em->len, start, len); 6907 ret = -EIO; 6908 goto out; 6909 } 6910 6911 write_lock(&em_tree->lock); 6912 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 6913 write_unlock(&em_tree->lock); 6914 out: 6915 btrfs_free_path(path); 6916 6917 trace_btrfs_get_extent(root, inode, em); 6918 6919 if (ret) { 6920 free_extent_map(em); 6921 return ERR_PTR(ret); 6922 } 6923 return em; 6924 } 6925 6926 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 6927 u64 start, u64 len) 6928 { 6929 struct extent_map *em; 6930 struct extent_map *hole_em = NULL; 6931 u64 delalloc_start = start; 6932 u64 end; 6933 u64 delalloc_len; 6934 u64 delalloc_end; 6935 int err = 0; 6936 6937 em = btrfs_get_extent(inode, NULL, 0, start, len); 6938 if (IS_ERR(em)) 6939 return em; 6940 /* 6941 * If our em maps to: 6942 * - a hole or 6943 * - a pre-alloc extent, 6944 * there might actually be delalloc bytes behind it. 6945 */ 6946 if (em->block_start != EXTENT_MAP_HOLE && 6947 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6948 return em; 6949 else 6950 hole_em = em; 6951 6952 /* check to see if we've wrapped (len == -1 or similar) */ 6953 end = start + len; 6954 if (end < start) 6955 end = (u64)-1; 6956 else 6957 end -= 1; 6958 6959 em = NULL; 6960 6961 /* ok, we didn't find anything, lets look for delalloc */ 6962 delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, 6963 end, len, EXTENT_DELALLOC, 1); 6964 delalloc_end = delalloc_start + delalloc_len; 6965 if (delalloc_end < delalloc_start) 6966 delalloc_end = (u64)-1; 6967 6968 /* 6969 * We didn't find anything useful, return the original results from 6970 * get_extent() 6971 */ 6972 if (delalloc_start > end || delalloc_end <= start) { 6973 em = hole_em; 6974 hole_em = NULL; 6975 goto out; 6976 } 6977 6978 /* 6979 * Adjust the delalloc_start to make sure it doesn't go backwards from 6980 * the start they passed in 6981 */ 6982 delalloc_start = max(start, delalloc_start); 6983 delalloc_len = delalloc_end - delalloc_start; 6984 6985 if (delalloc_len > 0) { 6986 u64 hole_start; 6987 u64 hole_len; 6988 const u64 hole_end = extent_map_end(hole_em); 6989 6990 em = alloc_extent_map(); 6991 if (!em) { 6992 err = -ENOMEM; 6993 goto out; 6994 } 6995 6996 ASSERT(hole_em); 6997 /* 6998 * When btrfs_get_extent can't find anything it returns one 6999 * huge hole 7000 * 7001 * Make sure what it found really fits our range, and adjust to 7002 * make sure it is based on the start from the caller 7003 */ 7004 if (hole_end <= start || hole_em->start > end) { 7005 free_extent_map(hole_em); 7006 hole_em = NULL; 7007 } else { 7008 hole_start = max(hole_em->start, start); 7009 hole_len = hole_end - hole_start; 7010 } 7011 7012 if (hole_em && delalloc_start > hole_start) { 7013 /* 7014 * Our hole starts before our delalloc, so we have to 7015 * return just the parts of the hole that go until the 7016 * delalloc starts 7017 */ 7018 em->len = min(hole_len, delalloc_start - hole_start); 7019 em->start = hole_start; 7020 em->orig_start = hole_start; 7021 /* 7022 * Don't adjust block start at all, it is fixed at 7023 * EXTENT_MAP_HOLE 7024 */ 7025 em->block_start = hole_em->block_start; 7026 em->block_len = hole_len; 7027 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7028 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7029 } else { 7030 /* 7031 * Hole is out of passed range or it starts after 7032 * delalloc range 7033 */ 7034 em->start = delalloc_start; 7035 em->len = delalloc_len; 7036 em->orig_start = 
delalloc_start; 7037 em->block_start = EXTENT_MAP_DELALLOC; 7038 em->block_len = delalloc_len; 7039 } 7040 } else { 7041 return hole_em; 7042 } 7043 out: 7044 7045 free_extent_map(hole_em); 7046 if (err) { 7047 free_extent_map(em); 7048 return ERR_PTR(err); 7049 } 7050 return em; 7051 } 7052 7053 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, 7054 const u64 start, 7055 const u64 len, 7056 const u64 orig_start, 7057 const u64 block_start, 7058 const u64 block_len, 7059 const u64 orig_block_len, 7060 const u64 ram_bytes, 7061 const int type) 7062 { 7063 struct extent_map *em = NULL; 7064 int ret; 7065 7066 if (type != BTRFS_ORDERED_NOCOW) { 7067 em = create_io_em(inode, start, len, orig_start, block_start, 7068 block_len, orig_block_len, ram_bytes, 7069 BTRFS_COMPRESS_NONE, /* compress_type */ 7070 type); 7071 if (IS_ERR(em)) 7072 goto out; 7073 } 7074 ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, 7075 block_len, 0, 7076 (1 << type) | 7077 (1 << BTRFS_ORDERED_DIRECT), 7078 BTRFS_COMPRESS_NONE); 7079 if (ret) { 7080 if (em) { 7081 free_extent_map(em); 7082 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 7083 } 7084 em = ERR_PTR(ret); 7085 } 7086 out: 7087 7088 return em; 7089 } 7090 7091 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, 7092 u64 start, u64 len) 7093 { 7094 struct btrfs_root *root = inode->root; 7095 struct btrfs_fs_info *fs_info = root->fs_info; 7096 struct extent_map *em; 7097 struct btrfs_key ins; 7098 u64 alloc_hint; 7099 int ret; 7100 7101 alloc_hint = get_extent_allocation_hint(inode, start, len); 7102 ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 7103 0, alloc_hint, &ins, 1, 1); 7104 if (ret) 7105 return ERR_PTR(ret); 7106 7107 em = btrfs_create_dio_extent(inode, start, ins.offset, start, 7108 ins.objectid, ins.offset, ins.offset, 7109 ins.offset, BTRFS_ORDERED_REGULAR); 7110 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 7111 if (IS_ERR(em)) 7112 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 7113 1); 7114 7115 return em; 7116 } 7117 7118 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 7119 { 7120 struct btrfs_block_group *block_group; 7121 bool readonly = false; 7122 7123 block_group = btrfs_lookup_block_group(fs_info, bytenr); 7124 if (!block_group || block_group->ro) 7125 readonly = true; 7126 if (block_group) 7127 btrfs_put_block_group(block_group); 7128 return readonly; 7129 } 7130 7131 /* 7132 * Check if we can do nocow write into the range [@offset, @offset + @len) 7133 * 7134 * @offset: File offset 7135 * @len: The length to write, will be updated to the nocow writeable 7136 * range 7137 * @orig_start: (optional) Return the original file offset of the file extent 7138 * @orig_len: (optional) Return the original on-disk length of the file extent 7139 * @ram_bytes: (optional) Return the ram_bytes of the file extent 7140 * @strict: if true, omit optimizations that might force us into unnecessary 7141 * cow. e.g., don't trust generation number. 7142 * 7143 * Return: 7144 * >0 and update @len if we can do nocow write 7145 * 0 if we can't do nocow write 7146 * <0 if error happened 7147 * 7148 * NOTE: This only checks the file extents, caller is responsible to wait for 7149 * any ordered extents. 
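 *
 * A minimal usage sketch (illustrative only; the local names "pos" and
 * "write_bytes" are made up, and real callers also deal with locking and
 * ordered extents):
 *
 *	u64 len = write_bytes;
 *	u64 orig_start, orig_block_len, ram_bytes;
 *
 *	ret = can_nocow_extent(inode, pos, &len, &orig_start,
 *			       &orig_block_len, &ram_bytes, false);
 *	if (ret > 0)
 *		... the first "len" bytes at "pos" may be written in place ...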
7150 */ 7151 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7152 u64 *orig_start, u64 *orig_block_len, 7153 u64 *ram_bytes, bool strict) 7154 { 7155 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7156 struct btrfs_path *path; 7157 int ret; 7158 struct extent_buffer *leaf; 7159 struct btrfs_root *root = BTRFS_I(inode)->root; 7160 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7161 struct btrfs_file_extent_item *fi; 7162 struct btrfs_key key; 7163 u64 disk_bytenr; 7164 u64 backref_offset; 7165 u64 extent_end; 7166 u64 num_bytes; 7167 int slot; 7168 int found_type; 7169 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7170 7171 path = btrfs_alloc_path(); 7172 if (!path) 7173 return -ENOMEM; 7174 7175 ret = btrfs_lookup_file_extent(NULL, root, path, 7176 btrfs_ino(BTRFS_I(inode)), offset, 0); 7177 if (ret < 0) 7178 goto out; 7179 7180 slot = path->slots[0]; 7181 if (ret == 1) { 7182 if (slot == 0) { 7183 /* can't find the item, must cow */ 7184 ret = 0; 7185 goto out; 7186 } 7187 slot--; 7188 } 7189 ret = 0; 7190 leaf = path->nodes[0]; 7191 btrfs_item_key_to_cpu(leaf, &key, slot); 7192 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7193 key.type != BTRFS_EXTENT_DATA_KEY) { 7194 /* not our file or wrong item type, must cow */ 7195 goto out; 7196 } 7197 7198 if (key.offset > offset) { 7199 /* Wrong offset, must cow */ 7200 goto out; 7201 } 7202 7203 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7204 found_type = btrfs_file_extent_type(leaf, fi); 7205 if (found_type != BTRFS_FILE_EXTENT_REG && 7206 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7207 /* not a regular extent, must cow */ 7208 goto out; 7209 } 7210 7211 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7212 goto out; 7213 7214 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7215 if (extent_end <= offset) 7216 goto out; 7217 7218 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7219 if (disk_bytenr == 0) 7220 goto out; 7221 7222 if (btrfs_file_extent_compression(leaf, fi) || 7223 btrfs_file_extent_encryption(leaf, fi) || 7224 btrfs_file_extent_other_encoding(leaf, fi)) 7225 goto out; 7226 7227 /* 7228 * Do the same check as in btrfs_cross_ref_exist but without the 7229 * unnecessary search. 
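 *
 * Unless the caller asked for a strict check, an extent whose generation is
 * not newer than the root's last snapshot is assumed to be possibly shared
 * by a snapshot, so we bail out and COW instead of doing the full backref
 * walk.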
7230 */ 7231 if (!strict && 7232 (btrfs_file_extent_generation(leaf, fi) <= 7233 btrfs_root_last_snapshot(&root->root_item))) 7234 goto out; 7235 7236 backref_offset = btrfs_file_extent_offset(leaf, fi); 7237 7238 if (orig_start) { 7239 *orig_start = key.offset - backref_offset; 7240 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7241 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7242 } 7243 7244 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 7245 goto out; 7246 7247 num_bytes = min(offset + *len, extent_end) - offset; 7248 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7249 u64 range_end; 7250 7251 range_end = round_up(offset + num_bytes, 7252 root->fs_info->sectorsize) - 1; 7253 ret = test_range_bit(io_tree, offset, range_end, 7254 EXTENT_DELALLOC, 0, NULL); 7255 if (ret) { 7256 ret = -EAGAIN; 7257 goto out; 7258 } 7259 } 7260 7261 btrfs_release_path(path); 7262 7263 /* 7264 * look for other files referencing this extent, if we 7265 * find any we must cow 7266 */ 7267 7268 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), 7269 key.offset - backref_offset, disk_bytenr, 7270 strict); 7271 if (ret) { 7272 ret = 0; 7273 goto out; 7274 } 7275 7276 /* 7277 * adjust disk_bytenr and num_bytes to cover just the bytes 7278 * in this extent we are about to write. If there 7279 * are any csums in that range we have to cow in order 7280 * to keep the csums correct 7281 */ 7282 disk_bytenr += backref_offset; 7283 disk_bytenr += offset - key.offset; 7284 if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) 7285 goto out; 7286 /* 7287 * all of the above have passed, it is safe to overwrite this extent 7288 * without cow 7289 */ 7290 *len = num_bytes; 7291 ret = 1; 7292 out: 7293 btrfs_free_path(path); 7294 return ret; 7295 } 7296 7297 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7298 struct extent_state **cached_state, bool writing) 7299 { 7300 struct btrfs_ordered_extent *ordered; 7301 int ret = 0; 7302 7303 while (1) { 7304 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7305 cached_state); 7306 /* 7307 * We're concerned with the entire range that we're going to be 7308 * doing DIO to, so we need to make sure there's no ordered 7309 * extents in this range. 7310 */ 7311 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, 7312 lockend - lockstart + 1); 7313 7314 /* 7315 * We need to make sure there are no buffered pages in this 7316 * range either, we could have raced between the invalidate in 7317 * generic_file_direct_write and locking the extent. The 7318 * invalidate needs to happen so that reads after a write do not 7319 * get stale data. 7320 */ 7321 if (!ordered && 7322 (!writing || !filemap_range_has_page(inode->i_mapping, 7323 lockstart, lockend))) 7324 break; 7325 7326 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7327 cached_state); 7328 7329 if (ordered) { 7330 /* 7331 * If we are doing a DIO read and the ordered extent we 7332 * found is for a buffered write, we can not wait for it 7333 * to complete and retry, because if we do so we can 7334 * deadlock with concurrent buffered writes on page 7335 * locks. 
This happens only if our DIO read covers more 7336 * than one extent map, if at this point has already 7337 * created an ordered extent for a previous extent map 7338 * and locked its range in the inode's io tree, and a 7339 * concurrent write against that previous extent map's 7340 * range and this range started (we unlock the ranges 7341 * in the io tree only when the bios complete and 7342 * buffered writes always lock pages before attempting 7343 * to lock range in the io tree). 7344 */ 7345 if (writing || 7346 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7347 btrfs_start_ordered_extent(ordered, 1); 7348 else 7349 ret = -ENOTBLK; 7350 btrfs_put_ordered_extent(ordered); 7351 } else { 7352 /* 7353 * We could trigger writeback for this range (and wait 7354 * for it to complete) and then invalidate the pages for 7355 * this range (through invalidate_inode_pages2_range()), 7356 * but that can lead us to a deadlock with a concurrent 7357 * call to readahead (a buffered read or a defrag call 7358 * triggered a readahead) on a page lock due to an 7359 * ordered dio extent we created before but did not have 7360 * yet a corresponding bio submitted (whence it can not 7361 * complete), which makes readahead wait for that 7362 * ordered extent to complete while holding a lock on 7363 * that page. 7364 */ 7365 ret = -ENOTBLK; 7366 } 7367 7368 if (ret) 7369 break; 7370 7371 cond_resched(); 7372 } 7373 7374 return ret; 7375 } 7376 7377 /* The callers of this must take lock_extent() */ 7378 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, 7379 u64 len, u64 orig_start, u64 block_start, 7380 u64 block_len, u64 orig_block_len, 7381 u64 ram_bytes, int compress_type, 7382 int type) 7383 { 7384 struct extent_map_tree *em_tree; 7385 struct extent_map *em; 7386 int ret; 7387 7388 ASSERT(type == BTRFS_ORDERED_PREALLOC || 7389 type == BTRFS_ORDERED_COMPRESSED || 7390 type == BTRFS_ORDERED_NOCOW || 7391 type == BTRFS_ORDERED_REGULAR); 7392 7393 em_tree = &inode->extent_tree; 7394 em = alloc_extent_map(); 7395 if (!em) 7396 return ERR_PTR(-ENOMEM); 7397 7398 em->start = start; 7399 em->orig_start = orig_start; 7400 em->len = len; 7401 em->block_len = block_len; 7402 em->block_start = block_start; 7403 em->orig_block_len = orig_block_len; 7404 em->ram_bytes = ram_bytes; 7405 em->generation = -1; 7406 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7407 if (type == BTRFS_ORDERED_PREALLOC) { 7408 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7409 } else if (type == BTRFS_ORDERED_COMPRESSED) { 7410 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 7411 em->compress_type = compress_type; 7412 } 7413 7414 do { 7415 btrfs_drop_extent_cache(inode, em->start, 7416 em->start + em->len - 1, 0); 7417 write_lock(&em_tree->lock); 7418 ret = add_extent_mapping(em_tree, em, 1); 7419 write_unlock(&em_tree->lock); 7420 /* 7421 * The caller has taken lock_extent(), who could race with us 7422 * to add em? 7423 */ 7424 } while (ret == -EEXIST); 7425 7426 if (ret) { 7427 free_extent_map(em); 7428 return ERR_PTR(ret); 7429 } 7430 7431 /* em got 2 refs now, callers needs to do free_extent_map once. 
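	 * (One reference is held by the inode's extent map tree, the other is
	 * returned to the caller.)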
*/ 7432 return em; 7433 } 7434 7435 7436 static int btrfs_get_blocks_direct_write(struct extent_map **map, 7437 struct inode *inode, 7438 struct btrfs_dio_data *dio_data, 7439 u64 start, u64 len) 7440 { 7441 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7442 struct extent_map *em = *map; 7443 int type; 7444 u64 block_start, orig_start, orig_block_len, ram_bytes; 7445 bool can_nocow = false; 7446 bool space_reserved = false; 7447 int ret = 0; 7448 7449 /* 7450 * We don't allocate a new extent in the following cases 7451 * 7452 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7453 * existing extent. 7454 * 2) The extent is marked as PREALLOC. We're good to go here and can 7455 * just use the extent. 7456 * 7457 */ 7458 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7459 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7460 em->block_start != EXTENT_MAP_HOLE)) { 7461 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7462 type = BTRFS_ORDERED_PREALLOC; 7463 else 7464 type = BTRFS_ORDERED_NOCOW; 7465 len = min(len, em->len - (start - em->start)); 7466 block_start = em->block_start + (start - em->start); 7467 7468 if (can_nocow_extent(inode, start, &len, &orig_start, 7469 &orig_block_len, &ram_bytes, false) == 1 && 7470 btrfs_inc_nocow_writers(fs_info, block_start)) 7471 can_nocow = true; 7472 } 7473 7474 if (can_nocow) { 7475 struct extent_map *em2; 7476 7477 /* We can NOCOW, so only need to reserve metadata space. */ 7478 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); 7479 if (ret < 0) { 7480 /* Our caller expects us to free the input extent map. */ 7481 free_extent_map(em); 7482 *map = NULL; 7483 btrfs_dec_nocow_writers(fs_info, block_start); 7484 goto out; 7485 } 7486 space_reserved = true; 7487 7488 em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, 7489 orig_start, block_start, 7490 len, orig_block_len, 7491 ram_bytes, type); 7492 btrfs_dec_nocow_writers(fs_info, block_start); 7493 if (type == BTRFS_ORDERED_PREALLOC) { 7494 free_extent_map(em); 7495 *map = em = em2; 7496 } 7497 7498 if (IS_ERR(em2)) { 7499 ret = PTR_ERR(em2); 7500 goto out; 7501 } 7502 } else { 7503 const u64 prev_len = len; 7504 7505 /* Our caller expects us to free the input extent map. */ 7506 free_extent_map(em); 7507 *map = NULL; 7508 7509 /* We have to COW, so need to reserve metadata and data space. */ 7510 ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), 7511 &dio_data->data_reserved, 7512 start, len); 7513 if (ret < 0) 7514 goto out; 7515 space_reserved = true; 7516 7517 em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); 7518 if (IS_ERR(em)) { 7519 ret = PTR_ERR(em); 7520 goto out; 7521 } 7522 *map = em; 7523 len = min(len, em->len - (start - em->start)); 7524 if (len < prev_len) 7525 btrfs_delalloc_release_space(BTRFS_I(inode), 7526 dio_data->data_reserved, 7527 start + len, prev_len - len, 7528 true); 7529 } 7530 7531 /* 7532 * We have created our ordered extent, so we can now release our reservation 7533 * for an outstanding extent. 7534 */ 7535 btrfs_delalloc_release_extents(BTRFS_I(inode), len); 7536 7537 /* 7538 * Need to update the i_size under the extent lock so buffered 7539 * readers will get the updated i_size when we unlock. 
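 *
 * For example, a DIO write that extends the file must publish the new
 * i_size before the extent range is unlocked, otherwise a buffered read
 * racing with us could still see the old, smaller i_size and treat the
 * freshly written tail as if it were beyond EOF.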
7540 */ 7541 if (start + len > i_size_read(inode)) 7542 i_size_write(inode, start + len); 7543 out: 7544 if (ret && space_reserved) { 7545 btrfs_delalloc_release_extents(BTRFS_I(inode), len); 7546 if (can_nocow) { 7547 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); 7548 } else { 7549 btrfs_delalloc_release_space(BTRFS_I(inode), 7550 dio_data->data_reserved, 7551 start, len, true); 7552 extent_changeset_free(dio_data->data_reserved); 7553 dio_data->data_reserved = NULL; 7554 } 7555 } 7556 return ret; 7557 } 7558 7559 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, 7560 loff_t length, unsigned int flags, struct iomap *iomap, 7561 struct iomap *srcmap) 7562 { 7563 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7564 struct extent_map *em; 7565 struct extent_state *cached_state = NULL; 7566 struct btrfs_dio_data *dio_data = NULL; 7567 u64 lockstart, lockend; 7568 const bool write = !!(flags & IOMAP_WRITE); 7569 int ret = 0; 7570 u64 len = length; 7571 bool unlock_extents = false; 7572 7573 if (!write) 7574 len = min_t(u64, len, fs_info->sectorsize); 7575 7576 lockstart = start; 7577 lockend = start + len - 1; 7578 7579 /* 7580 * The generic stuff only does filemap_write_and_wait_range, which 7581 * isn't enough if we've written compressed pages to this area, so we 7582 * need to flush the dirty pages again to make absolutely sure that any 7583 * outstanding dirty pages are on disk. 7584 */ 7585 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7586 &BTRFS_I(inode)->runtime_flags)) { 7587 ret = filemap_fdatawrite_range(inode->i_mapping, start, 7588 start + length - 1); 7589 if (ret) 7590 return ret; 7591 } 7592 7593 dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); 7594 if (!dio_data) 7595 return -ENOMEM; 7596 7597 iomap->private = dio_data; 7598 7599 7600 /* 7601 * If this errors out it's because we couldn't invalidate pagecache for 7602 * this range and we need to fallback to buffered. 7603 */ 7604 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { 7605 ret = -ENOTBLK; 7606 goto err; 7607 } 7608 7609 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 7610 if (IS_ERR(em)) { 7611 ret = PTR_ERR(em); 7612 goto unlock_err; 7613 } 7614 7615 /* 7616 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7617 * io. INLINE is special, and we could probably kludge it in here, but 7618 * it's still buffered so for safety lets just fall back to the generic 7619 * buffered path. 7620 * 7621 * For COMPRESSED we _have_ to read the entire extent in so we can 7622 * decompress it, so there will be buffering required no matter what we 7623 * do, so go ahead and fallback to buffered. 7624 * 7625 * We return -ENOTBLK because that's what makes DIO go ahead and go back 7626 * to buffered IO. Don't blame me, this is the price we pay for using 7627 * the generic code. 7628 */ 7629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7630 em->block_start == EXTENT_MAP_INLINE) { 7631 free_extent_map(em); 7632 ret = -ENOTBLK; 7633 goto unlock_err; 7634 } 7635 7636 len = min(len, em->len - (start - em->start)); 7637 7638 /* 7639 * If we have a NOWAIT request and the range contains multiple extents 7640 * (or a mix of extents and holes), then we return -EAGAIN to make the 7641 * caller fallback to a context where it can do a blocking (without 7642 * NOWAIT) request. 
This way we avoid doing partial IO and returning 7643 * success to the caller, which is not optimal for writes and for reads 7644 * it can result in unexpected behaviour for an application. 7645 * 7646 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling 7647 * iomap_dio_rw(), we can end up returning less data than what the caller 7648 * asked for, resulting in an unexpected, and incorrect, short read. 7649 * That is, the caller asked to read N bytes and we return less than that, 7650 * which is wrong unless we are crossing EOF. This happens if we get a 7651 * page fault error when trying to fault in pages for the buffer that is 7652 * associated to the struct iov_iter passed to iomap_dio_rw(), and we 7653 * have previously submitted bios for other extents in the range, in 7654 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of 7655 * those bios have completed by the time we get the page fault error, 7656 * which we return to our caller - we should only return EIOCBQUEUED 7657 * after we have submitted bios for all the extents in the range. 7658 */ 7659 if ((flags & IOMAP_NOWAIT) && len < length) { 7660 free_extent_map(em); 7661 ret = -EAGAIN; 7662 goto unlock_err; 7663 } 7664 7665 if (write) { 7666 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, 7667 start, len); 7668 if (ret < 0) 7669 goto unlock_err; 7670 unlock_extents = true; 7671 /* Recalc len in case the new em is smaller than requested */ 7672 len = min(len, em->len - (start - em->start)); 7673 } else { 7674 /* 7675 * We need to unlock only the end area that we aren't using. 7676 * The rest is going to be unlocked by the endio routine. 7677 */ 7678 lockstart = start + len; 7679 if (lockstart < lockend) 7680 unlock_extents = true; 7681 } 7682 7683 if (unlock_extents) 7684 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 7685 lockstart, lockend, &cached_state); 7686 else 7687 free_extent_state(cached_state); 7688 7689 /* 7690 * Translate extent map information to iomap. 7691 * We trim the extents (and move the addr) even though iomap code does 7692 * that, since we have locked only the parts we are performing I/O in.
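 *
 * For example (illustrative numbers only): if the extent map covers the
 * file range [0, 1M) but this iteration only locked [512K, 640K), the
 * addr below is advanced 512K into the extent and the length is clamped
 * to 128K, so only the locked part is exposed to iomap.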
7693 */ 7694 if ((em->block_start == EXTENT_MAP_HOLE) || 7695 (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { 7696 iomap->addr = IOMAP_NULL_ADDR; 7697 iomap->type = IOMAP_HOLE; 7698 } else { 7699 iomap->addr = em->block_start + (start - em->start); 7700 iomap->type = IOMAP_MAPPED; 7701 } 7702 iomap->offset = start; 7703 iomap->bdev = fs_info->fs_devices->latest_dev->bdev; 7704 iomap->length = len; 7705 7706 if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) 7707 iomap->flags |= IOMAP_F_ZONE_APPEND; 7708 7709 free_extent_map(em); 7710 7711 return 0; 7712 7713 unlock_err: 7714 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7715 &cached_state); 7716 err: 7717 kfree(dio_data); 7718 7719 return ret; 7720 } 7721 7722 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, 7723 ssize_t written, unsigned int flags, struct iomap *iomap) 7724 { 7725 int ret = 0; 7726 struct btrfs_dio_data *dio_data = iomap->private; 7727 size_t submitted = dio_data->submitted; 7728 const bool write = !!(flags & IOMAP_WRITE); 7729 7730 if (!write && (iomap->type == IOMAP_HOLE)) { 7731 /* If reading from a hole, unlock and return */ 7732 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); 7733 goto out; 7734 } 7735 7736 if (submitted < length) { 7737 pos += submitted; 7738 length -= submitted; 7739 if (write) 7740 __endio_write_update_ordered(BTRFS_I(inode), pos, 7741 length, false); 7742 else 7743 unlock_extent(&BTRFS_I(inode)->io_tree, pos, 7744 pos + length - 1); 7745 ret = -ENOTBLK; 7746 } 7747 7748 if (write) 7749 extent_changeset_free(dio_data->data_reserved); 7750 out: 7751 kfree(dio_data); 7752 iomap->private = NULL; 7753 7754 return ret; 7755 } 7756 7757 static void btrfs_dio_private_put(struct btrfs_dio_private *dip) 7758 { 7759 /* 7760 * This implies a barrier so that stores to dio_bio->bi_status before 7761 * this and loads of dio_bio->bi_status after this are fully ordered. 
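 * (refcount_dec_and_test() has release semantics on the decrement and
 * acquire semantics when the count hits zero, so the final put observes
 * every bi_status update made from the endio of the cloned bios.)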
7762 */ 7763 if (!refcount_dec_and_test(&dip->refs)) 7764 return; 7765 7766 if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { 7767 __endio_write_update_ordered(BTRFS_I(dip->inode), 7768 dip->file_offset, 7769 dip->bytes, 7770 !dip->dio_bio->bi_status); 7771 } else { 7772 unlock_extent(&BTRFS_I(dip->inode)->io_tree, 7773 dip->file_offset, 7774 dip->file_offset + dip->bytes - 1); 7775 } 7776 7777 bio_endio(dip->dio_bio); 7778 kfree(dip); 7779 } 7780 7781 static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7782 int mirror_num, 7783 unsigned long bio_flags) 7784 { 7785 struct btrfs_dio_private *dip = bio->bi_private; 7786 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7787 blk_status_t ret; 7788 7789 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 7790 7791 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 7792 if (ret) 7793 return ret; 7794 7795 refcount_inc(&dip->refs); 7796 ret = btrfs_map_bio(fs_info, bio, mirror_num); 7797 if (ret) 7798 refcount_dec(&dip->refs); 7799 return ret; 7800 } 7801 7802 static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, 7803 struct btrfs_bio *bbio, 7804 const bool uptodate) 7805 { 7806 struct inode *inode = dip->inode; 7807 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 7808 const u32 sectorsize = fs_info->sectorsize; 7809 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 7810 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7811 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 7812 struct bio_vec bvec; 7813 struct bvec_iter iter; 7814 const u64 orig_file_offset = dip->file_offset; 7815 u64 start = orig_file_offset; 7816 u32 bio_offset = 0; 7817 blk_status_t err = BLK_STS_OK; 7818 7819 __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { 7820 unsigned int i, nr_sectors, pgoff; 7821 7822 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); 7823 pgoff = bvec.bv_offset; 7824 for (i = 0; i < nr_sectors; i++) { 7825 ASSERT(pgoff < PAGE_SIZE); 7826 if (uptodate && 7827 (!csum || !check_data_csum(inode, bbio, 7828 bio_offset, bvec.bv_page, 7829 pgoff, start))) { 7830 clean_io_failure(fs_info, failure_tree, io_tree, 7831 start, bvec.bv_page, 7832 btrfs_ino(BTRFS_I(inode)), 7833 pgoff); 7834 } else { 7835 int ret; 7836 7837 ASSERT((start - orig_file_offset) < UINT_MAX); 7838 ret = btrfs_repair_one_sector(inode, 7839 &bbio->bio, 7840 start - orig_file_offset, 7841 bvec.bv_page, pgoff, 7842 start, bbio->mirror_num, 7843 submit_dio_repair_bio); 7844 if (ret) 7845 err = errno_to_blk_status(ret); 7846 } 7847 start += sectorsize; 7848 ASSERT(bio_offset + sectorsize > bio_offset); 7849 bio_offset += sectorsize; 7850 pgoff += sectorsize; 7851 } 7852 } 7853 return err; 7854 } 7855 7856 static void __endio_write_update_ordered(struct btrfs_inode *inode, 7857 const u64 offset, const u64 bytes, 7858 const bool uptodate) 7859 { 7860 btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, 7861 finish_ordered_fn, uptodate); 7862 } 7863 7864 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, 7865 struct bio *bio, 7866 u64 dio_file_offset) 7867 { 7868 return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); 7869 } 7870 7871 static void btrfs_end_dio_bio(struct bio *bio) 7872 { 7873 struct btrfs_dio_private *dip = bio->bi_private; 7874 blk_status_t err = bio->bi_status; 7875 7876 if (err) 7877 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 7878 "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", 
7879 btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), 7880 bio->bi_opf, bio->bi_iter.bi_sector, 7881 bio->bi_iter.bi_size, err); 7882 7883 if (bio_op(bio) == REQ_OP_READ) 7884 err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); 7885 7886 if (err) 7887 dip->dio_bio->bi_status = err; 7888 7889 btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); 7890 7891 bio_put(bio); 7892 btrfs_dio_private_put(dip); 7893 } 7894 7895 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, 7896 struct inode *inode, u64 file_offset, int async_submit) 7897 { 7898 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7899 struct btrfs_dio_private *dip = bio->bi_private; 7900 bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; 7901 blk_status_t ret; 7902 7903 /* Check btrfs_submit_bio_hook() for rules about async submit. */ 7904 if (async_submit) 7905 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 7906 7907 if (!write) { 7908 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 7909 if (ret) 7910 goto err; 7911 } 7912 7913 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 7914 goto map; 7915 7916 if (write && async_submit) { 7917 ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, 7918 btrfs_submit_bio_start_direct_io); 7919 goto err; 7920 } else if (write) { 7921 /* 7922 * If we aren't doing async submit, calculate the csum of the 7923 * bio now. 7924 */ 7925 ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); 7926 if (ret) 7927 goto err; 7928 } else { 7929 u64 csum_offset; 7930 7931 csum_offset = file_offset - dip->file_offset; 7932 csum_offset >>= fs_info->sectorsize_bits; 7933 csum_offset *= fs_info->csum_size; 7934 btrfs_bio(bio)->csum = dip->csums + csum_offset; 7935 } 7936 map: 7937 ret = btrfs_map_bio(fs_info, bio, 0); 7938 err: 7939 return ret; 7940 } 7941 7942 /* 7943 * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked 7944 * or ordered extents whether or not we submit any bios. 
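 *
 * For reads with checksums enabled the csum array is allocated inline at
 * the end of the structure, sized from the bio length below, so a single
 * kzalloc() covers both the dip and its csums.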
7945 */ 7946 static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, 7947 struct inode *inode, 7948 loff_t file_offset) 7949 { 7950 const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 7951 const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 7952 size_t dip_size; 7953 struct btrfs_dio_private *dip; 7954 7955 dip_size = sizeof(*dip); 7956 if (!write && csum) { 7957 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7958 size_t nblocks; 7959 7960 nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; 7961 dip_size += fs_info->csum_size * nblocks; 7962 } 7963 7964 dip = kzalloc(dip_size, GFP_NOFS); 7965 if (!dip) 7966 return NULL; 7967 7968 dip->inode = inode; 7969 dip->file_offset = file_offset; 7970 dip->bytes = dio_bio->bi_iter.bi_size; 7971 dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; 7972 dip->dio_bio = dio_bio; 7973 refcount_set(&dip->refs, 1); 7974 return dip; 7975 } 7976 7977 static void btrfs_submit_direct(const struct iomap_iter *iter, 7978 struct bio *dio_bio, loff_t file_offset) 7979 { 7980 struct inode *inode = iter->inode; 7981 const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 7982 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7983 const bool raid56 = (btrfs_data_alloc_profile(fs_info) & 7984 BTRFS_BLOCK_GROUP_RAID56_MASK); 7985 struct btrfs_dio_private *dip; 7986 struct bio *bio; 7987 u64 start_sector; 7988 int async_submit = 0; 7989 u64 submit_len; 7990 u64 clone_offset = 0; 7991 u64 clone_len; 7992 u64 logical; 7993 int ret; 7994 blk_status_t status; 7995 struct btrfs_io_geometry geom; 7996 struct btrfs_dio_data *dio_data = iter->iomap.private; 7997 struct extent_map *em = NULL; 7998 7999 dip = btrfs_create_dio_private(dio_bio, inode, file_offset); 8000 if (!dip) { 8001 if (!write) { 8002 unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 8003 file_offset + dio_bio->bi_iter.bi_size - 1); 8004 } 8005 dio_bio->bi_status = BLK_STS_RESOURCE; 8006 bio_endio(dio_bio); 8007 return; 8008 } 8009 8010 if (!write) { 8011 /* 8012 * Load the csums up front to reduce csum tree searches and 8013 * contention when submitting bios. 8014 * 8015 * If we have csums disabled this will do nothing. 8016 */ 8017 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); 8018 if (status != BLK_STS_OK) 8019 goto out_err; 8020 } 8021 8022 start_sector = dio_bio->bi_iter.bi_sector; 8023 submit_len = dio_bio->bi_iter.bi_size; 8024 8025 do { 8026 logical = start_sector << 9; 8027 em = btrfs_get_chunk_map(fs_info, logical, submit_len); 8028 if (IS_ERR(em)) { 8029 status = errno_to_blk_status(PTR_ERR(em)); 8030 em = NULL; 8031 goto out_err_em; 8032 } 8033 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), 8034 logical, &geom); 8035 if (ret) { 8036 status = errno_to_blk_status(ret); 8037 goto out_err_em; 8038 } 8039 8040 clone_len = min(submit_len, geom.len); 8041 ASSERT(clone_len <= UINT_MAX); 8042 8043 /* 8044 * This will never fail as it's passing GPF_NOFS and 8045 * the allocation is backed by btrfs_bioset. 
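 * (A bioset allocation is backed by a mempool: with a sleeping GFP mask
 * it may block until memory is available, but it does not fail.)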
8046 */ 8047 bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); 8048 bio->bi_private = dip; 8049 bio->bi_end_io = btrfs_end_dio_bio; 8050 8051 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 8052 status = extract_ordered_extent(BTRFS_I(inode), bio, 8053 file_offset); 8054 if (status) { 8055 bio_put(bio); 8056 goto out_err; 8057 } 8058 } 8059 8060 ASSERT(submit_len >= clone_len); 8061 submit_len -= clone_len; 8062 8063 /* 8064 * Increase the count before we submit the bio so we know 8065 * the end IO handler won't happen before we increase the 8066 * count. Otherwise, the dip might get freed before we're 8067 * done setting it up. 8068 * 8069 * We transfer the initial reference to the last bio, so we 8070 * don't need to increment the reference count for the last one. 8071 */ 8072 if (submit_len > 0) { 8073 refcount_inc(&dip->refs); 8074 /* 8075 * If we are submitting more than one bio, submit them 8076 * all asynchronously. The exception is RAID 5 or 6, as 8077 * asynchronous checksums make it difficult to collect 8078 * full stripe writes. 8079 */ 8080 if (!raid56) 8081 async_submit = 1; 8082 } 8083 8084 status = btrfs_submit_dio_bio(bio, inode, file_offset, 8085 async_submit); 8086 if (status) { 8087 bio_put(bio); 8088 if (submit_len > 0) 8089 refcount_dec(&dip->refs); 8090 goto out_err_em; 8091 } 8092 8093 dio_data->submitted += clone_len; 8094 clone_offset += clone_len; 8095 start_sector += clone_len >> 9; 8096 file_offset += clone_len; 8097 8098 free_extent_map(em); 8099 } while (submit_len > 0); 8100 return; 8101 8102 out_err_em: 8103 free_extent_map(em); 8104 out_err: 8105 dip->dio_bio->bi_status = status; 8106 btrfs_dio_private_put(dip); 8107 } 8108 8109 const struct iomap_ops btrfs_dio_iomap_ops = { 8110 .iomap_begin = btrfs_dio_iomap_begin, 8111 .iomap_end = btrfs_dio_iomap_end, 8112 }; 8113 8114 const struct iomap_dio_ops btrfs_dio_ops = { 8115 .submit_io = btrfs_submit_direct, 8116 }; 8117 8118 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8119 u64 start, u64 len) 8120 { 8121 int ret; 8122 8123 ret = fiemap_prep(inode, fieinfo, start, &len, 0); 8124 if (ret) 8125 return ret; 8126 8127 return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); 8128 } 8129 8130 int btrfs_readpage(struct file *file, struct page *page) 8131 { 8132 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8133 u64 start = page_offset(page); 8134 u64 end = start + PAGE_SIZE - 1; 8135 struct btrfs_bio_ctrl bio_ctrl = { 0 }; 8136 int ret; 8137 8138 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8139 8140 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 8141 if (bio_ctrl.bio) { 8142 int ret2; 8143 8144 ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8145 if (ret == 0) 8146 ret = ret2; 8147 } 8148 return ret; 8149 } 8150 8151 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 8152 { 8153 struct inode *inode = page->mapping->host; 8154 int ret; 8155 8156 if (current->flags & PF_MEMALLOC) { 8157 redirty_page_for_writepage(wbc, page); 8158 unlock_page(page); 8159 return 0; 8160 } 8161 8162 /* 8163 * If we are under memory pressure we will call this directly from the 8164 * VM, we need to make sure we have the inode referenced for the ordered 8165 * extent. If not just return like we didn't do anything. 
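 * (igrab() returns NULL only when the inode is already being evicted, in
 * which case redirtying the page and bailing out is the safe choice.)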
8166 */ 8167 if (!igrab(inode)) { 8168 redirty_page_for_writepage(wbc, page); 8169 return AOP_WRITEPAGE_ACTIVATE; 8170 } 8171 ret = extent_write_full_page(page, wbc); 8172 btrfs_add_delayed_iput(inode); 8173 return ret; 8174 } 8175 8176 static int btrfs_writepages(struct address_space *mapping, 8177 struct writeback_control *wbc) 8178 { 8179 return extent_writepages(mapping, wbc); 8180 } 8181 8182 static void btrfs_readahead(struct readahead_control *rac) 8183 { 8184 extent_readahead(rac); 8185 } 8186 8187 /* 8188 * For releasepage() and invalidate_folio() we have a race window where 8189 * folio_end_writeback() is called but the subpage spinlock is not yet released. 8190 * If we continue to release/invalidate the page, we could cause use-after-free 8191 * for subpage spinlock. So this function is to spin and wait for subpage 8192 * spinlock. 8193 */ 8194 static void wait_subpage_spinlock(struct page *page) 8195 { 8196 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 8197 struct btrfs_subpage *subpage; 8198 8199 if (fs_info->sectorsize == PAGE_SIZE) 8200 return; 8201 8202 ASSERT(PagePrivate(page) && page->private); 8203 subpage = (struct btrfs_subpage *)page->private; 8204 8205 /* 8206 * This may look insane as we just acquire the spinlock and release it, 8207 * without doing anything. But we just want to make sure no one is 8208 * still holding the subpage spinlock. 8209 * And since the page is not dirty nor writeback, and we have page 8210 * locked, the only possible way to hold a spinlock is from the endio 8211 * function to clear page writeback. 8212 * 8213 * Here we just acquire the spinlock so that all existing callers 8214 * should exit and we're safe to release/invalidate the page. 8215 */ 8216 spin_lock_irq(&subpage->lock); 8217 spin_unlock_irq(&subpage->lock); 8218 } 8219 8220 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8221 { 8222 int ret = try_release_extent_mapping(page, gfp_flags); 8223 8224 if (ret == 1) { 8225 wait_subpage_spinlock(page); 8226 clear_page_extent_mapped(page); 8227 } 8228 return ret; 8229 } 8230 8231 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8232 { 8233 if (PageWriteback(page) || PageDirty(page)) 8234 return 0; 8235 return __btrfs_releasepage(page, gfp_flags); 8236 } 8237 8238 #ifdef CONFIG_MIGRATION 8239 static int btrfs_migratepage(struct address_space *mapping, 8240 struct page *newpage, struct page *page, 8241 enum migrate_mode mode) 8242 { 8243 int ret; 8244 8245 ret = migrate_page_move_mapping(mapping, newpage, page, 0); 8246 if (ret != MIGRATEPAGE_SUCCESS) 8247 return ret; 8248 8249 if (page_has_private(page)) 8250 attach_page_private(newpage, detach_page_private(page)); 8251 8252 if (PageOrdered(page)) { 8253 ClearPageOrdered(page); 8254 SetPageOrdered(newpage); 8255 } 8256 8257 if (mode != MIGRATE_SYNC_NO_COPY) 8258 migrate_page_copy(newpage, page); 8259 else 8260 migrate_page_states(newpage, page); 8261 return MIGRATEPAGE_SUCCESS; 8262 } 8263 #endif 8264 8265 static void btrfs_invalidate_folio(struct folio *folio, size_t offset, 8266 size_t length) 8267 { 8268 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 8269 struct btrfs_fs_info *fs_info = inode->root->fs_info; 8270 struct extent_io_tree *tree = &inode->io_tree; 8271 struct extent_state *cached_state = NULL; 8272 u64 page_start = folio_pos(folio); 8273 u64 page_end = page_start + folio_size(folio) - 1; 8274 u64 cur; 8275 int inode_evicting = inode->vfs_inode.i_state & I_FREEING; 8276 8277 /* 8278 * We have folio locked 
so no new ordered extent can be created on this 8279 * page, nor bio can be submitted for this folio. 8280 * 8281 * But already submitted bio can still be finished on this folio. 8282 * Furthermore, endio function won't skip folio which has Ordered 8283 * (Private2) already cleared, so it's possible for endio and 8284 * invalidate_folio to do the same ordered extent accounting twice 8285 * on one folio. 8286 * 8287 * So here we wait for any submitted bios to finish, so that we won't 8288 * do double ordered extent accounting on the same folio. 8289 */ 8290 folio_wait_writeback(folio); 8291 wait_subpage_spinlock(&folio->page); 8292 8293 /* 8294 * For subpage case, we have call sites like 8295 * btrfs_punch_hole_lock_range() which passes range not aligned to 8296 * sectorsize. 8297 * If the range doesn't cover the full folio, we don't need to and 8298 * shouldn't clear page extent mapped, as folio->private can still 8299 * record subpage dirty bits for other part of the range. 8300 * 8301 * For cases that invalidate the full folio even the range doesn't 8302 * cover the full folio, like invalidating the last folio, we're 8303 * still safe to wait for ordered extent to finish. 8304 */ 8305 if (!(offset == 0 && length == folio_size(folio))) { 8306 btrfs_releasepage(&folio->page, GFP_NOFS); 8307 return; 8308 } 8309 8310 if (!inode_evicting) 8311 lock_extent_bits(tree, page_start, page_end, &cached_state); 8312 8313 cur = page_start; 8314 while (cur < page_end) { 8315 struct btrfs_ordered_extent *ordered; 8316 bool delete_states; 8317 u64 range_end; 8318 u32 range_len; 8319 8320 ordered = btrfs_lookup_first_ordered_range(inode, cur, 8321 page_end + 1 - cur); 8322 if (!ordered) { 8323 range_end = page_end; 8324 /* 8325 * No ordered extent covering this range, we are safe 8326 * to delete all extent states in the range. 8327 */ 8328 delete_states = true; 8329 goto next; 8330 } 8331 if (ordered->file_offset > cur) { 8332 /* 8333 * There is a range between [cur, oe->file_offset) not 8334 * covered by any ordered extent. 8335 * We are safe to delete all extent states, and handle 8336 * the ordered extent in the next iteration. 8337 */ 8338 range_end = ordered->file_offset - 1; 8339 delete_states = true; 8340 goto next; 8341 } 8342 8343 range_end = min(ordered->file_offset + ordered->num_bytes - 1, 8344 page_end); 8345 ASSERT(range_end + 1 - cur < U32_MAX); 8346 range_len = range_end + 1 - cur; 8347 if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { 8348 /* 8349 * If Ordered (Private2) is cleared, it means endio has 8350 * already been executed for the range. 8351 * We can't delete the extent states as 8352 * btrfs_finish_ordered_io() may still use some of them. 8353 */ 8354 delete_states = false; 8355 goto next; 8356 } 8357 btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); 8358 8359 /* 8360 * IO on this page will never be started, so we need to account 8361 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 8362 * here, must leave that up for the ordered extent completion. 8363 * 8364 * This will also unlock the range for incoming 8365 * btrfs_finish_ordered_io(). 
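 * (The EXTENT_LOCKED bit cleared just below is what performs that
 * unlock.)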
8366 */ 8367 if (!inode_evicting) 8368 clear_extent_bit(tree, cur, range_end, 8369 EXTENT_DELALLOC | 8370 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8371 EXTENT_DEFRAG, 1, 0, &cached_state); 8372 8373 spin_lock_irq(&inode->ordered_tree.lock); 8374 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8375 ordered->truncated_len = min(ordered->truncated_len, 8376 cur - ordered->file_offset); 8377 spin_unlock_irq(&inode->ordered_tree.lock); 8378 8379 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8380 cur, range_end + 1 - cur)) { 8381 btrfs_finish_ordered_io(ordered); 8382 /* 8383 * The ordered extent has finished, now we're again 8384 * safe to delete all extent states of the range. 8385 */ 8386 delete_states = true; 8387 } else { 8388 /* 8389 * btrfs_finish_ordered_io() will get executed by endio 8390 * of other pages, thus we can't delete extent states 8391 * anymore 8392 */ 8393 delete_states = false; 8394 } 8395 next: 8396 if (ordered) 8397 btrfs_put_ordered_extent(ordered); 8398 /* 8399 * Qgroup reserved space handler 8400 * Sector(s) here will be either: 8401 * 8402 * 1) Already written to disk or bio already finished 8403 * Then its QGROUP_RESERVED bit in io_tree is already cleared. 8404 * Qgroup will be handled by its qgroup_record then. 8405 * btrfs_qgroup_free_data() call will do nothing here. 8406 * 8407 * 2) Not written to disk yet 8408 * Then btrfs_qgroup_free_data() call will clear the 8409 * QGROUP_RESERVED bit of its io_tree, and free the qgroup 8410 * reserved data space. 8411 * Since the IO will never happen for this page. 8412 */ 8413 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); 8414 if (!inode_evicting) { 8415 clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | 8416 EXTENT_DELALLOC | EXTENT_UPTODATE | 8417 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 8418 delete_states, &cached_state); 8419 } 8420 cur = range_end + 1; 8421 } 8422 /* 8423 * We have iterated through all ordered extents of the page, the page 8424 * should not have Ordered (Private2) anymore, or the above iteration 8425 * did something wrong. 8426 */ 8427 ASSERT(!folio_test_ordered(folio)); 8428 btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); 8429 if (!inode_evicting) 8430 __btrfs_releasepage(&folio->page, GFP_NOFS); 8431 clear_page_extent_mapped(&folio->page); 8432 } 8433 8434 /* 8435 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 8436 * called from a page fault handler when a page is first dirtied. Hence we must 8437 * be careful to check for EOF conditions here. We set the page up correctly 8438 * for a written page which means we get ENOSPC checking when writing into 8439 * holes and correct delalloc and unwritten extent mapping on filesystems that 8440 * support these features. 8441 * 8442 * We are not allowed to take the i_mutex here so we have to play games to 8443 * protect against truncate races as the page could now be beyond EOF. Because 8444 * truncate_setsize() writes the inode size before removing pages, once we have 8445 * the page lock we can determine safely if the page is beyond EOF. If it is not 8446 * beyond EOF, then the page is guaranteed safe against truncation until we 8447 * unlock the page. 
8448 */ 8449 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) 8450 { 8451 struct page *page = vmf->page; 8452 struct inode *inode = file_inode(vmf->vma->vm_file); 8453 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8454 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8455 struct btrfs_ordered_extent *ordered; 8456 struct extent_state *cached_state = NULL; 8457 struct extent_changeset *data_reserved = NULL; 8458 unsigned long zero_start; 8459 loff_t size; 8460 vm_fault_t ret; 8461 int ret2; 8462 int reserved = 0; 8463 u64 reserved_space; 8464 u64 page_start; 8465 u64 page_end; 8466 u64 end; 8467 8468 reserved_space = PAGE_SIZE; 8469 8470 sb_start_pagefault(inode->i_sb); 8471 page_start = page_offset(page); 8472 page_end = page_start + PAGE_SIZE - 1; 8473 end = page_end; 8474 8475 /* 8476 * Reserving delalloc space after obtaining the page lock can lead to 8477 * deadlock. For example, if a dirty page is locked by this function 8478 * and the call to btrfs_delalloc_reserve_space() ends up triggering 8479 * dirty page write out, then the btrfs_writepage() function could 8480 * end up waiting indefinitely to get a lock on the page currently 8481 * being processed by btrfs_page_mkwrite() function. 8482 */ 8483 ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 8484 page_start, reserved_space); 8485 if (!ret2) { 8486 ret2 = file_update_time(vmf->vma->vm_file); 8487 reserved = 1; 8488 } 8489 if (ret2) { 8490 ret = vmf_error(ret2); 8491 if (reserved) 8492 goto out; 8493 goto out_noreserve; 8494 } 8495 8496 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 8497 again: 8498 down_read(&BTRFS_I(inode)->i_mmap_lock); 8499 lock_page(page); 8500 size = i_size_read(inode); 8501 8502 if ((page->mapping != inode->i_mapping) || 8503 (page_start >= size)) { 8504 /* page got truncated out from underneath us */ 8505 goto out_unlock; 8506 } 8507 wait_on_page_writeback(page); 8508 8509 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 8510 ret2 = set_page_extent_mapped(page); 8511 if (ret2 < 0) { 8512 ret = vmf_error(ret2); 8513 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8514 goto out_unlock; 8515 } 8516 8517 /* 8518 * we can't set the delalloc bits if there are pending ordered 8519 * extents. Drop our locks and wait for them to finish 8520 */ 8521 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 8522 PAGE_SIZE); 8523 if (ordered) { 8524 unlock_extent_cached(io_tree, page_start, page_end, 8525 &cached_state); 8526 unlock_page(page); 8527 up_read(&BTRFS_I(inode)->i_mmap_lock); 8528 btrfs_start_ordered_extent(ordered, 1); 8529 btrfs_put_ordered_extent(ordered); 8530 goto again; 8531 } 8532 8533 if (page->index == ((size - 1) >> PAGE_SHIFT)) { 8534 reserved_space = round_up(size - page_start, 8535 fs_info->sectorsize); 8536 if (reserved_space < PAGE_SIZE) { 8537 end = page_start + reserved_space - 1; 8538 btrfs_delalloc_release_space(BTRFS_I(inode), 8539 data_reserved, page_start, 8540 PAGE_SIZE - reserved_space, true); 8541 } 8542 } 8543 8544 /* 8545 * page_mkwrite gets called when the page is firstly dirtied after it's 8546 * faulted in, but write(2) could also dirty a page and set delalloc 8547 * bits, thus in this case for space account reason, we still need to 8548 * clear any delalloc bits within this page range since we have to 8549 * reserve data&meta space before lock_page() (see above comments). 
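 * For example, write(2) may already have set delalloc (with its own
 * reservation) on part of this page; clearing those bits here releases
 * that older reservation so it does not overlap with the one we just
 * took for the whole page.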
8550 */ 8551 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 8552 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 8553 EXTENT_DEFRAG, 0, 0, &cached_state); 8554 8555 ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, 8556 &cached_state); 8557 if (ret2) { 8558 unlock_extent_cached(io_tree, page_start, page_end, 8559 &cached_state); 8560 ret = VM_FAULT_SIGBUS; 8561 goto out_unlock; 8562 } 8563 8564 /* page is wholly or partially inside EOF */ 8565 if (page_start + PAGE_SIZE > size) 8566 zero_start = offset_in_page(size); 8567 else 8568 zero_start = PAGE_SIZE; 8569 8570 if (zero_start != PAGE_SIZE) { 8571 memzero_page(page, zero_start, PAGE_SIZE - zero_start); 8572 flush_dcache_page(page); 8573 } 8574 btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); 8575 btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); 8576 btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); 8577 8578 btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); 8579 8580 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8581 up_read(&BTRFS_I(inode)->i_mmap_lock); 8582 8583 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8584 sb_end_pagefault(inode->i_sb); 8585 extent_changeset_free(data_reserved); 8586 return VM_FAULT_LOCKED; 8587 8588 out_unlock: 8589 unlock_page(page); 8590 up_read(&BTRFS_I(inode)->i_mmap_lock); 8591 out: 8592 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 8593 btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, 8594 reserved_space, (ret != 0)); 8595 out_noreserve: 8596 sb_end_pagefault(inode->i_sb); 8597 extent_changeset_free(data_reserved); 8598 return ret; 8599 } 8600 8601 static int btrfs_truncate(struct inode *inode, bool skip_writeback) 8602 { 8603 struct btrfs_truncate_control control = { 8604 .inode = BTRFS_I(inode), 8605 .ino = btrfs_ino(BTRFS_I(inode)), 8606 .min_type = BTRFS_EXTENT_DATA_KEY, 8607 .clear_extent_range = true, 8608 }; 8609 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8610 struct btrfs_root *root = BTRFS_I(inode)->root; 8611 struct btrfs_block_rsv *rsv; 8612 int ret; 8613 struct btrfs_trans_handle *trans; 8614 u64 mask = fs_info->sectorsize - 1; 8615 u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 8616 8617 if (!skip_writeback) { 8618 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), 8619 (u64)-1); 8620 if (ret) 8621 return ret; 8622 } 8623 8624 /* 8625 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 8626 * things going on here: 8627 * 8628 * 1) We need to reserve space to update our inode. 8629 * 8630 * 2) We need to have something to cache all the space that is going to 8631 * be free'd up by the truncate operation, but also have some slack 8632 * space reserved in case it uses space during the truncate (thank you 8633 * very much snapshotting). 8634 * 8635 * And we need these to be separate. The fact is we can use a lot of 8636 * space doing the truncate, and we have no earthly idea how much space 8637 * we will use, so we need the truncate reservation to be separate so it 8638 * doesn't end up using space reserved for updating the inode. We also 8639 * need to be able to stop the transaction and start a new one, which 8640 * means we need to be able to update the inode several times, and we 8641 * have no idea of knowing how many times that will be, so we can't just 8642 * reserve 1 item for the entirety of the operation, so that has to be 8643 * done separately as well. 
8644 * 8645 * So that leaves us with 8646 * 8647 * 1) rsv - for the truncate reservation, which we will steal from the 8648 * transaction reservation. 8649 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for 8650 * updating the inode. 8651 */ 8652 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 8653 if (!rsv) 8654 return -ENOMEM; 8655 rsv->size = min_size; 8656 rsv->failfast = 1; 8657 8658 /* 8659 * 1 for the truncate slack space 8660 * 1 for updating the inode. 8661 */ 8662 trans = btrfs_start_transaction(root, 2); 8663 if (IS_ERR(trans)) { 8664 ret = PTR_ERR(trans); 8665 goto out; 8666 } 8667 8668 /* Migrate the slack space for the truncate to our reserve */ 8669 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 8670 min_size, false); 8671 BUG_ON(ret); 8672 8673 trans->block_rsv = rsv; 8674 8675 while (1) { 8676 struct extent_state *cached_state = NULL; 8677 const u64 new_size = inode->i_size; 8678 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); 8679 8680 control.new_size = new_size; 8681 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, 8682 &cached_state); 8683 /* 8684 * We want to drop from the next block forward in case this new 8685 * size is not block aligned since we will be keeping the last 8686 * block of the extent just the way it is. 8687 */ 8688 btrfs_drop_extent_cache(BTRFS_I(inode), 8689 ALIGN(new_size, fs_info->sectorsize), 8690 (u64)-1, 0); 8691 8692 ret = btrfs_truncate_inode_items(trans, root, &control); 8693 8694 inode_sub_bytes(inode, control.sub_bytes); 8695 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); 8696 8697 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, 8698 (u64)-1, &cached_state); 8699 8700 trans->block_rsv = &fs_info->trans_block_rsv; 8701 if (ret != -ENOSPC && ret != -EAGAIN) 8702 break; 8703 8704 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8705 if (ret) 8706 break; 8707 8708 btrfs_end_transaction(trans); 8709 btrfs_btree_balance_dirty(fs_info); 8710 8711 trans = btrfs_start_transaction(root, 2); 8712 if (IS_ERR(trans)) { 8713 ret = PTR_ERR(trans); 8714 trans = NULL; 8715 break; 8716 } 8717 8718 btrfs_block_rsv_release(fs_info, rsv, -1, NULL); 8719 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 8720 rsv, min_size, false); 8721 BUG_ON(ret); /* shouldn't happen */ 8722 trans->block_rsv = rsv; 8723 } 8724 8725 /* 8726 * We can't call btrfs_truncate_block inside a trans handle as we could 8727 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we 8728 * know we've truncated everything except the last little bit, and can 8729 * do btrfs_truncate_block and then update the disk_i_size. 
8730 */ 8731 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { 8732 btrfs_end_transaction(trans); 8733 btrfs_btree_balance_dirty(fs_info); 8734 8735 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); 8736 if (ret) 8737 goto out; 8738 trans = btrfs_start_transaction(root, 1); 8739 if (IS_ERR(trans)) { 8740 ret = PTR_ERR(trans); 8741 goto out; 8742 } 8743 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 8744 } 8745 8746 if (trans) { 8747 int ret2; 8748 8749 trans->block_rsv = &fs_info->trans_block_rsv; 8750 ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); 8751 if (ret2 && !ret) 8752 ret = ret2; 8753 8754 ret2 = btrfs_end_transaction(trans); 8755 if (ret2 && !ret) 8756 ret = ret2; 8757 btrfs_btree_balance_dirty(fs_info); 8758 } 8759 out: 8760 btrfs_free_block_rsv(fs_info, rsv); 8761 /* 8762 * So if we truncate and then write and fsync we normally would just 8763 * write the extents that changed, which is a problem if we need to 8764 * first truncate that entire inode. So set this flag so we write out 8765 * all of the extents in the inode to the sync log so we're completely 8766 * safe. 8767 * 8768 * If no extents were dropped or trimmed we don't need to force the next 8769 * fsync to truncate all the inode's items from the log and re-log them 8770 * all. This means the truncate operation did not change the file size, 8771 * or changed it to a smaller size but there was only an implicit hole 8772 * between the old i_size and the new i_size, and there were no prealloc 8773 * extents beyond i_size to drop. 8774 */ 8775 if (control.extents_found > 0) 8776 btrfs_set_inode_full_sync(BTRFS_I(inode)); 8777 8778 return ret; 8779 } 8780 8781 /* 8782 * create a new subvolume directory/inode (helper for the ioctl). 8783 */ 8784 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 8785 struct btrfs_root *new_root, 8786 struct btrfs_root *parent_root, 8787 struct user_namespace *mnt_userns) 8788 { 8789 struct inode *inode; 8790 int err; 8791 u64 index = 0; 8792 u64 ino; 8793 8794 err = btrfs_get_free_objectid(new_root, &ino); 8795 if (err < 0) 8796 return err; 8797 8798 inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, 8799 ino, ino, 8800 S_IFDIR | (~current_umask() & S_IRWXUGO), 8801 &index); 8802 if (IS_ERR(inode)) 8803 return PTR_ERR(inode); 8804 inode->i_op = &btrfs_dir_inode_operations; 8805 inode->i_fop = &btrfs_dir_file_operations; 8806 8807 set_nlink(inode, 1); 8808 btrfs_i_size_write(BTRFS_I(inode), 0); 8809 unlock_new_inode(inode); 8810 8811 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8812 if (err) 8813 btrfs_err(new_root->fs_info, 8814 "error inheriting subvolume %llu properties: %d", 8815 new_root->root_key.objectid, err); 8816 8817 err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); 8818 8819 iput(inode); 8820 return err; 8821 } 8822 8823 struct inode *btrfs_alloc_inode(struct super_block *sb) 8824 { 8825 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 8826 struct btrfs_inode *ei; 8827 struct inode *inode; 8828 8829 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); 8830 if (!ei) 8831 return NULL; 8832 8833 ei->root = NULL; 8834 ei->generation = 0; 8835 ei->last_trans = 0; 8836 ei->last_sub_trans = 0; 8837 ei->logged_trans = 0; 8838 ei->delalloc_bytes = 0; 8839 ei->new_delalloc_bytes = 0; 8840 ei->defrag_bytes = 0; 8841 ei->disk_i_size = 0; 8842 ei->flags = 0; 8843 ei->ro_flags = 0; 8844 ei->csum_bytes = 0; 8845 ei->index_cnt = (u64)-1; 8846 ei->dir_index = 0; 8847 ei->last_unlink_trans = 0; 8848 
ei->last_reflink_trans = 0; 8849 ei->last_log_commit = 0; 8850 8851 spin_lock_init(&ei->lock); 8852 ei->outstanding_extents = 0; 8853 if (sb->s_magic != BTRFS_TEST_MAGIC) 8854 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 8855 BTRFS_BLOCK_RSV_DELALLOC); 8856 ei->runtime_flags = 0; 8857 ei->prop_compress = BTRFS_COMPRESS_NONE; 8858 ei->defrag_compress = BTRFS_COMPRESS_NONE; 8859 8860 ei->delayed_node = NULL; 8861 8862 ei->i_otime.tv_sec = 0; 8863 ei->i_otime.tv_nsec = 0; 8864 8865 inode = &ei->vfs_inode; 8866 extent_map_tree_init(&ei->extent_tree); 8867 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); 8868 extent_io_tree_init(fs_info, &ei->io_failure_tree, 8869 IO_TREE_INODE_IO_FAILURE, inode); 8870 extent_io_tree_init(fs_info, &ei->file_extent_tree, 8871 IO_TREE_INODE_FILE_EXTENT, inode); 8872 ei->io_tree.track_uptodate = true; 8873 ei->io_failure_tree.track_uptodate = true; 8874 atomic_set(&ei->sync_writers, 0); 8875 mutex_init(&ei->log_mutex); 8876 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8877 INIT_LIST_HEAD(&ei->delalloc_inodes); 8878 INIT_LIST_HEAD(&ei->delayed_iput); 8879 RB_CLEAR_NODE(&ei->rb_node); 8880 init_rwsem(&ei->i_mmap_lock); 8881 8882 return inode; 8883 } 8884 8885 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8886 void btrfs_test_destroy_inode(struct inode *inode) 8887 { 8888 btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); 8889 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8890 } 8891 #endif 8892 8893 void btrfs_free_inode(struct inode *inode) 8894 { 8895 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8896 } 8897 8898 void btrfs_destroy_inode(struct inode *vfs_inode) 8899 { 8900 struct btrfs_ordered_extent *ordered; 8901 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 8902 struct btrfs_root *root = inode->root; 8903 8904 WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); 8905 WARN_ON(vfs_inode->i_data.nrpages); 8906 WARN_ON(inode->block_rsv.reserved); 8907 WARN_ON(inode->block_rsv.size); 8908 WARN_ON(inode->outstanding_extents); 8909 if (!S_ISDIR(vfs_inode->i_mode)) { 8910 WARN_ON(inode->delalloc_bytes); 8911 WARN_ON(inode->new_delalloc_bytes); 8912 } 8913 WARN_ON(inode->csum_bytes); 8914 WARN_ON(inode->defrag_bytes); 8915 8916 /* 8917 * This can happen where we create an inode, but somebody else also 8918 * created the same inode and we need to destroy the one we already 8919 * created. 
8920 */ 8921 if (!root) 8922 return; 8923 8924 while (1) { 8925 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 8926 if (!ordered) 8927 break; 8928 else { 8929 btrfs_err(root->fs_info, 8930 "found ordered extent %llu %llu on inode cleanup", 8931 ordered->file_offset, ordered->num_bytes); 8932 btrfs_remove_ordered_extent(inode, ordered); 8933 btrfs_put_ordered_extent(ordered); 8934 btrfs_put_ordered_extent(ordered); 8935 } 8936 } 8937 btrfs_qgroup_check_reserved_leak(inode); 8938 inode_tree_del(inode); 8939 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 8940 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); 8941 btrfs_put_root(inode->root); 8942 } 8943 8944 int btrfs_drop_inode(struct inode *inode) 8945 { 8946 struct btrfs_root *root = BTRFS_I(inode)->root; 8947 8948 if (root == NULL) 8949 return 1; 8950 8951 /* the snap/subvol tree is on deleting */ 8952 if (btrfs_root_refs(&root->root_item) == 0) 8953 return 1; 8954 else 8955 return generic_drop_inode(inode); 8956 } 8957 8958 static void init_once(void *foo) 8959 { 8960 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 8961 8962 inode_init_once(&ei->vfs_inode); 8963 } 8964 8965 void __cold btrfs_destroy_cachep(void) 8966 { 8967 /* 8968 * Make sure all delayed rcu free inodes are flushed before we 8969 * destroy cache. 8970 */ 8971 rcu_barrier(); 8972 kmem_cache_destroy(btrfs_inode_cachep); 8973 kmem_cache_destroy(btrfs_trans_handle_cachep); 8974 kmem_cache_destroy(btrfs_path_cachep); 8975 kmem_cache_destroy(btrfs_free_space_cachep); 8976 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 8977 } 8978 8979 int __init btrfs_init_cachep(void) 8980 { 8981 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 8982 sizeof(struct btrfs_inode), 0, 8983 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, 8984 init_once); 8985 if (!btrfs_inode_cachep) 8986 goto fail; 8987 8988 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 8989 sizeof(struct btrfs_trans_handle), 0, 8990 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 8991 if (!btrfs_trans_handle_cachep) 8992 goto fail; 8993 8994 btrfs_path_cachep = kmem_cache_create("btrfs_path", 8995 sizeof(struct btrfs_path), 0, 8996 SLAB_MEM_SPREAD, NULL); 8997 if (!btrfs_path_cachep) 8998 goto fail; 8999 9000 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 9001 sizeof(struct btrfs_free_space), 0, 9002 SLAB_MEM_SPREAD, NULL); 9003 if (!btrfs_free_space_cachep) 9004 goto fail; 9005 9006 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 9007 PAGE_SIZE, PAGE_SIZE, 9008 SLAB_MEM_SPREAD, NULL); 9009 if (!btrfs_free_space_bitmap_cachep) 9010 goto fail; 9011 9012 return 0; 9013 fail: 9014 btrfs_destroy_cachep(); 9015 return -ENOMEM; 9016 } 9017 9018 static int btrfs_getattr(struct user_namespace *mnt_userns, 9019 const struct path *path, struct kstat *stat, 9020 u32 request_mask, unsigned int flags) 9021 { 9022 u64 delalloc_bytes; 9023 u64 inode_bytes; 9024 struct inode *inode = d_inode(path->dentry); 9025 u32 blocksize = inode->i_sb->s_blocksize; 9026 u32 bi_flags = BTRFS_I(inode)->flags; 9027 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; 9028 9029 stat->result_mask |= STATX_BTIME; 9030 stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; 9031 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; 9032 if (bi_flags & BTRFS_INODE_APPEND) 9033 stat->attributes |= STATX_ATTR_APPEND; 9034 if (bi_flags & BTRFS_INODE_COMPRESS) 9035 stat->attributes |= STATX_ATTR_COMPRESSED; 9036 if (bi_flags & BTRFS_INODE_IMMUTABLE) 9037 stat->attributes |= 
STATX_ATTR_IMMUTABLE; 9038 if (bi_flags & BTRFS_INODE_NODUMP) 9039 stat->attributes |= STATX_ATTR_NODUMP; 9040 if (bi_ro_flags & BTRFS_INODE_RO_VERITY) 9041 stat->attributes |= STATX_ATTR_VERITY; 9042 9043 stat->attributes_mask |= (STATX_ATTR_APPEND | 9044 STATX_ATTR_COMPRESSED | 9045 STATX_ATTR_IMMUTABLE | 9046 STATX_ATTR_NODUMP); 9047 9048 generic_fillattr(mnt_userns, inode, stat); 9049 stat->dev = BTRFS_I(inode)->root->anon_dev; 9050 9051 spin_lock(&BTRFS_I(inode)->lock); 9052 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 9053 inode_bytes = inode_get_bytes(inode); 9054 spin_unlock(&BTRFS_I(inode)->lock); 9055 stat->blocks = (ALIGN(inode_bytes, blocksize) + 9056 ALIGN(delalloc_bytes, blocksize)) >> 9; 9057 return 0; 9058 } 9059 9060 static int btrfs_rename_exchange(struct inode *old_dir, 9061 struct dentry *old_dentry, 9062 struct inode *new_dir, 9063 struct dentry *new_dentry) 9064 { 9065 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9066 struct btrfs_trans_handle *trans; 9067 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9068 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9069 struct inode *new_inode = new_dentry->d_inode; 9070 struct inode *old_inode = old_dentry->d_inode; 9071 struct timespec64 ctime = current_time(old_inode); 9072 struct btrfs_rename_ctx old_rename_ctx; 9073 struct btrfs_rename_ctx new_rename_ctx; 9074 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9075 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 9076 u64 old_idx = 0; 9077 u64 new_idx = 0; 9078 int ret; 9079 int ret2; 9080 bool need_abort = false; 9081 9082 /* 9083 * For non-subvolumes allow exchange only within one subvolume, in the 9084 * same inode namespace. Two subvolumes (represented as directory) can 9085 * be exchanged as they're a logical link and have a fixed inode number. 9086 */ 9087 if (root != dest && 9088 (old_ino != BTRFS_FIRST_FREE_OBJECTID || 9089 new_ino != BTRFS_FIRST_FREE_OBJECTID)) 9090 return -EXDEV; 9091 9092 /* close the race window with snapshot create/destroy ioctl */ 9093 if (old_ino == BTRFS_FIRST_FREE_OBJECTID || 9094 new_ino == BTRFS_FIRST_FREE_OBJECTID) 9095 down_read(&fs_info->subvol_sem); 9096 9097 /* 9098 * We want to reserve the absolute worst case amount of items. So if 9099 * both inodes are subvols and we need to unlink them then that would 9100 * require 4 item modifications, but if they are both normal inodes it 9101 * would require 5 item modifications, so we'll assume their normal 9102 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items 9103 * should cover the worst case number of items we'll modify. 9104 */ 9105 trans = btrfs_start_transaction(root, 12); 9106 if (IS_ERR(trans)) { 9107 ret = PTR_ERR(trans); 9108 goto out_notrans; 9109 } 9110 9111 if (dest != root) { 9112 ret = btrfs_record_root_in_trans(trans, dest); 9113 if (ret) 9114 goto out_fail; 9115 } 9116 9117 /* 9118 * We need to find a free sequence number both in the source and 9119 * in the destination directory for the exchange. 9120 */ 9121 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 9122 if (ret) 9123 goto out_fail; 9124 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 9125 if (ret) 9126 goto out_fail; 9127 9128 BTRFS_I(old_inode)->dir_index = 0ULL; 9129 BTRFS_I(new_inode)->dir_index = 0ULL; 9130 9131 /* Reference for the source. */ 9132 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9133 /* force full log commit if subvolume involved. 
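 * Renames that involve a subvolume root can not be represented in the
 * log tree, so make any fsync in this transaction fall back to a full
 * transaction commit.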
*/ 9134 btrfs_set_log_full_commit(trans); 9135 } else { 9136 ret = btrfs_insert_inode_ref(trans, dest, 9137 new_dentry->d_name.name, 9138 new_dentry->d_name.len, 9139 old_ino, 9140 btrfs_ino(BTRFS_I(new_dir)), 9141 old_idx); 9142 if (ret) 9143 goto out_fail; 9144 need_abort = true; 9145 } 9146 9147 /* And now for the dest. */ 9148 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9149 /* force full log commit if subvolume involved. */ 9150 btrfs_set_log_full_commit(trans); 9151 } else { 9152 ret = btrfs_insert_inode_ref(trans, root, 9153 old_dentry->d_name.name, 9154 old_dentry->d_name.len, 9155 new_ino, 9156 btrfs_ino(BTRFS_I(old_dir)), 9157 new_idx); 9158 if (ret) { 9159 if (need_abort) 9160 btrfs_abort_transaction(trans, ret); 9161 goto out_fail; 9162 } 9163 } 9164 9165 /* Update inode version and ctime/mtime. */ 9166 inode_inc_iversion(old_dir); 9167 inode_inc_iversion(new_dir); 9168 inode_inc_iversion(old_inode); 9169 inode_inc_iversion(new_inode); 9170 old_dir->i_ctime = old_dir->i_mtime = ctime; 9171 new_dir->i_ctime = new_dir->i_mtime = ctime; 9172 old_inode->i_ctime = ctime; 9173 new_inode->i_ctime = ctime; 9174 9175 if (old_dentry->d_parent != new_dentry->d_parent) { 9176 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9177 BTRFS_I(old_inode), 1); 9178 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 9179 BTRFS_I(new_inode), 1); 9180 } 9181 9182 /* src is a subvolume */ 9183 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9184 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9185 } else { /* src is an inode */ 9186 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 9187 BTRFS_I(old_dentry->d_inode), 9188 old_dentry->d_name.name, 9189 old_dentry->d_name.len, 9190 &old_rename_ctx); 9191 if (!ret) 9192 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9193 } 9194 if (ret) { 9195 btrfs_abort_transaction(trans, ret); 9196 goto out_fail; 9197 } 9198 9199 /* dest is a subvolume */ 9200 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9201 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 9202 } else { /* dest is an inode */ 9203 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), 9204 BTRFS_I(new_dentry->d_inode), 9205 new_dentry->d_name.name, 9206 new_dentry->d_name.len, 9207 &new_rename_ctx); 9208 if (!ret) 9209 ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); 9210 } 9211 if (ret) { 9212 btrfs_abort_transaction(trans, ret); 9213 goto out_fail; 9214 } 9215 9216 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9217 new_dentry->d_name.name, 9218 new_dentry->d_name.len, 0, old_idx); 9219 if (ret) { 9220 btrfs_abort_transaction(trans, ret); 9221 goto out_fail; 9222 } 9223 9224 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 9225 old_dentry->d_name.name, 9226 old_dentry->d_name.len, 0, new_idx); 9227 if (ret) { 9228 btrfs_abort_transaction(trans, ret); 9229 goto out_fail; 9230 } 9231 9232 if (old_inode->i_nlink == 1) 9233 BTRFS_I(old_inode)->dir_index = old_idx; 9234 if (new_inode->i_nlink == 1) 9235 BTRFS_I(new_inode)->dir_index = new_idx; 9236 9237 /* 9238 * Now pin the logs of the roots. We do it to ensure that no other task 9239 * can sync the logs while we are in progress with the rename, because 9240 * that could result in an inconsistency in case any of the inodes that 9241 * are part of this rename operation were logged before. 
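 *
 * (btrfs_pin_log_trans() takes a log writer reference which makes a
 * concurrent btrfs_sync_log() wait; it is dropped again by
 * btrfs_end_log_trans() below, once the new names have been logged.)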
9242 */ 9243 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9244 btrfs_pin_log_trans(root); 9245 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9246 btrfs_pin_log_trans(dest); 9247 9248 /* Do the log updates for all inodes. */ 9249 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9250 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 9251 old_rename_ctx.index, new_dentry->d_parent); 9252 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9253 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), 9254 new_rename_ctx.index, old_dentry->d_parent); 9255 9256 /* Now unpin the logs. */ 9257 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9258 btrfs_end_log_trans(root); 9259 if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9260 btrfs_end_log_trans(dest); 9261 out_fail: 9262 ret2 = btrfs_end_transaction(trans); 9263 ret = ret ? ret : ret2; 9264 out_notrans: 9265 if (new_ino == BTRFS_FIRST_FREE_OBJECTID || 9266 old_ino == BTRFS_FIRST_FREE_OBJECTID) 9267 up_read(&fs_info->subvol_sem); 9268 9269 return ret; 9270 } 9271 9272 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, 9273 struct btrfs_root *root, 9274 struct user_namespace *mnt_userns, 9275 struct inode *dir, 9276 struct dentry *dentry) 9277 { 9278 int ret; 9279 struct inode *inode; 9280 u64 objectid; 9281 u64 index; 9282 9283 ret = btrfs_get_free_objectid(root, &objectid); 9284 if (ret) 9285 return ret; 9286 9287 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 9288 dentry->d_name.name, 9289 dentry->d_name.len, 9290 btrfs_ino(BTRFS_I(dir)), 9291 objectid, 9292 S_IFCHR | WHITEOUT_MODE, 9293 &index); 9294 9295 if (IS_ERR(inode)) { 9296 ret = PTR_ERR(inode); 9297 return ret; 9298 } 9299 9300 inode->i_op = &btrfs_special_inode_operations; 9301 init_special_inode(inode, inode->i_mode, 9302 WHITEOUT_DEV); 9303 9304 ret = btrfs_init_inode_security(trans, inode, dir, 9305 &dentry->d_name); 9306 if (ret) 9307 goto out; 9308 9309 ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9310 BTRFS_I(inode), 0, index); 9311 if (ret) 9312 goto out; 9313 9314 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9315 out: 9316 unlock_new_inode(inode); 9317 if (ret) 9318 inode_dec_link_count(inode); 9319 iput(inode); 9320 9321 return ret; 9322 } 9323 9324 static int btrfs_rename(struct user_namespace *mnt_userns, 9325 struct inode *old_dir, struct dentry *old_dentry, 9326 struct inode *new_dir, struct dentry *new_dentry, 9327 unsigned int flags) 9328 { 9329 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9330 struct btrfs_trans_handle *trans; 9331 unsigned int trans_num_items; 9332 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9333 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9334 struct inode *new_inode = d_inode(new_dentry); 9335 struct inode *old_inode = d_inode(old_dentry); 9336 struct btrfs_rename_ctx rename_ctx; 9337 u64 index = 0; 9338 int ret; 9339 int ret2; 9340 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9341 9342 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9343 return -EPERM; 9344 9345 /* we only allow rename subvolume link between subvolumes */ 9346 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 9347 return -EXDEV; 9348 9349 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 9350 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) 9351 return -ENOTEMPTY; 9352 9353 if (S_ISDIR(old_inode->i_mode) && new_inode && 9354 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 9355 return -ENOTEMPTY; 9356 9357 9358 /* check for collisions, even if the name isn't there */ 9359 ret = 
btrfs_check_dir_item_collision(dest, new_dir->i_ino,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len);

	if (ret) {
		if (ret == -EEXIST) {
			/* we shouldn't get -EEXIST without a new_inode */
			if (WARN_ON(!new_inode)) {
				return ret;
			}
		} else {
			/* maybe -EOVERFLOW */
			return ret;
		}
	}
	ret = 0;

	/*
	 * We're using rename to replace one file with another. Start IO on it
	 * now so we don't add too much work to the end of the transaction.
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	/* close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items. So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they are normal
	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 * If our rename has the whiteout flag, we need 5 more units for the
	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
	 * when selinux is enabled).
	 */
	trans_num_items = 11;
	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved.
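		 * A rename that involves a subvolume root cannot be replayed
		 * from the log tree, so make any fsync in this transaction
		 * fall back to a full transaction commit.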
*/ 9420 btrfs_set_log_full_commit(trans); 9421 } else { 9422 ret = btrfs_insert_inode_ref(trans, dest, 9423 new_dentry->d_name.name, 9424 new_dentry->d_name.len, 9425 old_ino, 9426 btrfs_ino(BTRFS_I(new_dir)), index); 9427 if (ret) 9428 goto out_fail; 9429 } 9430 9431 inode_inc_iversion(old_dir); 9432 inode_inc_iversion(new_dir); 9433 inode_inc_iversion(old_inode); 9434 old_dir->i_ctime = old_dir->i_mtime = 9435 new_dir->i_ctime = new_dir->i_mtime = 9436 old_inode->i_ctime = current_time(old_dir); 9437 9438 if (old_dentry->d_parent != new_dentry->d_parent) 9439 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9440 BTRFS_I(old_inode), 1); 9441 9442 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9443 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9444 } else { 9445 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 9446 BTRFS_I(d_inode(old_dentry)), 9447 old_dentry->d_name.name, 9448 old_dentry->d_name.len, 9449 &rename_ctx); 9450 if (!ret) 9451 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9452 } 9453 if (ret) { 9454 btrfs_abort_transaction(trans, ret); 9455 goto out_fail; 9456 } 9457 9458 if (new_inode) { 9459 inode_inc_iversion(new_inode); 9460 new_inode->i_ctime = current_time(new_inode); 9461 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == 9462 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9463 ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); 9464 BUG_ON(new_inode->i_nlink == 0); 9465 } else { 9466 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), 9467 BTRFS_I(d_inode(new_dentry)), 9468 new_dentry->d_name.name, 9469 new_dentry->d_name.len); 9470 } 9471 if (!ret && new_inode->i_nlink == 0) 9472 ret = btrfs_orphan_add(trans, 9473 BTRFS_I(d_inode(new_dentry))); 9474 if (ret) { 9475 btrfs_abort_transaction(trans, ret); 9476 goto out_fail; 9477 } 9478 } 9479 9480 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9481 new_dentry->d_name.name, 9482 new_dentry->d_name.len, 0, index); 9483 if (ret) { 9484 btrfs_abort_transaction(trans, ret); 9485 goto out_fail; 9486 } 9487 9488 if (old_inode->i_nlink == 1) 9489 BTRFS_I(old_inode)->dir_index = index; 9490 9491 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9492 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 9493 rename_ctx.index, new_dentry->d_parent); 9494 9495 if (flags & RENAME_WHITEOUT) { 9496 ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, 9497 old_dir, old_dentry); 9498 9499 if (ret) { 9500 btrfs_abort_transaction(trans, ret); 9501 goto out_fail; 9502 } 9503 } 9504 out_fail: 9505 ret2 = btrfs_end_transaction(trans); 9506 ret = ret ? 
ret : ret2; 9507 out_notrans: 9508 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9509 up_read(&fs_info->subvol_sem); 9510 9511 return ret; 9512 } 9513 9514 static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, 9515 struct dentry *old_dentry, struct inode *new_dir, 9516 struct dentry *new_dentry, unsigned int flags) 9517 { 9518 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 9519 return -EINVAL; 9520 9521 if (flags & RENAME_EXCHANGE) 9522 return btrfs_rename_exchange(old_dir, old_dentry, new_dir, 9523 new_dentry); 9524 9525 return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, 9526 new_dentry, flags); 9527 } 9528 9529 struct btrfs_delalloc_work { 9530 struct inode *inode; 9531 struct completion completion; 9532 struct list_head list; 9533 struct btrfs_work work; 9534 }; 9535 9536 static void btrfs_run_delalloc_work(struct btrfs_work *work) 9537 { 9538 struct btrfs_delalloc_work *delalloc_work; 9539 struct inode *inode; 9540 9541 delalloc_work = container_of(work, struct btrfs_delalloc_work, 9542 work); 9543 inode = delalloc_work->inode; 9544 filemap_flush(inode->i_mapping); 9545 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 9546 &BTRFS_I(inode)->runtime_flags)) 9547 filemap_flush(inode->i_mapping); 9548 9549 iput(inode); 9550 complete(&delalloc_work->completion); 9551 } 9552 9553 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) 9554 { 9555 struct btrfs_delalloc_work *work; 9556 9557 work = kmalloc(sizeof(*work), GFP_NOFS); 9558 if (!work) 9559 return NULL; 9560 9561 init_completion(&work->completion); 9562 INIT_LIST_HEAD(&work->list); 9563 work->inode = inode; 9564 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 9565 9566 return work; 9567 } 9568 9569 /* 9570 * some fairly slow code that needs optimization. This walks the list 9571 * of all the inodes with pending delalloc and forces them to disk. 
9572 */ 9573 static int start_delalloc_inodes(struct btrfs_root *root, 9574 struct writeback_control *wbc, bool snapshot, 9575 bool in_reclaim_context) 9576 { 9577 struct btrfs_inode *binode; 9578 struct inode *inode; 9579 struct btrfs_delalloc_work *work, *next; 9580 struct list_head works; 9581 struct list_head splice; 9582 int ret = 0; 9583 bool full_flush = wbc->nr_to_write == LONG_MAX; 9584 9585 INIT_LIST_HEAD(&works); 9586 INIT_LIST_HEAD(&splice); 9587 9588 mutex_lock(&root->delalloc_mutex); 9589 spin_lock(&root->delalloc_lock); 9590 list_splice_init(&root->delalloc_inodes, &splice); 9591 while (!list_empty(&splice)) { 9592 binode = list_entry(splice.next, struct btrfs_inode, 9593 delalloc_inodes); 9594 9595 list_move_tail(&binode->delalloc_inodes, 9596 &root->delalloc_inodes); 9597 9598 if (in_reclaim_context && 9599 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) 9600 continue; 9601 9602 inode = igrab(&binode->vfs_inode); 9603 if (!inode) { 9604 cond_resched_lock(&root->delalloc_lock); 9605 continue; 9606 } 9607 spin_unlock(&root->delalloc_lock); 9608 9609 if (snapshot) 9610 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 9611 &binode->runtime_flags); 9612 if (full_flush) { 9613 work = btrfs_alloc_delalloc_work(inode); 9614 if (!work) { 9615 iput(inode); 9616 ret = -ENOMEM; 9617 goto out; 9618 } 9619 list_add_tail(&work->list, &works); 9620 btrfs_queue_work(root->fs_info->flush_workers, 9621 &work->work); 9622 } else { 9623 ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); 9624 btrfs_add_delayed_iput(inode); 9625 if (ret || wbc->nr_to_write <= 0) 9626 goto out; 9627 } 9628 cond_resched(); 9629 spin_lock(&root->delalloc_lock); 9630 } 9631 spin_unlock(&root->delalloc_lock); 9632 9633 out: 9634 list_for_each_entry_safe(work, next, &works, list) { 9635 list_del_init(&work->list); 9636 wait_for_completion(&work->completion); 9637 kfree(work); 9638 } 9639 9640 if (!list_empty(&splice)) { 9641 spin_lock(&root->delalloc_lock); 9642 list_splice_tail(&splice, &root->delalloc_inodes); 9643 spin_unlock(&root->delalloc_lock); 9644 } 9645 mutex_unlock(&root->delalloc_mutex); 9646 return ret; 9647 } 9648 9649 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) 9650 { 9651 struct writeback_control wbc = { 9652 .nr_to_write = LONG_MAX, 9653 .sync_mode = WB_SYNC_NONE, 9654 .range_start = 0, 9655 .range_end = LLONG_MAX, 9656 }; 9657 struct btrfs_fs_info *fs_info = root->fs_info; 9658 9659 if (BTRFS_FS_ERROR(fs_info)) 9660 return -EROFS; 9661 9662 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); 9663 } 9664 9665 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, 9666 bool in_reclaim_context) 9667 { 9668 struct writeback_control wbc = { 9669 .nr_to_write = nr, 9670 .sync_mode = WB_SYNC_NONE, 9671 .range_start = 0, 9672 .range_end = LLONG_MAX, 9673 }; 9674 struct btrfs_root *root; 9675 struct list_head splice; 9676 int ret; 9677 9678 if (BTRFS_FS_ERROR(fs_info)) 9679 return -EROFS; 9680 9681 INIT_LIST_HEAD(&splice); 9682 9683 mutex_lock(&fs_info->delalloc_root_mutex); 9684 spin_lock(&fs_info->delalloc_root_lock); 9685 list_splice_init(&fs_info->delalloc_roots, &splice); 9686 while (!list_empty(&splice)) { 9687 /* 9688 * Reset nr_to_write here so we know that we're doing a full 9689 * flush. 
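		 * (start_delalloc_inodes() treats nr_to_write == LONG_MAX as a
		 * full flush: every delalloc inode is queued as async work on
		 * the flush_workers queue instead of being written back inline
		 * against a limited nr_to_write budget.)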
9690 */ 9691 if (nr == LONG_MAX) 9692 wbc.nr_to_write = LONG_MAX; 9693 9694 root = list_first_entry(&splice, struct btrfs_root, 9695 delalloc_root); 9696 root = btrfs_grab_root(root); 9697 BUG_ON(!root); 9698 list_move_tail(&root->delalloc_root, 9699 &fs_info->delalloc_roots); 9700 spin_unlock(&fs_info->delalloc_root_lock); 9701 9702 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); 9703 btrfs_put_root(root); 9704 if (ret < 0 || wbc.nr_to_write <= 0) 9705 goto out; 9706 spin_lock(&fs_info->delalloc_root_lock); 9707 } 9708 spin_unlock(&fs_info->delalloc_root_lock); 9709 9710 ret = 0; 9711 out: 9712 if (!list_empty(&splice)) { 9713 spin_lock(&fs_info->delalloc_root_lock); 9714 list_splice_tail(&splice, &fs_info->delalloc_roots); 9715 spin_unlock(&fs_info->delalloc_root_lock); 9716 } 9717 mutex_unlock(&fs_info->delalloc_root_mutex); 9718 return ret; 9719 } 9720 9721 static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, 9722 struct dentry *dentry, const char *symname) 9723 { 9724 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 9725 struct btrfs_trans_handle *trans; 9726 struct btrfs_root *root = BTRFS_I(dir)->root; 9727 struct btrfs_path *path; 9728 struct btrfs_key key; 9729 struct inode *inode = NULL; 9730 int err; 9731 u64 objectid; 9732 u64 index = 0; 9733 int name_len; 9734 int datasize; 9735 unsigned long ptr; 9736 struct btrfs_file_extent_item *ei; 9737 struct extent_buffer *leaf; 9738 9739 name_len = strlen(symname); 9740 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 9741 return -ENAMETOOLONG; 9742 9743 /* 9744 * 2 items for inode item and ref 9745 * 2 items for dir items 9746 * 1 item for updating parent inode item 9747 * 1 item for the inline extent item 9748 * 1 item for xattr if selinux is on 9749 */ 9750 trans = btrfs_start_transaction(root, 7); 9751 if (IS_ERR(trans)) 9752 return PTR_ERR(trans); 9753 9754 err = btrfs_get_free_objectid(root, &objectid); 9755 if (err) 9756 goto out_unlock; 9757 9758 inode = btrfs_new_inode(trans, root, mnt_userns, dir, 9759 dentry->d_name.name, dentry->d_name.len, 9760 btrfs_ino(BTRFS_I(dir)), objectid, 9761 S_IFLNK | S_IRWXUGO, &index); 9762 if (IS_ERR(inode)) { 9763 err = PTR_ERR(inode); 9764 inode = NULL; 9765 goto out_unlock; 9766 } 9767 9768 /* 9769 * If the active LSM wants to access the inode during 9770 * d_instantiate it needs these. Smack checks to see 9771 * if the filesystem supports xattrs by looking at the 9772 * ops vector. 
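	 * (The i_op set here is only temporary; it is switched to
	 * btrfs_symlink_inode_operations further down, once the inline extent
	 * holding the link target has been written.)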
9773 */ 9774 inode->i_fop = &btrfs_file_operations; 9775 inode->i_op = &btrfs_file_inode_operations; 9776 inode->i_mapping->a_ops = &btrfs_aops; 9777 9778 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9779 if (err) 9780 goto out_unlock; 9781 9782 path = btrfs_alloc_path(); 9783 if (!path) { 9784 err = -ENOMEM; 9785 goto out_unlock; 9786 } 9787 key.objectid = btrfs_ino(BTRFS_I(inode)); 9788 key.offset = 0; 9789 key.type = BTRFS_EXTENT_DATA_KEY; 9790 datasize = btrfs_file_extent_calc_inline_size(name_len); 9791 err = btrfs_insert_empty_item(trans, root, path, &key, 9792 datasize); 9793 if (err) { 9794 btrfs_free_path(path); 9795 goto out_unlock; 9796 } 9797 leaf = path->nodes[0]; 9798 ei = btrfs_item_ptr(leaf, path->slots[0], 9799 struct btrfs_file_extent_item); 9800 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 9801 btrfs_set_file_extent_type(leaf, ei, 9802 BTRFS_FILE_EXTENT_INLINE); 9803 btrfs_set_file_extent_encryption(leaf, ei, 0); 9804 btrfs_set_file_extent_compression(leaf, ei, 0); 9805 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 9806 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 9807 9808 ptr = btrfs_file_extent_inline_start(ei); 9809 write_extent_buffer(leaf, symname, ptr, name_len); 9810 btrfs_mark_buffer_dirty(leaf); 9811 btrfs_free_path(path); 9812 9813 inode->i_op = &btrfs_symlink_inode_operations; 9814 inode_nohighmem(inode); 9815 inode_set_bytes(inode, name_len); 9816 btrfs_i_size_write(BTRFS_I(inode), name_len); 9817 err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9818 /* 9819 * Last step, add directory indexes for our symlink inode. This is the 9820 * last step to avoid extra cleanup of these indexes if an error happens 9821 * elsewhere above. 9822 */ 9823 if (!err) 9824 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9825 BTRFS_I(inode), 0, index); 9826 if (err) 9827 goto out_unlock; 9828 9829 d_instantiate_new(dentry, inode); 9830 9831 out_unlock: 9832 btrfs_end_transaction(trans); 9833 if (err && inode) { 9834 inode_dec_link_count(inode); 9835 discard_new_inode(inode); 9836 } 9837 btrfs_btree_balance_dirty(fs_info); 9838 return err; 9839 } 9840 9841 static struct btrfs_trans_handle *insert_prealloc_file_extent( 9842 struct btrfs_trans_handle *trans_in, 9843 struct btrfs_inode *inode, 9844 struct btrfs_key *ins, 9845 u64 file_offset) 9846 { 9847 struct btrfs_file_extent_item stack_fi; 9848 struct btrfs_replace_extent_info extent_info; 9849 struct btrfs_trans_handle *trans = trans_in; 9850 struct btrfs_path *path; 9851 u64 start = ins->objectid; 9852 u64 len = ins->offset; 9853 int qgroup_released; 9854 int ret; 9855 9856 memset(&stack_fi, 0, sizeof(stack_fi)); 9857 9858 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); 9859 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); 9860 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); 9861 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); 9862 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); 9863 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 9864 /* Encryption and other encoding is reserved and all 0 */ 9865 9866 qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); 9867 if (qgroup_released < 0) 9868 return ERR_PTR(qgroup_released); 9869 9870 if (trans) { 9871 ret = insert_reserved_file_extent(trans, inode, 9872 file_offset, &stack_fi, 9873 true, qgroup_released); 9874 if (ret) 9875 goto free_qgroup; 9876 return trans; 9877 } 9878 9879 extent_info.disk_offset = start; 9880 
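	/*
	 * Describe the reserved extent to btrfs_replace_file_extents(): the
	 * whole on-disk extent (disk_offset/disk_len) is referenced by the new
	 * file extent item, starting at data_offset 0 for data_len bytes.
	 */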
extent_info.disk_len = len; 9881 extent_info.data_offset = 0; 9882 extent_info.data_len = len; 9883 extent_info.file_offset = file_offset; 9884 extent_info.extent_buf = (char *)&stack_fi; 9885 extent_info.is_new_extent = true; 9886 extent_info.qgroup_reserved = qgroup_released; 9887 extent_info.insertions = 0; 9888 9889 path = btrfs_alloc_path(); 9890 if (!path) { 9891 ret = -ENOMEM; 9892 goto free_qgroup; 9893 } 9894 9895 ret = btrfs_replace_file_extents(inode, path, file_offset, 9896 file_offset + len - 1, &extent_info, 9897 &trans); 9898 btrfs_free_path(path); 9899 if (ret) 9900 goto free_qgroup; 9901 return trans; 9902 9903 free_qgroup: 9904 /* 9905 * We have released qgroup data range at the beginning of the function, 9906 * and normally qgroup_released bytes will be freed when committing 9907 * transaction. 9908 * But if we error out early, we have to free what we have released 9909 * or we leak qgroup data reservation. 9910 */ 9911 btrfs_qgroup_free_refroot(inode->root->fs_info, 9912 inode->root->root_key.objectid, qgroup_released, 9913 BTRFS_QGROUP_RSV_DATA); 9914 return ERR_PTR(ret); 9915 } 9916 9917 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9918 u64 start, u64 num_bytes, u64 min_size, 9919 loff_t actual_len, u64 *alloc_hint, 9920 struct btrfs_trans_handle *trans) 9921 { 9922 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 9923 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 9924 struct extent_map *em; 9925 struct btrfs_root *root = BTRFS_I(inode)->root; 9926 struct btrfs_key ins; 9927 u64 cur_offset = start; 9928 u64 clear_offset = start; 9929 u64 i_size; 9930 u64 cur_bytes; 9931 u64 last_alloc = (u64)-1; 9932 int ret = 0; 9933 bool own_trans = true; 9934 u64 end = start + num_bytes - 1; 9935 9936 if (trans) 9937 own_trans = false; 9938 while (num_bytes > 0) { 9939 cur_bytes = min_t(u64, num_bytes, SZ_256M); 9940 cur_bytes = max(cur_bytes, min_size); 9941 /* 9942 * If we are severely fragmented we could end up with really 9943 * small allocations, so if the allocator is returning small 9944 * chunks lets make its job easier by only searching for those 9945 * sized chunks. 9946 */ 9947 cur_bytes = min(cur_bytes, last_alloc); 9948 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, 9949 min_size, 0, *alloc_hint, &ins, 1, 0); 9950 if (ret) 9951 break; 9952 9953 /* 9954 * We've reserved this space, and thus converted it from 9955 * ->bytes_may_use to ->bytes_reserved. Any error that happens 9956 * from here on out we will only need to clear our reservation 9957 * for the remaining unreserved area, so advance our 9958 * clear_offset by our extent size. 9959 */ 9960 clear_offset += ins.offset; 9961 9962 last_alloc = ins.offset; 9963 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), 9964 &ins, cur_offset); 9965 /* 9966 * Now that we inserted the prealloc extent we can finally 9967 * decrement the number of reservations in the block group. 9968 * If we did it before, we could race with relocation and have 9969 * relocation miss the reserved extent, making it fail later. 
9970 */ 9971 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 9972 if (IS_ERR(trans)) { 9973 ret = PTR_ERR(trans); 9974 btrfs_free_reserved_extent(fs_info, ins.objectid, 9975 ins.offset, 0); 9976 break; 9977 } 9978 9979 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 9980 cur_offset + ins.offset -1, 0); 9981 9982 em = alloc_extent_map(); 9983 if (!em) { 9984 btrfs_set_inode_full_sync(BTRFS_I(inode)); 9985 goto next; 9986 } 9987 9988 em->start = cur_offset; 9989 em->orig_start = cur_offset; 9990 em->len = ins.offset; 9991 em->block_start = ins.objectid; 9992 em->block_len = ins.offset; 9993 em->orig_block_len = ins.offset; 9994 em->ram_bytes = ins.offset; 9995 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 9996 em->generation = trans->transid; 9997 9998 while (1) { 9999 write_lock(&em_tree->lock); 10000 ret = add_extent_mapping(em_tree, em, 1); 10001 write_unlock(&em_tree->lock); 10002 if (ret != -EEXIST) 10003 break; 10004 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 10005 cur_offset + ins.offset - 1, 10006 0); 10007 } 10008 free_extent_map(em); 10009 next: 10010 num_bytes -= ins.offset; 10011 cur_offset += ins.offset; 10012 *alloc_hint = ins.objectid + ins.offset; 10013 10014 inode_inc_iversion(inode); 10015 inode->i_ctime = current_time(inode); 10016 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10017 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10018 (actual_len > inode->i_size) && 10019 (cur_offset > inode->i_size)) { 10020 if (cur_offset > actual_len) 10021 i_size = actual_len; 10022 else 10023 i_size = cur_offset; 10024 i_size_write(inode, i_size); 10025 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 10026 } 10027 10028 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10029 10030 if (ret) { 10031 btrfs_abort_transaction(trans, ret); 10032 if (own_trans) 10033 btrfs_end_transaction(trans); 10034 break; 10035 } 10036 10037 if (own_trans) { 10038 btrfs_end_transaction(trans); 10039 trans = NULL; 10040 } 10041 } 10042 if (clear_offset < end) 10043 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, 10044 end - clear_offset + 1); 10045 return ret; 10046 } 10047 10048 int btrfs_prealloc_file_range(struct inode *inode, int mode, 10049 u64 start, u64 num_bytes, u64 min_size, 10050 loff_t actual_len, u64 *alloc_hint) 10051 { 10052 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10053 min_size, actual_len, alloc_hint, 10054 NULL); 10055 } 10056 10057 int btrfs_prealloc_file_range_trans(struct inode *inode, 10058 struct btrfs_trans_handle *trans, int mode, 10059 u64 start, u64 num_bytes, u64 min_size, 10060 loff_t actual_len, u64 *alloc_hint) 10061 { 10062 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10063 min_size, actual_len, alloc_hint, trans); 10064 } 10065 10066 static int btrfs_permission(struct user_namespace *mnt_userns, 10067 struct inode *inode, int mask) 10068 { 10069 struct btrfs_root *root = BTRFS_I(inode)->root; 10070 umode_t mode = inode->i_mode; 10071 10072 if (mask & MAY_WRITE && 10073 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 10074 if (btrfs_root_readonly(root)) 10075 return -EROFS; 10076 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 10077 return -EACCES; 10078 } 10079 return generic_permission(mnt_userns, inode, mask); 10080 } 10081 10082 static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, 10083 struct dentry *dentry, umode_t mode) 10084 { 10085 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 10086 struct btrfs_trans_handle *trans; 10087 struct 
btrfs_root *root = BTRFS_I(dir)->root; 10088 struct inode *inode = NULL; 10089 u64 objectid; 10090 u64 index; 10091 int ret = 0; 10092 10093 /* 10094 * 5 units required for adding orphan entry 10095 */ 10096 trans = btrfs_start_transaction(root, 5); 10097 if (IS_ERR(trans)) 10098 return PTR_ERR(trans); 10099 10100 ret = btrfs_get_free_objectid(root, &objectid); 10101 if (ret) 10102 goto out; 10103 10104 inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, 10105 btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 10106 if (IS_ERR(inode)) { 10107 ret = PTR_ERR(inode); 10108 inode = NULL; 10109 goto out; 10110 } 10111 10112 inode->i_fop = &btrfs_file_operations; 10113 inode->i_op = &btrfs_file_inode_operations; 10114 10115 inode->i_mapping->a_ops = &btrfs_aops; 10116 10117 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 10118 if (ret) 10119 goto out; 10120 10121 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10122 if (ret) 10123 goto out; 10124 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10125 if (ret) 10126 goto out; 10127 10128 /* 10129 * We set number of links to 0 in btrfs_new_inode(), and here we set 10130 * it to 1 because d_tmpfile() will issue a warning if the count is 0, 10131 * through: 10132 * 10133 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 10134 */ 10135 set_nlink(inode, 1); 10136 d_tmpfile(dentry, inode); 10137 unlock_new_inode(inode); 10138 mark_inode_dirty(inode); 10139 out: 10140 btrfs_end_transaction(trans); 10141 if (ret && inode) 10142 discard_new_inode(inode); 10143 btrfs_btree_balance_dirty(fs_info); 10144 return ret; 10145 } 10146 10147 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) 10148 { 10149 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10150 unsigned long index = start >> PAGE_SHIFT; 10151 unsigned long end_index = end >> PAGE_SHIFT; 10152 struct page *page; 10153 u32 len; 10154 10155 ASSERT(end + 1 - start <= U32_MAX); 10156 len = end + 1 - start; 10157 while (index <= end_index) { 10158 page = find_get_page(inode->vfs_inode.i_mapping, index); 10159 ASSERT(page); /* Pages should be in the extent_io_tree */ 10160 10161 btrfs_page_set_writeback(fs_info, page, start, len); 10162 put_page(page); 10163 index++; 10164 } 10165 } 10166 10167 static int btrfs_encoded_io_compression_from_extent( 10168 struct btrfs_fs_info *fs_info, 10169 int compress_type) 10170 { 10171 switch (compress_type) { 10172 case BTRFS_COMPRESS_NONE: 10173 return BTRFS_ENCODED_IO_COMPRESSION_NONE; 10174 case BTRFS_COMPRESS_ZLIB: 10175 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; 10176 case BTRFS_COMPRESS_LZO: 10177 /* 10178 * The LZO format depends on the sector size. 64K is the maximum 10179 * sector size that we support. 
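		 * The value returned below encodes the sector size:
		 * sectorsize_bits of 12 (4K sectors) maps to
		 * BTRFS_ENCODED_IO_COMPRESSION_LZO_4K, 13 (8K) to the 8K
		 * variant, and so on up to 16 (64K).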
10180 */ 10181 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) 10182 return -EINVAL; 10183 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 10184 (fs_info->sectorsize_bits - 12); 10185 case BTRFS_COMPRESS_ZSTD: 10186 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; 10187 default: 10188 return -EUCLEAN; 10189 } 10190 } 10191 10192 static ssize_t btrfs_encoded_read_inline( 10193 struct kiocb *iocb, 10194 struct iov_iter *iter, u64 start, 10195 u64 lockend, 10196 struct extent_state **cached_state, 10197 u64 extent_start, size_t count, 10198 struct btrfs_ioctl_encoded_io_args *encoded, 10199 bool *unlocked) 10200 { 10201 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10202 struct btrfs_root *root = inode->root; 10203 struct btrfs_fs_info *fs_info = root->fs_info; 10204 struct extent_io_tree *io_tree = &inode->io_tree; 10205 struct btrfs_path *path; 10206 struct extent_buffer *leaf; 10207 struct btrfs_file_extent_item *item; 10208 u64 ram_bytes; 10209 unsigned long ptr; 10210 void *tmp; 10211 ssize_t ret; 10212 10213 path = btrfs_alloc_path(); 10214 if (!path) { 10215 ret = -ENOMEM; 10216 goto out; 10217 } 10218 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 10219 extent_start, 0); 10220 if (ret) { 10221 if (ret > 0) { 10222 /* The extent item disappeared? */ 10223 ret = -EIO; 10224 } 10225 goto out; 10226 } 10227 leaf = path->nodes[0]; 10228 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 10229 10230 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 10231 ptr = btrfs_file_extent_inline_start(item); 10232 10233 encoded->len = min_t(u64, extent_start + ram_bytes, 10234 inode->vfs_inode.i_size) - iocb->ki_pos; 10235 ret = btrfs_encoded_io_compression_from_extent(fs_info, 10236 btrfs_file_extent_compression(leaf, item)); 10237 if (ret < 0) 10238 goto out; 10239 encoded->compression = ret; 10240 if (encoded->compression) { 10241 size_t inline_size; 10242 10243 inline_size = btrfs_file_extent_inline_item_len(leaf, 10244 path->slots[0]); 10245 if (inline_size > count) { 10246 ret = -ENOBUFS; 10247 goto out; 10248 } 10249 count = inline_size; 10250 encoded->unencoded_len = ram_bytes; 10251 encoded->unencoded_offset = iocb->ki_pos - extent_start; 10252 } else { 10253 count = min_t(u64, count, encoded->len); 10254 encoded->len = count; 10255 encoded->unencoded_len = count; 10256 ptr += iocb->ki_pos - extent_start; 10257 } 10258 10259 tmp = kmalloc(count, GFP_NOFS); 10260 if (!tmp) { 10261 ret = -ENOMEM; 10262 goto out; 10263 } 10264 read_extent_buffer(leaf, tmp, ptr, count); 10265 btrfs_release_path(path); 10266 unlock_extent_cached(io_tree, start, lockend, cached_state); 10267 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10268 *unlocked = true; 10269 10270 ret = copy_to_iter(tmp, count, iter); 10271 if (ret != count) 10272 ret = -EFAULT; 10273 kfree(tmp); 10274 out: 10275 btrfs_free_path(path); 10276 return ret; 10277 } 10278 10279 struct btrfs_encoded_read_private { 10280 struct btrfs_inode *inode; 10281 u64 file_offset; 10282 wait_queue_head_t wait; 10283 atomic_t pending; 10284 blk_status_t status; 10285 bool skip_csum; 10286 }; 10287 10288 static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, 10289 struct bio *bio, int mirror_num) 10290 { 10291 struct btrfs_encoded_read_private *priv = bio->bi_private; 10292 struct btrfs_bio *bbio = btrfs_bio(bio); 10293 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10294 blk_status_t ret; 10295 10296 if (!priv->skip_csum) { 10297 ret = 
btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); 10298 if (ret) 10299 return ret; 10300 } 10301 10302 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 10303 if (ret) { 10304 btrfs_bio_free_csum(bbio); 10305 return ret; 10306 } 10307 10308 atomic_inc(&priv->pending); 10309 ret = btrfs_map_bio(fs_info, bio, mirror_num); 10310 if (ret) { 10311 atomic_dec(&priv->pending); 10312 btrfs_bio_free_csum(bbio); 10313 } 10314 return ret; 10315 } 10316 10317 static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) 10318 { 10319 const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); 10320 struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; 10321 struct btrfs_inode *inode = priv->inode; 10322 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10323 u32 sectorsize = fs_info->sectorsize; 10324 struct bio_vec *bvec; 10325 struct bvec_iter_all iter_all; 10326 u64 start = priv->file_offset; 10327 u32 bio_offset = 0; 10328 10329 if (priv->skip_csum || !uptodate) 10330 return bbio->bio.bi_status; 10331 10332 bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { 10333 unsigned int i, nr_sectors, pgoff; 10334 10335 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 10336 pgoff = bvec->bv_offset; 10337 for (i = 0; i < nr_sectors; i++) { 10338 ASSERT(pgoff < PAGE_SIZE); 10339 if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, 10340 bvec->bv_page, pgoff, start)) 10341 return BLK_STS_IOERR; 10342 start += sectorsize; 10343 bio_offset += sectorsize; 10344 pgoff += sectorsize; 10345 } 10346 } 10347 return BLK_STS_OK; 10348 } 10349 10350 static void btrfs_encoded_read_endio(struct bio *bio) 10351 { 10352 struct btrfs_encoded_read_private *priv = bio->bi_private; 10353 struct btrfs_bio *bbio = btrfs_bio(bio); 10354 blk_status_t status; 10355 10356 status = btrfs_encoded_read_verify_csum(bbio); 10357 if (status) { 10358 /* 10359 * The memory barrier implied by the atomic_dec_return() here 10360 * pairs with the memory barrier implied by the 10361 * atomic_dec_return() or io_wait_event() in 10362 * btrfs_encoded_read_regular_fill_pages() to ensure that this 10363 * write is observed before the load of status in 10364 * btrfs_encoded_read_regular_fill_pages(). 10365 */ 10366 WRITE_ONCE(priv->status, status); 10367 } 10368 if (!atomic_dec_return(&priv->pending)) 10369 wake_up(&priv->wait); 10370 btrfs_bio_free_csum(bbio); 10371 bio_put(bio); 10372 } 10373 10374 static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, 10375 u64 file_offset, 10376 u64 disk_bytenr, 10377 u64 disk_io_size, 10378 struct page **pages) 10379 { 10380 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10381 struct btrfs_encoded_read_private priv = { 10382 .inode = inode, 10383 .file_offset = file_offset, 10384 .pending = ATOMIC_INIT(1), 10385 .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), 10386 }; 10387 unsigned long i = 0; 10388 u64 cur = 0; 10389 int ret; 10390 10391 init_waitqueue_head(&priv.wait); 10392 /* 10393 * Submit bios for the extent, splitting due to bio or stripe limits as 10394 * necessary. 
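	 * priv.pending starts at 1 so the submission loop itself holds a
	 * reference; the final atomic_dec_return() below drops it and we only
	 * sleep in io_wait_event() if bios are still in flight.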
10395 */ 10396 while (cur < disk_io_size) { 10397 struct extent_map *em; 10398 struct btrfs_io_geometry geom; 10399 struct bio *bio = NULL; 10400 u64 remaining; 10401 10402 em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, 10403 disk_io_size - cur); 10404 if (IS_ERR(em)) { 10405 ret = PTR_ERR(em); 10406 } else { 10407 ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, 10408 disk_bytenr + cur, &geom); 10409 free_extent_map(em); 10410 } 10411 if (ret) { 10412 WRITE_ONCE(priv.status, errno_to_blk_status(ret)); 10413 break; 10414 } 10415 remaining = min(geom.len, disk_io_size - cur); 10416 while (bio || remaining) { 10417 size_t bytes = min_t(u64, remaining, PAGE_SIZE); 10418 10419 if (!bio) { 10420 bio = btrfs_bio_alloc(BIO_MAX_VECS); 10421 bio->bi_iter.bi_sector = 10422 (disk_bytenr + cur) >> SECTOR_SHIFT; 10423 bio->bi_end_io = btrfs_encoded_read_endio; 10424 bio->bi_private = &priv; 10425 bio->bi_opf = REQ_OP_READ; 10426 } 10427 10428 if (!bytes || 10429 bio_add_page(bio, pages[i], bytes, 0) < bytes) { 10430 blk_status_t status; 10431 10432 status = submit_encoded_read_bio(inode, bio, 0); 10433 if (status) { 10434 WRITE_ONCE(priv.status, status); 10435 bio_put(bio); 10436 goto out; 10437 } 10438 bio = NULL; 10439 continue; 10440 } 10441 10442 i++; 10443 cur += bytes; 10444 remaining -= bytes; 10445 } 10446 } 10447 10448 out: 10449 if (atomic_dec_return(&priv.pending)) 10450 io_wait_event(priv.wait, !atomic_read(&priv.pending)); 10451 /* See btrfs_encoded_read_endio() for ordering. */ 10452 return blk_status_to_errno(READ_ONCE(priv.status)); 10453 } 10454 10455 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, 10456 struct iov_iter *iter, 10457 u64 start, u64 lockend, 10458 struct extent_state **cached_state, 10459 u64 disk_bytenr, u64 disk_io_size, 10460 size_t count, bool compressed, 10461 bool *unlocked) 10462 { 10463 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10464 struct extent_io_tree *io_tree = &inode->io_tree; 10465 struct page **pages; 10466 unsigned long nr_pages, i; 10467 u64 cur; 10468 size_t page_offset; 10469 ssize_t ret; 10470 10471 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); 10472 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 10473 if (!pages) 10474 return -ENOMEM; 10475 for (i = 0; i < nr_pages; i++) { 10476 pages[i] = alloc_page(GFP_NOFS); 10477 if (!pages[i]) { 10478 ret = -ENOMEM; 10479 goto out; 10480 } 10481 } 10482 10483 ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, 10484 disk_io_size, pages); 10485 if (ret) 10486 goto out; 10487 10488 unlock_extent_cached(io_tree, start, lockend, cached_state); 10489 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10490 *unlocked = true; 10491 10492 if (compressed) { 10493 i = 0; 10494 page_offset = 0; 10495 } else { 10496 i = (iocb->ki_pos - start) >> PAGE_SHIFT; 10497 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); 10498 } 10499 cur = 0; 10500 while (cur < count) { 10501 size_t bytes = min_t(size_t, count - cur, 10502 PAGE_SIZE - page_offset); 10503 10504 if (copy_page_to_iter(pages[i], page_offset, bytes, 10505 iter) != bytes) { 10506 ret = -EFAULT; 10507 goto out; 10508 } 10509 i++; 10510 cur += bytes; 10511 page_offset = 0; 10512 } 10513 ret = count; 10514 out: 10515 for (i = 0; i < nr_pages; i++) { 10516 if (pages[i]) 10517 __free_page(pages[i]); 10518 } 10519 kfree(pages); 10520 return ret; 10521 } 10522 10523 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 10524 struct btrfs_ioctl_encoded_io_args 
*encoded) 10525 { 10526 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10527 struct btrfs_fs_info *fs_info = inode->root->fs_info; 10528 struct extent_io_tree *io_tree = &inode->io_tree; 10529 ssize_t ret; 10530 size_t count = iov_iter_count(iter); 10531 u64 start, lockend, disk_bytenr, disk_io_size; 10532 struct extent_state *cached_state = NULL; 10533 struct extent_map *em; 10534 bool unlocked = false; 10535 10536 file_accessed(iocb->ki_filp); 10537 10538 btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10539 10540 if (iocb->ki_pos >= inode->vfs_inode.i_size) { 10541 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10542 return 0; 10543 } 10544 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); 10545 /* 10546 * We don't know how long the extent containing iocb->ki_pos is, but if 10547 * it's compressed we know that it won't be longer than this. 10548 */ 10549 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; 10550 10551 for (;;) { 10552 struct btrfs_ordered_extent *ordered; 10553 10554 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, 10555 lockend - start + 1); 10556 if (ret) 10557 goto out_unlock_inode; 10558 lock_extent_bits(io_tree, start, lockend, &cached_state); 10559 ordered = btrfs_lookup_ordered_range(inode, start, 10560 lockend - start + 1); 10561 if (!ordered) 10562 break; 10563 btrfs_put_ordered_extent(ordered); 10564 unlock_extent_cached(io_tree, start, lockend, &cached_state); 10565 cond_resched(); 10566 } 10567 10568 em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); 10569 if (IS_ERR(em)) { 10570 ret = PTR_ERR(em); 10571 goto out_unlock_extent; 10572 } 10573 10574 if (em->block_start == EXTENT_MAP_INLINE) { 10575 u64 extent_start = em->start; 10576 10577 /* 10578 * For inline extents we get everything we need out of the 10579 * extent item. 10580 */ 10581 free_extent_map(em); 10582 em = NULL; 10583 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, 10584 &cached_state, extent_start, 10585 count, encoded, &unlocked); 10586 goto out; 10587 } 10588 10589 /* 10590 * We only want to return up to EOF even if the extent extends beyond 10591 * that. 10592 */ 10593 encoded->len = min_t(u64, extent_map_end(em), 10594 inode->vfs_inode.i_size) - iocb->ki_pos; 10595 if (em->block_start == EXTENT_MAP_HOLE || 10596 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 10597 disk_bytenr = EXTENT_MAP_HOLE; 10598 count = min_t(u64, count, encoded->len); 10599 encoded->len = count; 10600 encoded->unencoded_len = count; 10601 } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 10602 disk_bytenr = em->block_start; 10603 /* 10604 * Bail if the buffer isn't large enough to return the whole 10605 * compressed extent. 10606 */ 10607 if (em->block_len > count) { 10608 ret = -ENOBUFS; 10609 goto out_em; 10610 } 10611 disk_io_size = count = em->block_len; 10612 encoded->unencoded_len = em->ram_bytes; 10613 encoded->unencoded_offset = iocb->ki_pos - em->orig_start; 10614 ret = btrfs_encoded_io_compression_from_extent(fs_info, 10615 em->compress_type); 10616 if (ret < 0) 10617 goto out_em; 10618 encoded->compression = ret; 10619 } else { 10620 disk_bytenr = em->block_start + (start - em->start); 10621 if (encoded->len > count) 10622 encoded->len = count; 10623 /* 10624 * Don't read beyond what we locked. This also limits the page 10625 * allocations that we'll do. 
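		 * count remains the number of bytes that will be copied to the
		 * iterator, while disk_io_size is rounded up to sector
		 * granularity below for the actual read from disk.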
10626 */ 10627 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; 10628 count = start + disk_io_size - iocb->ki_pos; 10629 encoded->len = count; 10630 encoded->unencoded_len = count; 10631 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); 10632 } 10633 free_extent_map(em); 10634 em = NULL; 10635 10636 if (disk_bytenr == EXTENT_MAP_HOLE) { 10637 unlock_extent_cached(io_tree, start, lockend, &cached_state); 10638 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10639 unlocked = true; 10640 ret = iov_iter_zero(count, iter); 10641 if (ret != count) 10642 ret = -EFAULT; 10643 } else { 10644 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, 10645 &cached_state, disk_bytenr, 10646 disk_io_size, count, 10647 encoded->compression, 10648 &unlocked); 10649 } 10650 10651 out: 10652 if (ret >= 0) 10653 iocb->ki_pos += encoded->len; 10654 out_em: 10655 free_extent_map(em); 10656 out_unlock_extent: 10657 if (!unlocked) 10658 unlock_extent_cached(io_tree, start, lockend, &cached_state); 10659 out_unlock_inode: 10660 if (!unlocked) 10661 btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10662 return ret; 10663 } 10664 10665 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, 10666 const struct btrfs_ioctl_encoded_io_args *encoded) 10667 { 10668 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10669 struct btrfs_root *root = inode->root; 10670 struct btrfs_fs_info *fs_info = root->fs_info; 10671 struct extent_io_tree *io_tree = &inode->io_tree; 10672 struct extent_changeset *data_reserved = NULL; 10673 struct extent_state *cached_state = NULL; 10674 int compression; 10675 size_t orig_count; 10676 u64 start, end; 10677 u64 num_bytes, ram_bytes, disk_num_bytes; 10678 unsigned long nr_pages, i; 10679 struct page **pages; 10680 struct btrfs_key ins; 10681 bool extent_reserved = false; 10682 struct extent_map *em; 10683 ssize_t ret; 10684 10685 switch (encoded->compression) { 10686 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: 10687 compression = BTRFS_COMPRESS_ZLIB; 10688 break; 10689 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: 10690 compression = BTRFS_COMPRESS_ZSTD; 10691 break; 10692 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: 10693 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: 10694 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: 10695 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: 10696 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: 10697 /* The sector size must match for LZO. */ 10698 if (encoded->compression - 10699 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != 10700 fs_info->sectorsize_bits) 10701 return -EINVAL; 10702 compression = BTRFS_COMPRESS_LZO; 10703 break; 10704 default: 10705 return -EINVAL; 10706 } 10707 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) 10708 return -EINVAL; 10709 10710 orig_count = iov_iter_count(from); 10711 10712 /* The extent size must be sane. */ 10713 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || 10714 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) 10715 return -EINVAL; 10716 10717 /* 10718 * The compressed data must be smaller than the decompressed data. 10719 * 10720 * It's of course possible for data to compress to larger or the same 10721 * size, but the buffered I/O path falls back to no compression for such 10722 * data, and we don't want to break any assumptions by creating these 10723 * extents. 10724 * 10725 * Note that this is less strict than the current check we have that the 10726 * compressed data must be at least one sector smaller than the 10727 * decompressed data. 
We only want to enforce the weaker requirement 10728 * from old kernels that it is at least one byte smaller. 10729 */ 10730 if (orig_count >= encoded->unencoded_len) 10731 return -EINVAL; 10732 10733 /* The extent must start on a sector boundary. */ 10734 start = iocb->ki_pos; 10735 if (!IS_ALIGNED(start, fs_info->sectorsize)) 10736 return -EINVAL; 10737 10738 /* 10739 * The extent must end on a sector boundary. However, we allow a write 10740 * which ends at or extends i_size to have an unaligned length; we round 10741 * up the extent size and set i_size to the unaligned end. 10742 */ 10743 if (start + encoded->len < inode->vfs_inode.i_size && 10744 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) 10745 return -EINVAL; 10746 10747 /* Finally, the offset in the unencoded data must be sector-aligned. */ 10748 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) 10749 return -EINVAL; 10750 10751 num_bytes = ALIGN(encoded->len, fs_info->sectorsize); 10752 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); 10753 end = start + num_bytes - 1; 10754 10755 /* 10756 * If the extent cannot be inline, the compressed data on disk must be 10757 * sector-aligned. For convenience, we extend it with zeroes if it 10758 * isn't. 10759 */ 10760 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); 10761 nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); 10762 pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); 10763 if (!pages) 10764 return -ENOMEM; 10765 for (i = 0; i < nr_pages; i++) { 10766 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); 10767 char *kaddr; 10768 10769 pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); 10770 if (!pages[i]) { 10771 ret = -ENOMEM; 10772 goto out_pages; 10773 } 10774 kaddr = kmap(pages[i]); 10775 if (copy_from_iter(kaddr, bytes, from) != bytes) { 10776 kunmap(pages[i]); 10777 ret = -EFAULT; 10778 goto out_pages; 10779 } 10780 if (bytes < PAGE_SIZE) 10781 memset(kaddr + bytes, 0, PAGE_SIZE - bytes); 10782 kunmap(pages[i]); 10783 } 10784 10785 for (;;) { 10786 struct btrfs_ordered_extent *ordered; 10787 10788 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); 10789 if (ret) 10790 goto out_pages; 10791 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, 10792 start >> PAGE_SHIFT, 10793 end >> PAGE_SHIFT); 10794 if (ret) 10795 goto out_pages; 10796 lock_extent_bits(io_tree, start, end, &cached_state); 10797 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); 10798 if (!ordered && 10799 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) 10800 break; 10801 if (ordered) 10802 btrfs_put_ordered_extent(ordered); 10803 unlock_extent_cached(io_tree, start, end, &cached_state); 10804 cond_resched(); 10805 } 10806 10807 /* 10808 * We don't use the higher-level delalloc space functions because our 10809 * num_bytes and disk_num_bytes are different. 10810 */ 10811 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); 10812 if (ret) 10813 goto out_unlock; 10814 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); 10815 if (ret) 10816 goto out_free_data_space; 10817 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); 10818 if (ret) 10819 goto out_qgroup_free_data; 10820 10821 /* Try an inline extent first. 
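	 * cow_file_range_inline() returning 0 means the inline extent was
	 * created and the write is complete; a positive return means the data
	 * did not fit inline and we fall through to allocate a regular
	 * compressed extent.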
*/ 10822 if (start == 0 && encoded->unencoded_len == encoded->len && 10823 encoded->unencoded_offset == 0) { 10824 ret = cow_file_range_inline(inode, encoded->len, orig_count, 10825 compression, pages, true); 10826 if (ret <= 0) { 10827 if (ret == 0) 10828 ret = orig_count; 10829 goto out_delalloc_release; 10830 } 10831 } 10832 10833 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, 10834 disk_num_bytes, 0, 0, &ins, 1, 1); 10835 if (ret) 10836 goto out_delalloc_release; 10837 extent_reserved = true; 10838 10839 em = create_io_em(inode, start, num_bytes, 10840 start - encoded->unencoded_offset, ins.objectid, 10841 ins.offset, ins.offset, ram_bytes, compression, 10842 BTRFS_ORDERED_COMPRESSED); 10843 if (IS_ERR(em)) { 10844 ret = PTR_ERR(em); 10845 goto out_free_reserved; 10846 } 10847 free_extent_map(em); 10848 10849 ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, 10850 ins.objectid, ins.offset, 10851 encoded->unencoded_offset, 10852 (1 << BTRFS_ORDERED_ENCODED) | 10853 (1 << BTRFS_ORDERED_COMPRESSED), 10854 compression); 10855 if (ret) { 10856 btrfs_drop_extent_cache(inode, start, end, 0); 10857 goto out_free_reserved; 10858 } 10859 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 10860 10861 if (start + encoded->len > inode->vfs_inode.i_size) 10862 i_size_write(&inode->vfs_inode, start + encoded->len); 10863 10864 unlock_extent_cached(io_tree, start, end, &cached_state); 10865 10866 btrfs_delalloc_release_extents(inode, num_bytes); 10867 10868 if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, 10869 ins.offset, pages, nr_pages, 0, NULL, 10870 false)) { 10871 btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); 10872 ret = -EIO; 10873 goto out_pages; 10874 } 10875 ret = orig_count; 10876 goto out; 10877 10878 out_free_reserved: 10879 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 10880 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 10881 out_delalloc_release: 10882 btrfs_delalloc_release_extents(inode, num_bytes); 10883 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); 10884 out_qgroup_free_data: 10885 if (ret < 0) 10886 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); 10887 out_free_data_space: 10888 /* 10889 * If btrfs_reserve_extent() succeeded, then we already decremented 10890 * bytes_may_use. 10891 */ 10892 if (!extent_reserved) 10893 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); 10894 out_unlock: 10895 unlock_extent_cached(io_tree, start, end, &cached_state); 10896 out_pages: 10897 for (i = 0; i < nr_pages; i++) { 10898 if (pages[i]) 10899 __free_page(pages[i]); 10900 } 10901 kvfree(pages); 10902 out: 10903 if (ret >= 0) 10904 iocb->ki_pos += encoded->len; 10905 return ret; 10906 } 10907 10908 #ifdef CONFIG_SWAP 10909 /* 10910 * Add an entry indicating a block group or device which is pinned by a 10911 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a 10912 * negative errno on failure. 
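 * For a block group that is already pinned, its bg_extent_count is bumped
 * instead of inserting a duplicate rb-tree node.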
10913 */ 10914 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, 10915 bool is_block_group) 10916 { 10917 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10918 struct btrfs_swapfile_pin *sp, *entry; 10919 struct rb_node **p; 10920 struct rb_node *parent = NULL; 10921 10922 sp = kmalloc(sizeof(*sp), GFP_NOFS); 10923 if (!sp) 10924 return -ENOMEM; 10925 sp->ptr = ptr; 10926 sp->inode = inode; 10927 sp->is_block_group = is_block_group; 10928 sp->bg_extent_count = 1; 10929 10930 spin_lock(&fs_info->swapfile_pins_lock); 10931 p = &fs_info->swapfile_pins.rb_node; 10932 while (*p) { 10933 parent = *p; 10934 entry = rb_entry(parent, struct btrfs_swapfile_pin, node); 10935 if (sp->ptr < entry->ptr || 10936 (sp->ptr == entry->ptr && sp->inode < entry->inode)) { 10937 p = &(*p)->rb_left; 10938 } else if (sp->ptr > entry->ptr || 10939 (sp->ptr == entry->ptr && sp->inode > entry->inode)) { 10940 p = &(*p)->rb_right; 10941 } else { 10942 if (is_block_group) 10943 entry->bg_extent_count++; 10944 spin_unlock(&fs_info->swapfile_pins_lock); 10945 kfree(sp); 10946 return 1; 10947 } 10948 } 10949 rb_link_node(&sp->node, parent, p); 10950 rb_insert_color(&sp->node, &fs_info->swapfile_pins); 10951 spin_unlock(&fs_info->swapfile_pins_lock); 10952 return 0; 10953 } 10954 10955 /* Free all of the entries pinned by this swapfile. */ 10956 static void btrfs_free_swapfile_pins(struct inode *inode) 10957 { 10958 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 10959 struct btrfs_swapfile_pin *sp; 10960 struct rb_node *node, *next; 10961 10962 spin_lock(&fs_info->swapfile_pins_lock); 10963 node = rb_first(&fs_info->swapfile_pins); 10964 while (node) { 10965 next = rb_next(node); 10966 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 10967 if (sp->inode == inode) { 10968 rb_erase(&sp->node, &fs_info->swapfile_pins); 10969 if (sp->is_block_group) { 10970 btrfs_dec_block_group_swap_extents(sp->ptr, 10971 sp->bg_extent_count); 10972 btrfs_put_block_group(sp->ptr); 10973 } 10974 kfree(sp); 10975 } 10976 node = next; 10977 } 10978 spin_unlock(&fs_info->swapfile_pins_lock); 10979 } 10980 10981 struct btrfs_swap_info { 10982 u64 start; 10983 u64 block_start; 10984 u64 block_len; 10985 u64 lowest_ppage; 10986 u64 highest_ppage; 10987 unsigned long nr_pages; 10988 int nr_extents; 10989 }; 10990 10991 static int btrfs_add_swap_extent(struct swap_info_struct *sis, 10992 struct btrfs_swap_info *bsi) 10993 { 10994 unsigned long nr_pages; 10995 unsigned long max_pages; 10996 u64 first_ppage, first_ppage_reported, next_ppage; 10997 int ret; 10998 10999 /* 11000 * Our swapfile may have had its size extended after the swap header was 11001 * written. In that case activating the swapfile should not go beyond 11002 * the max size set in the swap header. 
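	 * The physical range is also trimmed to whole pages below: block_start
	 * is rounded up and block_start + block_len rounded down, so a chunk
	 * smaller than one page contributes no swap extent.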
11003 */ 11004 if (bsi->nr_pages >= sis->max) 11005 return 0; 11006 11007 max_pages = sis->max - bsi->nr_pages; 11008 first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; 11009 next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, 11010 PAGE_SIZE) >> PAGE_SHIFT; 11011 11012 if (first_ppage >= next_ppage) 11013 return 0; 11014 nr_pages = next_ppage - first_ppage; 11015 nr_pages = min(nr_pages, max_pages); 11016 11017 first_ppage_reported = first_ppage; 11018 if (bsi->start == 0) 11019 first_ppage_reported++; 11020 if (bsi->lowest_ppage > first_ppage_reported) 11021 bsi->lowest_ppage = first_ppage_reported; 11022 if (bsi->highest_ppage < (next_ppage - 1)) 11023 bsi->highest_ppage = next_ppage - 1; 11024 11025 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); 11026 if (ret < 0) 11027 return ret; 11028 bsi->nr_extents += ret; 11029 bsi->nr_pages += nr_pages; 11030 return 0; 11031 } 11032 11033 static void btrfs_swap_deactivate(struct file *file) 11034 { 11035 struct inode *inode = file_inode(file); 11036 11037 btrfs_free_swapfile_pins(inode); 11038 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); 11039 } 11040 11041 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, 11042 sector_t *span) 11043 { 11044 struct inode *inode = file_inode(file); 11045 struct btrfs_root *root = BTRFS_I(inode)->root; 11046 struct btrfs_fs_info *fs_info = root->fs_info; 11047 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 11048 struct extent_state *cached_state = NULL; 11049 struct extent_map *em = NULL; 11050 struct btrfs_device *device = NULL; 11051 struct btrfs_swap_info bsi = { 11052 .lowest_ppage = (sector_t)-1ULL, 11053 }; 11054 int ret = 0; 11055 u64 isize; 11056 u64 start; 11057 11058 /* 11059 * If the swap file was just created, make sure delalloc is done. If the 11060 * file changes again after this, the user is doing something stupid and 11061 * we don't really care. 11062 */ 11063 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 11064 if (ret) 11065 return ret; 11066 11067 /* 11068 * The inode is locked, so these flags won't change after we check them. 11069 */ 11070 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 11071 btrfs_warn(fs_info, "swapfile must not be compressed"); 11072 return -EINVAL; 11073 } 11074 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 11075 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 11076 return -EINVAL; 11077 } 11078 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 11079 btrfs_warn(fs_info, "swapfile must not be checksummed"); 11080 return -EINVAL; 11081 } 11082 11083 /* 11084 * Balance or device remove/replace/resize can move stuff around from 11085 * under us. The exclop protection makes sure they aren't running/won't 11086 * run concurrently while we are mapping the swap extents, and 11087 * fs_info->swapfile_pins prevents them from running while the swap 11088 * file is active and moving the extents. Note that this also prevents 11089 * a concurrent device add which isn't actually necessary, but it's not 11090 * really worth the trouble to allow it. 11091 */ 11092 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 11093 btrfs_warn(fs_info, 11094 "cannot activate swapfile while exclusive operation is running"); 11095 return -EBUSY; 11096 } 11097 11098 /* 11099 * Prevent snapshot creation while we are activating the swap file. 11100 * We do not want to race with snapshot creation. 
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	struct btrfs_device *device = NULL;
	struct btrfs_swap_info bsi = {
		.lowest_ppage = (sector_t)-1ULL,
	};
	int ret = 0;
	u64 isize;
	u64 start;

	/*
	 * If the swap file was just created, make sure delalloc is done. If
	 * the file changes again after this, the user is doing something
	 * stupid and we don't really care.
	 */
	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		return ret;

	/*
	 * The inode is locked, so these flags won't change after we check
	 * them.
	 */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
		btrfs_warn(fs_info, "swapfile must not be compressed");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_warn(fs_info, "swapfile must not be checksummed");
		return -EINVAL;
	}

	/*
	 * Balance or device remove/replace/resize can move stuff around from
	 * under us. The exclop protection makes sure they aren't running/won't
	 * run concurrently while we are mapping the swap extents, and
	 * fs_info->swapfile_pins prevents them from running while the swap
	 * file is active and moving the extents. Note that this also prevents
	 * a concurrent device add which isn't actually necessary, but it's not
	 * really worth the trouble to allow it.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
		btrfs_warn(fs_info,
	   "cannot activate swapfile while exclusive operation is running");
		return -EBUSY;
	}

	/*
	 * Prevent snapshot creation while we are activating the swap file.
	 * We do not want to race with snapshot creation. If snapshot creation
	 * already started before we bumped nr_swapfiles from 0 to 1 and
	 * completes before the first write into the swap file after it is
	 * activated, then that write would fall back to COW.
	 */
	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
		btrfs_exclop_finish(fs_info);
		btrfs_warn(fs_info,
	   "cannot activate swapfile because snapshot creation is in progress");
		return -EINVAL;
	}
	/*
	 * Snapshots can create extents which require COW even if NODATACOW is
	 * set. We use this counter to prevent snapshots. We must increment it
	 * before walking the extents because we don't want a concurrent
	 * snapshot to run after we've already checked the extents.
	 *
	 * It is possible that the subvolume is marked for deletion but has not
	 * been removed yet. To prevent this race, we check the root status
	 * before activating the swapfile.
	 */
	spin_lock(&root->root_item_lock);
	if (btrfs_root_dead(root)) {
		spin_unlock(&root->root_item_lock);

		btrfs_exclop_finish(fs_info);
		btrfs_warn(fs_info,
	   "cannot activate swapfile because subvolume %llu is being deleted",
			   root->root_key.objectid);
		return -EPERM;
	}
	atomic_inc(&root->nr_swapfiles);
	spin_unlock(&root->root_item_lock);

	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
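	/*
	 * Walk the file extent by extent with the range locked in the io tree:
	 * map each file extent to its physical location through the chunk
	 * tree, reject anything a swapfile cannot tolerate (holes, inline or
	 * compressed extents, extents that would need COW, RAID profiles or a
	 * second device), pin the backing block group, and merge physically
	 * contiguous extents into runs that are reported to the swap code via
	 * btrfs_add_swap_extent().
	 */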
	lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
	start = 0;
	while (start < isize) {
		u64 logical_block_start, physical_block_start;
		struct btrfs_block_group *bg;
		u64 len = isize - start;

		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			btrfs_warn(fs_info, "swapfile must not have holes");
			ret = -EINVAL;
			goto out;
		}
		if (em->block_start == EXTENT_MAP_INLINE) {
			/*
			 * It's unlikely we'll ever actually find ourselves
			 * here, as a file small enough to fit inline won't be
			 * big enough to store more than the swap header, but in
			 * case something changes in the future, let's catch it
			 * here rather than later.
			 */
			btrfs_warn(fs_info, "swapfile must not be inline");
			ret = -EINVAL;
			goto out;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			btrfs_warn(fs_info, "swapfile must not be compressed");
			ret = -EINVAL;
			goto out;
		}

		logical_block_start = em->block_start + (start - em->start);
		len = min(len, em->len - (start - em->start));
		free_extent_map(em);
		em = NULL;

		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
		if (ret < 0) {
			goto out;
		} else if (ret) {
			ret = 0;
		} else {
			btrfs_warn(fs_info,
				   "swapfile must not be copy-on-write");
			ret = -EINVAL;
			goto out;
		}

		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
			btrfs_warn(fs_info,
				   "swapfile must have single data profile");
			ret = -EINVAL;
			goto out;
		}

		if (device == NULL) {
			device = em->map_lookup->stripes[0].dev;
			ret = btrfs_add_swapfile_pin(inode, device, false);
			if (ret == 1)
				ret = 0;
			else if (ret)
				goto out;
		} else if (device != em->map_lookup->stripes[0].dev) {
			btrfs_warn(fs_info, "swapfile must be on one device");
			ret = -EINVAL;
			goto out;
		}

		physical_block_start = (em->map_lookup->stripes[0].physical +
					(logical_block_start - em->start));
		len = min(len, em->len - (logical_block_start - em->start));
		free_extent_map(em);
		em = NULL;

		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
		if (!bg) {
			btrfs_warn(fs_info,
			   "could not find block group containing swapfile");
			ret = -EINVAL;
			goto out;
		}

		if (!btrfs_inc_block_group_swap_extents(bg)) {
			btrfs_warn(fs_info,
			   "block group for swapfile at %llu is read-only%s",
				   bg->start,
				   atomic_read(&fs_info->scrubs_running) ?
					       " (scrub running)" : "");
			btrfs_put_block_group(bg);
			ret = -EINVAL;
			goto out;
		}

		ret = btrfs_add_swapfile_pin(inode, bg, true);
		if (ret) {
			btrfs_put_block_group(bg);
			if (ret == 1)
				ret = 0;
			else
				goto out;
		}

		if (bsi.block_len &&
		    bsi.block_start + bsi.block_len == physical_block_start) {
			bsi.block_len += len;
		} else {
			if (bsi.block_len) {
				ret = btrfs_add_swap_extent(sis, &bsi);
				if (ret)
					goto out;
			}
			bsi.start = start;
			bsi.block_start = physical_block_start;
			bsi.block_len = len;
		}

		start += len;
	}

	if (bsi.block_len)
		ret = btrfs_add_swap_extent(sis, &bsi);

out:
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);

	if (ret)
		btrfs_swap_deactivate(file);

	btrfs_drew_write_unlock(&root->snapshot_lock);

	btrfs_exclop_finish(fs_info);

	if (ret)
		return ret;

	if (device)
		sis->bdev = device->bdev;
	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
	sis->max = bsi.nr_pages;
	sis->pages = bsi.nr_pages - 1;
	sis->highest_bit = bsi.nr_pages - 1;
	return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
#endif
/*
 * Update the number of bytes used in the VFS' inode. When we replace extents
 * in a range (clone, dedupe, fallocate's zero range), we must update the
 * number of bytes used by the inode in an atomic manner, so that concurrent
 * stat(2) calls always get a correct value.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
			      const u64 add_bytes,
			      const u64 del_bytes)
{
	if (add_bytes == del_bytes)
		return;

	spin_lock(&inode->lock);
	if (del_bytes > 0)
		inode_sub_bytes(&inode->vfs_inode, del_bytes);
	if (add_bytes > 0)
		inode_add_bytes(&inode->vfs_inode, add_bytes);
	spin_unlock(&inode->lock);
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile	= btrfs_tmpfile,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.open		= btrfs_opendir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file. They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen. So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readahead	= btrfs_readahead,
	.direct_IO	= noop_direct_IO,
	.invalidate_folio = btrfs_invalidate_folio,
	.releasepage	= btrfs_releasepage,
#ifdef CONFIG_MIGRATION
	.migratepage	= btrfs_migratepage,
#endif
	.dirty_folio	= filemap_dirty_folio,
	.error_remove_page = generic_error_remove_page,
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};
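/*
 * The ->swap_activate and ->swap_deactivate callbacks above are how swapfiles
 * are supported without bmap: when a swapfile on btrfs is enabled, the generic
 * swap code calls btrfs_swap_activate(), which validates the file and reports
 * its physical extents directly, and btrfs_swap_deactivate() drops the pins
 * again when the swapfile is turned off.
 */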
static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};