// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the fill_delalloc() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()). Also note that the caller of the
 * fill_delalloc() callback already does proper cleanup for the first page of
 * the range, that is, it invokes the callback writepage_end_io_hook() for the
 * range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 const u64 offset,
						 const u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}
	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
					    bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.
 * The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
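 *
 * In outline, the range is only inlined when it starts at file offset 0,
 * ends within the first sector, fits in both BTRFS_MAX_INLINE_DATA_SIZE()
 * and the max_inline mount option, reaches the current end of the file,
 * and (for uncompressed data) does not already end on a sector boundary;
 * see the checks at the top of the function below.
 *
 * Returns 0 if an inline extent was created, 1 if the caller should fall
 * back to a regular extent, and a negative errno on error.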
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
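 *
 * Note that compression works on at most BTRFS_MAX_UNCOMPRESSED bytes of
 * the range at a time; for larger ranges the loop below (the 'again'
 * label) queues one async_extent per chunk and advances start by the
 * number of bytes actually consumed (total_in).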
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(inode, start, end, 0,
						    BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					       struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
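		/*
		 * An async_extent with no pages was queued by the
		 * uncompressed fallback path above, so write it out with
		 * cow_file_range() rather than the compressed submit path.
		 */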
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
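		 *
		 * Only PAGE_SET_WRITEBACK is passed here; writeback is ended
		 * later, once the compressed write finishes, or by the error
		 * path just below (PAGE_END_WRITEBACK) if submitting the
		 * compressed write fails.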
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_cow->write_flags)) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
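 *
 * When start is 0 an inline extent is attempted first; on success the
 * whole range is finished here and *page_started is set.  Otherwise the
 * range is cowed one reserved extent at a time, and the 'unlock' argument
 * controls whether each chunk's pages are unlocked as it completes.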
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(inode, start, end, 0,
					    BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(BTRFS_I(inode), start,
			start + num_bytes - 1, 0);

	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   fs_info->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(BTRFS_I(inode), start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
		PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
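	 * The rest of the range, for which no extent was allocated, still has
	 * its reservation accounted in bytes_may_use, so below we do pass
	 * EXTENT_CLEAR_DATA_RESV for it and let extent_clear_unlock_delalloc()
	 * release that part.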
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size,
					     start + cur_alloc_size,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	fs_info = root->fs_info;
	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written,
				unsigned int write_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;
		async_cow->write_flags = write_flags;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_SIZE) >>
			PAGE_SHIFT;
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
return 0; 1222 } 1223 1224 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, 1225 u64 bytenr, u64 num_bytes) 1226 { 1227 int ret; 1228 struct btrfs_ordered_sum *sums; 1229 LIST_HEAD(list); 1230 1231 ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, 1232 bytenr + num_bytes - 1, &list, 0); 1233 if (ret == 0 && list_empty(&list)) 1234 return 0; 1235 1236 while (!list_empty(&list)) { 1237 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1238 list_del(&sums->list); 1239 kfree(sums); 1240 } 1241 if (ret < 0) 1242 return ret; 1243 return 1; 1244 } 1245 1246 /* 1247 * when nowcow writeback call back. This checks for snapshots or COW copies 1248 * of the extents that exist in the file, and COWs the file as required. 1249 * 1250 * If no cow copies or snapshots exist, we write directly to the existing 1251 * blocks on disk 1252 */ 1253 static noinline int run_delalloc_nocow(struct inode *inode, 1254 struct page *locked_page, 1255 u64 start, u64 end, int *page_started, int force, 1256 unsigned long *nr_written) 1257 { 1258 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1259 struct btrfs_root *root = BTRFS_I(inode)->root; 1260 struct extent_buffer *leaf; 1261 struct btrfs_path *path; 1262 struct btrfs_file_extent_item *fi; 1263 struct btrfs_key found_key; 1264 struct extent_map *em; 1265 u64 cow_start; 1266 u64 cur_offset; 1267 u64 extent_end; 1268 u64 extent_offset; 1269 u64 disk_bytenr; 1270 u64 num_bytes; 1271 u64 disk_num_bytes; 1272 u64 ram_bytes; 1273 int extent_type; 1274 int ret; 1275 int type; 1276 int nocow; 1277 int check_prev = 1; 1278 bool nolock; 1279 u64 ino = btrfs_ino(BTRFS_I(inode)); 1280 1281 path = btrfs_alloc_path(); 1282 if (!path) { 1283 extent_clear_unlock_delalloc(inode, start, end, end, 1284 locked_page, 1285 EXTENT_LOCKED | EXTENT_DELALLOC | 1286 EXTENT_DO_ACCOUNTING | 1287 EXTENT_DEFRAG, PAGE_UNLOCK | 1288 PAGE_CLEAR_DIRTY | 1289 PAGE_SET_WRITEBACK | 1290 PAGE_END_WRITEBACK); 1291 return -ENOMEM; 1292 } 1293 1294 nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); 1295 1296 cow_start = (u64)-1; 1297 cur_offset = start; 1298 while (1) { 1299 ret = btrfs_lookup_file_extent(NULL, root, path, ino, 1300 cur_offset, 0); 1301 if (ret < 0) 1302 goto error; 1303 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1304 leaf = path->nodes[0]; 1305 btrfs_item_key_to_cpu(leaf, &found_key, 1306 path->slots[0] - 1); 1307 if (found_key.objectid == ino && 1308 found_key.type == BTRFS_EXTENT_DATA_KEY) 1309 path->slots[0]--; 1310 } 1311 check_prev = 0; 1312 next_slot: 1313 leaf = path->nodes[0]; 1314 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1315 ret = btrfs_next_leaf(root, path); 1316 if (ret < 0) { 1317 if (cow_start != (u64)-1) 1318 cur_offset = cow_start; 1319 goto error; 1320 } 1321 if (ret > 0) 1322 break; 1323 leaf = path->nodes[0]; 1324 } 1325 1326 nocow = 0; 1327 disk_bytenr = 0; 1328 num_bytes = 0; 1329 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1330 1331 if (found_key.objectid > ino) 1332 break; 1333 if (WARN_ON_ONCE(found_key.objectid < ino) || 1334 found_key.type < BTRFS_EXTENT_DATA_KEY) { 1335 path->slots[0]++; 1336 goto next_slot; 1337 } 1338 if (found_key.type > BTRFS_EXTENT_DATA_KEY || 1339 found_key.offset > end) 1340 break; 1341 1342 if (found_key.offset > cur_offset) { 1343 extent_end = found_key.offset; 1344 extent_type = 0; 1345 goto out_check; 1346 } 1347 1348 fi = btrfs_item_ptr(leaf, path->slots[0], 1349 struct btrfs_file_extent_item); 1350 extent_type = btrfs_file_extent_type(leaf, fi); 1351 
1352 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1353 if (extent_type == BTRFS_FILE_EXTENT_REG || 1354 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1355 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1356 extent_offset = btrfs_file_extent_offset(leaf, fi); 1357 extent_end = found_key.offset + 1358 btrfs_file_extent_num_bytes(leaf, fi); 1359 disk_num_bytes = 1360 btrfs_file_extent_disk_num_bytes(leaf, fi); 1361 if (extent_end <= start) { 1362 path->slots[0]++; 1363 goto next_slot; 1364 } 1365 if (disk_bytenr == 0) 1366 goto out_check; 1367 if (btrfs_file_extent_compression(leaf, fi) || 1368 btrfs_file_extent_encryption(leaf, fi) || 1369 btrfs_file_extent_other_encoding(leaf, fi)) 1370 goto out_check; 1371 /* 1372 * Do the same check as in btrfs_cross_ref_exist but 1373 * without the unnecessary search. 1374 */ 1375 if (btrfs_file_extent_generation(leaf, fi) <= 1376 btrfs_root_last_snapshot(&root->root_item)) 1377 goto out_check; 1378 if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1379 goto out_check; 1380 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 1381 goto out_check; 1382 ret = btrfs_cross_ref_exist(root, ino, 1383 found_key.offset - 1384 extent_offset, disk_bytenr); 1385 if (ret) { 1386 /* 1387 * ret could be -EIO if the above fails to read 1388 * metadata. 1389 */ 1390 if (ret < 0) { 1391 if (cow_start != (u64)-1) 1392 cur_offset = cow_start; 1393 goto error; 1394 } 1395 1396 WARN_ON_ONCE(nolock); 1397 goto out_check; 1398 } 1399 disk_bytenr += extent_offset; 1400 disk_bytenr += cur_offset - found_key.offset; 1401 num_bytes = min(end + 1, extent_end) - cur_offset; 1402 /* 1403 * if there are pending snapshots for this root, 1404 * we fall into common COW way. 1405 */ 1406 if (!nolock && atomic_read(&root->snapshot_force_cow)) 1407 goto out_check; 1408 /* 1409 * force cow if csum exists in the range. 1410 * this ensure that csum for a given extent are 1411 * either valid or do not exist. 1412 */ 1413 ret = csum_exist_in_range(fs_info, disk_bytenr, 1414 num_bytes); 1415 if (ret) { 1416 /* 1417 * ret could be -EIO if the above fails to read 1418 * metadata. 
1419 */ 1420 if (ret < 0) { 1421 if (cow_start != (u64)-1) 1422 cur_offset = cow_start; 1423 goto error; 1424 } 1425 WARN_ON_ONCE(nolock); 1426 goto out_check; 1427 } 1428 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) 1429 goto out_check; 1430 nocow = 1; 1431 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1432 extent_end = found_key.offset + 1433 btrfs_file_extent_ram_bytes(leaf, fi); 1434 extent_end = ALIGN(extent_end, 1435 fs_info->sectorsize); 1436 } else { 1437 BUG_ON(1); 1438 } 1439 out_check: 1440 if (extent_end <= start) { 1441 path->slots[0]++; 1442 if (nocow) 1443 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1444 goto next_slot; 1445 } 1446 if (!nocow) { 1447 if (cow_start == (u64)-1) 1448 cow_start = cur_offset; 1449 cur_offset = extent_end; 1450 if (cur_offset > end) 1451 break; 1452 path->slots[0]++; 1453 goto next_slot; 1454 } 1455 1456 btrfs_release_path(path); 1457 if (cow_start != (u64)-1) { 1458 ret = cow_file_range(inode, locked_page, 1459 cow_start, found_key.offset - 1, 1460 end, page_started, nr_written, 1, 1461 NULL); 1462 if (ret) { 1463 if (nocow) 1464 btrfs_dec_nocow_writers(fs_info, 1465 disk_bytenr); 1466 goto error; 1467 } 1468 cow_start = (u64)-1; 1469 } 1470 1471 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1472 u64 orig_start = found_key.offset - extent_offset; 1473 1474 em = create_io_em(inode, cur_offset, num_bytes, 1475 orig_start, 1476 disk_bytenr, /* block_start */ 1477 num_bytes, /* block_len */ 1478 disk_num_bytes, /* orig_block_len */ 1479 ram_bytes, BTRFS_COMPRESS_NONE, 1480 BTRFS_ORDERED_PREALLOC); 1481 if (IS_ERR(em)) { 1482 if (nocow) 1483 btrfs_dec_nocow_writers(fs_info, 1484 disk_bytenr); 1485 ret = PTR_ERR(em); 1486 goto error; 1487 } 1488 free_extent_map(em); 1489 } 1490 1491 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1492 type = BTRFS_ORDERED_PREALLOC; 1493 } else { 1494 type = BTRFS_ORDERED_NOCOW; 1495 } 1496 1497 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1498 num_bytes, num_bytes, type); 1499 if (nocow) 1500 btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1501 BUG_ON(ret); /* -ENOMEM */ 1502 1503 if (root->root_key.objectid == 1504 BTRFS_DATA_RELOC_TREE_OBJECTID) 1505 /* 1506 * Error handled later, as we must prevent 1507 * extent_clear_unlock_delalloc() in error handler 1508 * from freeing metadata of created ordered extent. 1509 */ 1510 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1511 num_bytes); 1512 1513 extent_clear_unlock_delalloc(inode, cur_offset, 1514 cur_offset + num_bytes - 1, end, 1515 locked_page, EXTENT_LOCKED | 1516 EXTENT_DELALLOC | 1517 EXTENT_CLEAR_DATA_RESV, 1518 PAGE_UNLOCK | PAGE_SET_PRIVATE2); 1519 1520 cur_offset = extent_end; 1521 1522 /* 1523 * btrfs_reloc_clone_csums() error, now we're OK to call error 1524 * handler, as metadata for created ordered extent will only 1525 * be freed by btrfs_finish_ordered_io(). 
1526 */ 1527 if (ret) 1528 goto error; 1529 if (cur_offset > end) 1530 break; 1531 } 1532 btrfs_release_path(path); 1533 1534 if (cur_offset <= end && cow_start == (u64)-1) { 1535 cow_start = cur_offset; 1536 cur_offset = end; 1537 } 1538 1539 if (cow_start != (u64)-1) { 1540 ret = cow_file_range(inode, locked_page, cow_start, end, end, 1541 page_started, nr_written, 1, NULL); 1542 if (ret) 1543 goto error; 1544 } 1545 1546 error: 1547 if (ret && cur_offset < end) 1548 extent_clear_unlock_delalloc(inode, cur_offset, end, end, 1549 locked_page, EXTENT_LOCKED | 1550 EXTENT_DELALLOC | EXTENT_DEFRAG | 1551 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1552 PAGE_CLEAR_DIRTY | 1553 PAGE_SET_WRITEBACK | 1554 PAGE_END_WRITEBACK); 1555 btrfs_free_path(path); 1556 return ret; 1557 } 1558 1559 static inline int need_force_cow(struct inode *inode, u64 start, u64 end) 1560 { 1561 1562 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 1563 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) 1564 return 0; 1565 1566 /* 1567 * @defrag_bytes is a hint value, no spinlock held here, 1568 * if is not zero, it means the file is defragging. 1569 * Force cow if given extent needs to be defragged. 1570 */ 1571 if (BTRFS_I(inode)->defrag_bytes && 1572 test_range_bit(&BTRFS_I(inode)->io_tree, start, end, 1573 EXTENT_DEFRAG, 0, NULL)) 1574 return 1; 1575 1576 return 0; 1577 } 1578 1579 /* 1580 * extent_io.c call back to do delayed allocation processing 1581 */ 1582 static int run_delalloc_range(void *private_data, struct page *locked_page, 1583 u64 start, u64 end, int *page_started, 1584 unsigned long *nr_written, 1585 struct writeback_control *wbc) 1586 { 1587 struct inode *inode = private_data; 1588 int ret; 1589 int force_cow = need_force_cow(inode, start, end); 1590 unsigned int write_flags = wbc_to_write_flags(wbc); 1591 1592 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { 1593 ret = run_delalloc_nocow(inode, locked_page, start, end, 1594 page_started, 1, nr_written); 1595 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { 1596 ret = run_delalloc_nocow(inode, locked_page, start, end, 1597 page_started, 0, nr_written); 1598 } else if (!inode_need_compress(inode, start, end)) { 1599 ret = cow_file_range(inode, locked_page, start, end, end, 1600 page_started, nr_written, 1, NULL); 1601 } else { 1602 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1603 &BTRFS_I(inode)->runtime_flags); 1604 ret = cow_file_range_async(inode, locked_page, start, end, 1605 page_started, nr_written, 1606 write_flags); 1607 } 1608 if (ret) 1609 btrfs_cleanup_ordered_extents(inode, start, end - start + 1); 1610 return ret; 1611 } 1612 1613 static void btrfs_split_extent_hook(void *private_data, 1614 struct extent_state *orig, u64 split) 1615 { 1616 struct inode *inode = private_data; 1617 u64 size; 1618 1619 /* not delalloc, ignore it */ 1620 if (!(orig->state & EXTENT_DELALLOC)) 1621 return; 1622 1623 size = orig->end - orig->start + 1; 1624 if (size > BTRFS_MAX_EXTENT_SIZE) { 1625 u32 num_extents; 1626 u64 new_size; 1627 1628 /* 1629 * See the explanation in btrfs_merge_extent_hook, the same 1630 * applies here, just in reverse. 
1631 */ 1632 new_size = orig->end - split + 1; 1633 num_extents = count_max_extents(new_size); 1634 new_size = split - orig->start; 1635 num_extents += count_max_extents(new_size); 1636 if (count_max_extents(size) >= num_extents) 1637 return; 1638 } 1639 1640 spin_lock(&BTRFS_I(inode)->lock); 1641 btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); 1642 spin_unlock(&BTRFS_I(inode)->lock); 1643 } 1644 1645 /* 1646 * extent_io.c merge_extent_hook, used to track merged delayed allocation 1647 * extents so we can keep track of new extents that are just merged onto old 1648 * extents, such as when we are doing sequential writes, so we can properly 1649 * account for the metadata space we'll need. 1650 */ 1651 static void btrfs_merge_extent_hook(void *private_data, 1652 struct extent_state *new, 1653 struct extent_state *other) 1654 { 1655 struct inode *inode = private_data; 1656 u64 new_size, old_size; 1657 u32 num_extents; 1658 1659 /* not delalloc, ignore it */ 1660 if (!(other->state & EXTENT_DELALLOC)) 1661 return; 1662 1663 if (new->start > other->start) 1664 new_size = new->end - other->start + 1; 1665 else 1666 new_size = other->end - new->start + 1; 1667 1668 /* we're not bigger than the max, unreserve the space and go */ 1669 if (new_size <= BTRFS_MAX_EXTENT_SIZE) { 1670 spin_lock(&BTRFS_I(inode)->lock); 1671 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1672 spin_unlock(&BTRFS_I(inode)->lock); 1673 return; 1674 } 1675 1676 /* 1677 * We have to add up either side to figure out how many extents were 1678 * accounted for before we merged into one big extent. If the number of 1679 * extents we accounted for is <= the amount we need for the new range 1680 * then we can return, otherwise drop. Think of it like this 1681 * 1682 * [ 4k][MAX_SIZE] 1683 * 1684 * So we've grown the extent by a MAX_SIZE extent, this would mean we 1685 * need 2 outstanding extents, on one side we have 1 and the other side 1686 * we have 1 so they are == and we can return. But in this case 1687 * 1688 * [MAX_SIZE+4k][MAX_SIZE+4k] 1689 * 1690 * Each range on their own accounts for 2 extents, but merged together 1691 * they are only 3 extents worth of accounting, so we need to drop in 1692 * this case. 
1693 */ 1694 old_size = other->end - other->start + 1; 1695 num_extents = count_max_extents(old_size); 1696 old_size = new->end - new->start + 1; 1697 num_extents += count_max_extents(old_size); 1698 if (count_max_extents(new_size) >= num_extents) 1699 return; 1700 1701 spin_lock(&BTRFS_I(inode)->lock); 1702 btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); 1703 spin_unlock(&BTRFS_I(inode)->lock); 1704 } 1705 1706 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 1707 struct inode *inode) 1708 { 1709 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1710 1711 spin_lock(&root->delalloc_lock); 1712 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1713 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1714 &root->delalloc_inodes); 1715 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1716 &BTRFS_I(inode)->runtime_flags); 1717 root->nr_delalloc_inodes++; 1718 if (root->nr_delalloc_inodes == 1) { 1719 spin_lock(&fs_info->delalloc_root_lock); 1720 BUG_ON(!list_empty(&root->delalloc_root)); 1721 list_add_tail(&root->delalloc_root, 1722 &fs_info->delalloc_roots); 1723 spin_unlock(&fs_info->delalloc_root_lock); 1724 } 1725 } 1726 spin_unlock(&root->delalloc_lock); 1727 } 1728 1729 1730 void __btrfs_del_delalloc_inode(struct btrfs_root *root, 1731 struct btrfs_inode *inode) 1732 { 1733 struct btrfs_fs_info *fs_info = root->fs_info; 1734 1735 if (!list_empty(&inode->delalloc_inodes)) { 1736 list_del_init(&inode->delalloc_inodes); 1737 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1738 &inode->runtime_flags); 1739 root->nr_delalloc_inodes--; 1740 if (!root->nr_delalloc_inodes) { 1741 ASSERT(list_empty(&root->delalloc_inodes)); 1742 spin_lock(&fs_info->delalloc_root_lock); 1743 BUG_ON(list_empty(&root->delalloc_root)); 1744 list_del_init(&root->delalloc_root); 1745 spin_unlock(&fs_info->delalloc_root_lock); 1746 } 1747 } 1748 } 1749 1750 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 1751 struct btrfs_inode *inode) 1752 { 1753 spin_lock(&root->delalloc_lock); 1754 __btrfs_del_delalloc_inode(root, inode); 1755 spin_unlock(&root->delalloc_lock); 1756 } 1757 1758 /* 1759 * extent_io.c set_bit_hook, used to track delayed allocation 1760 * bytes in this file, and to maintain the list of inodes that 1761 * have pending delalloc work to be done. 
1762 */ 1763 static void btrfs_set_bit_hook(void *private_data, 1764 struct extent_state *state, unsigned *bits) 1765 { 1766 struct inode *inode = private_data; 1767 1768 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1769 1770 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 1771 WARN_ON(1); 1772 /* 1773 * set_bit and clear bit hooks normally require _irqsave/restore 1774 * but in this case, we are only testing for the DELALLOC 1775 * bit, which is only set or cleared with irqs on 1776 */ 1777 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1778 struct btrfs_root *root = BTRFS_I(inode)->root; 1779 u64 len = state->end + 1 - state->start; 1780 u32 num_extents = count_max_extents(len); 1781 bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); 1782 1783 spin_lock(&BTRFS_I(inode)->lock); 1784 btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); 1785 spin_unlock(&BTRFS_I(inode)->lock); 1786 1787 /* For sanity tests */ 1788 if (btrfs_is_testing(fs_info)) 1789 return; 1790 1791 percpu_counter_add_batch(&fs_info->delalloc_bytes, len, 1792 fs_info->delalloc_batch); 1793 spin_lock(&BTRFS_I(inode)->lock); 1794 BTRFS_I(inode)->delalloc_bytes += len; 1795 if (*bits & EXTENT_DEFRAG) 1796 BTRFS_I(inode)->defrag_bytes += len; 1797 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1798 &BTRFS_I(inode)->runtime_flags)) 1799 btrfs_add_delalloc_inodes(root, inode); 1800 spin_unlock(&BTRFS_I(inode)->lock); 1801 } 1802 1803 if (!(state->state & EXTENT_DELALLOC_NEW) && 1804 (*bits & EXTENT_DELALLOC_NEW)) { 1805 spin_lock(&BTRFS_I(inode)->lock); 1806 BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - 1807 state->start; 1808 spin_unlock(&BTRFS_I(inode)->lock); 1809 } 1810 } 1811 1812 /* 1813 * extent_io.c clear_bit_hook, see set_bit_hook for why 1814 */ 1815 static void btrfs_clear_bit_hook(void *private_data, 1816 struct extent_state *state, 1817 unsigned *bits) 1818 { 1819 struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); 1820 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 1821 u64 len = state->end + 1 - state->start; 1822 u32 num_extents = count_max_extents(len); 1823 1824 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { 1825 spin_lock(&inode->lock); 1826 inode->defrag_bytes -= len; 1827 spin_unlock(&inode->lock); 1828 } 1829 1830 /* 1831 * set_bit and clear bit hooks normally require _irqsave/restore 1832 * but in this case, we are only testing for the DELALLOC 1833 * bit, which is only set or cleared with irqs on 1834 */ 1835 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1836 struct btrfs_root *root = inode->root; 1837 bool do_list = !btrfs_is_free_space_inode(inode); 1838 1839 spin_lock(&inode->lock); 1840 btrfs_mod_outstanding_extents(inode, -num_extents); 1841 spin_unlock(&inode->lock); 1842 1843 /* 1844 * We don't reserve metadata space for space cache inodes so we 1845 * don't need to call dellalloc_release_metadata if there is an 1846 * error. 1847 */ 1848 if (*bits & EXTENT_CLEAR_META_RESV && 1849 root != fs_info->tree_root) 1850 btrfs_delalloc_release_metadata(inode, len, false); 1851 1852 /* For sanity tests. 
*/ 1853 if (btrfs_is_testing(fs_info)) 1854 return; 1855 1856 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && 1857 do_list && !(state->state & EXTENT_NORESERVE) && 1858 (*bits & EXTENT_CLEAR_DATA_RESV)) 1859 btrfs_free_reserved_data_space_noquota( 1860 &inode->vfs_inode, 1861 state->start, len); 1862 1863 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, 1864 fs_info->delalloc_batch); 1865 spin_lock(&inode->lock); 1866 inode->delalloc_bytes -= len; 1867 if (do_list && inode->delalloc_bytes == 0 && 1868 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1869 &inode->runtime_flags)) 1870 btrfs_del_delalloc_inode(root, inode); 1871 spin_unlock(&inode->lock); 1872 } 1873 1874 if ((state->state & EXTENT_DELALLOC_NEW) && 1875 (*bits & EXTENT_DELALLOC_NEW)) { 1876 spin_lock(&inode->lock); 1877 ASSERT(inode->new_delalloc_bytes >= len); 1878 inode->new_delalloc_bytes -= len; 1879 spin_unlock(&inode->lock); 1880 } 1881 } 1882 1883 /* 1884 * Merge bio hook, this must check the chunk tree to make sure we don't create 1885 * bios that span stripes or chunks 1886 * 1887 * return 1 if page cannot be merged to bio 1888 * return 0 if page can be merged to bio 1889 * return error otherwise 1890 */ 1891 int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1892 size_t size, struct bio *bio, 1893 unsigned long bio_flags) 1894 { 1895 struct inode *inode = page->mapping->host; 1896 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1897 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 1898 u64 length = 0; 1899 u64 map_length; 1900 int ret; 1901 1902 if (bio_flags & EXTENT_BIO_COMPRESSED) 1903 return 0; 1904 1905 length = bio->bi_iter.bi_size; 1906 map_length = length; 1907 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 1908 NULL, 0); 1909 if (ret < 0) 1910 return ret; 1911 if (map_length < length + size) 1912 return 1; 1913 return 0; 1914 } 1915 1916 /* 1917 * in order to insert checksums into the metadata in large chunks, 1918 * we wait until bio submission time. All the pages in the bio are 1919 * checksummed and sums are attached onto the ordered extent record. 1920 * 1921 * At IO completion time the cums attached on the ordered extent record 1922 * are inserted into the btree 1923 */ 1924 static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, 1925 u64 bio_offset) 1926 { 1927 struct inode *inode = private_data; 1928 blk_status_t ret = 0; 1929 1930 ret = btrfs_csum_one_bio(inode, bio, 0, 0); 1931 BUG_ON(ret); /* -ENOMEM */ 1932 return 0; 1933 } 1934 1935 /* 1936 * in order to insert checksums into the metadata in large chunks, 1937 * we wait until bio submission time. All the pages in the bio are 1938 * checksummed and sums are attached onto the ordered extent record. 1939 * 1940 * At IO completion time the cums attached on the ordered extent record 1941 * are inserted into the btree 1942 */ 1943 blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, 1944 int mirror_num) 1945 { 1946 struct inode *inode = private_data; 1947 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1948 blk_status_t ret; 1949 1950 ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); 1951 if (ret) { 1952 bio->bi_status = ret; 1953 bio_endio(bio); 1954 } 1955 return ret; 1956 } 1957 1958 /* 1959 * extent_io.c submission hook. This does the right thing for csum calculation 1960 * on write, or reading the csums from the tree before a read. 
1961 * 1962 * Rules about async/sync submit, 1963 * a) read: sync submit 1964 * 1965 * b) write without checksum: sync submit 1966 * 1967 * c) write with checksum: 1968 * c-1) if bio is issued by fsync: sync submit 1969 * (sync_writers != 0) 1970 * 1971 * c-2) if root is reloc root: sync submit 1972 * (only in case of buffered IO) 1973 * 1974 * c-3) otherwise: async submit 1975 */ 1976 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, 1977 int mirror_num, unsigned long bio_flags, 1978 u64 bio_offset) 1979 { 1980 struct inode *inode = private_data; 1981 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1982 struct btrfs_root *root = BTRFS_I(inode)->root; 1983 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 1984 blk_status_t ret = 0; 1985 int skip_sum; 1986 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1987 1988 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1989 1990 if (btrfs_is_free_space_inode(BTRFS_I(inode))) 1991 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 1992 1993 if (bio_op(bio) != REQ_OP_WRITE) { 1994 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); 1995 if (ret) 1996 goto out; 1997 1998 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1999 ret = btrfs_submit_compressed_read(inode, bio, 2000 mirror_num, 2001 bio_flags); 2002 goto out; 2003 } else if (!skip_sum) { 2004 ret = btrfs_lookup_bio_sums(inode, bio, NULL); 2005 if (ret) 2006 goto out; 2007 } 2008 goto mapit; 2009 } else if (async && !skip_sum) { 2010 /* csum items have already been cloned */ 2011 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2012 goto mapit; 2013 /* we're doing a write, do the async checksumming */ 2014 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, 2015 bio_offset, inode, 2016 btrfs_submit_bio_start); 2017 goto out; 2018 } else if (!skip_sum) { 2019 ret = btrfs_csum_one_bio(inode, bio, 0, 0); 2020 if (ret) 2021 goto out; 2022 } 2023 2024 mapit: 2025 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); 2026 2027 out: 2028 if (ret) { 2029 bio->bi_status = ret; 2030 bio_endio(bio); 2031 } 2032 return ret; 2033 } 2034 2035 /* 2036 * given a list of ordered sums record them in the inode. This happens 2037 * at IO completion time based on sums calculated at bio submission time. 
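 *
 * The sums are the struct btrfs_ordered_sum entries hanging off
 * ordered_extent->list, attached at submission time by the checksumming
 * helpers above.  A failure here is returned to the caller, which is
 * expected to abort the transaction the way btrfs_finish_ordered_io()
 * does below.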
2038 */ 2039 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 2040 struct inode *inode, struct list_head *list) 2041 { 2042 struct btrfs_ordered_sum *sum; 2043 int ret; 2044 2045 list_for_each_entry(sum, list, list) { 2046 trans->adding_csums = true; 2047 ret = btrfs_csum_file_blocks(trans, 2048 BTRFS_I(inode)->root->fs_info->csum_root, sum); 2049 trans->adding_csums = false; 2050 if (ret) 2051 return ret; 2052 } 2053 return 0; 2054 } 2055 2056 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2057 unsigned int extra_bits, 2058 struct extent_state **cached_state, int dedupe) 2059 { 2060 WARN_ON((end & (PAGE_SIZE - 1)) == 0); 2061 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 2062 extra_bits, cached_state); 2063 } 2064 2065 /* see btrfs_writepage_start_hook for details on why this is required */ 2066 struct btrfs_writepage_fixup { 2067 struct page *page; 2068 struct btrfs_work work; 2069 }; 2070 2071 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2072 { 2073 struct btrfs_writepage_fixup *fixup; 2074 struct btrfs_ordered_extent *ordered; 2075 struct extent_state *cached_state = NULL; 2076 struct extent_changeset *data_reserved = NULL; 2077 struct page *page; 2078 struct inode *inode; 2079 u64 page_start; 2080 u64 page_end; 2081 int ret; 2082 2083 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2084 page = fixup->page; 2085 again: 2086 lock_page(page); 2087 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2088 ClearPageChecked(page); 2089 goto out_page; 2090 } 2091 2092 inode = page->mapping->host; 2093 page_start = page_offset(page); 2094 page_end = page_offset(page) + PAGE_SIZE - 1; 2095 2096 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 2097 &cached_state); 2098 2099 /* already ordered? We're done */ 2100 if (PagePrivate2(page)) 2101 goto out; 2102 2103 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 2104 PAGE_SIZE); 2105 if (ordered) { 2106 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2107 page_end, &cached_state); 2108 unlock_page(page); 2109 btrfs_start_ordered_extent(inode, ordered, 1); 2110 btrfs_put_ordered_extent(ordered); 2111 goto again; 2112 } 2113 2114 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 2115 PAGE_SIZE); 2116 if (ret) { 2117 mapping_set_error(page->mapping, ret); 2118 end_extent_writepage(page, ret, page_start, page_end); 2119 ClearPageChecked(page); 2120 goto out; 2121 } 2122 2123 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2124 &cached_state, 0); 2125 if (ret) { 2126 mapping_set_error(page->mapping, ret); 2127 end_extent_writepage(page, ret, page_start, page_end); 2128 ClearPageChecked(page); 2129 goto out; 2130 } 2131 2132 ClearPageChecked(page); 2133 set_page_dirty(page); 2134 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); 2135 out: 2136 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2137 &cached_state); 2138 out_page: 2139 unlock_page(page); 2140 put_page(page); 2141 kfree(fixup); 2142 extent_changeset_free(data_reserved); 2143 } 2144 2145 /* 2146 * There are a few paths in the higher layers of the kernel that directly 2147 * set the page dirty bit without asking the filesystem if it is a 2148 * good idea. This causes problems because we want to make sure COW 2149 * properly happens and the data=ordered rules are followed. 
2150 * 2151 * In our case any range that doesn't have the ORDERED bit set 2152 * hasn't been properly setup for IO. We kick off an async process 2153 * to fix it up. The async helper will wait for ordered extents, set 2154 * the delalloc bit and make it safe to write the page. 2155 */ 2156 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 2157 { 2158 struct inode *inode = page->mapping->host; 2159 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2160 struct btrfs_writepage_fixup *fixup; 2161 2162 /* this page is properly in the ordered list */ 2163 if (TestClearPagePrivate2(page)) 2164 return 0; 2165 2166 if (PageChecked(page)) 2167 return -EAGAIN; 2168 2169 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2170 if (!fixup) 2171 return -EAGAIN; 2172 2173 SetPageChecked(page); 2174 get_page(page); 2175 btrfs_init_work(&fixup->work, btrfs_fixup_helper, 2176 btrfs_writepage_fixup_worker, NULL, NULL); 2177 fixup->page = page; 2178 btrfs_queue_work(fs_info->fixup_workers, &fixup->work); 2179 return -EBUSY; 2180 } 2181 2182 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2183 struct inode *inode, u64 file_pos, 2184 u64 disk_bytenr, u64 disk_num_bytes, 2185 u64 num_bytes, u64 ram_bytes, 2186 u8 compression, u8 encryption, 2187 u16 other_encoding, int extent_type) 2188 { 2189 struct btrfs_root *root = BTRFS_I(inode)->root; 2190 struct btrfs_file_extent_item *fi; 2191 struct btrfs_path *path; 2192 struct extent_buffer *leaf; 2193 struct btrfs_key ins; 2194 u64 qg_released; 2195 int extent_inserted = 0; 2196 int ret; 2197 2198 path = btrfs_alloc_path(); 2199 if (!path) 2200 return -ENOMEM; 2201 2202 /* 2203 * we may be replacing one extent in the tree with another. 2204 * The new extent is pinned in the extent map, and we don't want 2205 * to drop it from the cache until it is completely in the btree. 2206 * 2207 * So, tell btrfs_drop_extents to leave this extent in the cache. 2208 * the caller is expected to unpin it and allow it to be merged 2209 * with the others. 
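 *
 * (In the ordered extent path that unpin is the unpin_extent_cache()
 * call btrfs_finish_ordered_io() makes after this function returns.)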
2210 */ 2211 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, 2212 file_pos + num_bytes, NULL, 0, 2213 1, sizeof(*fi), &extent_inserted); 2214 if (ret) 2215 goto out; 2216 2217 if (!extent_inserted) { 2218 ins.objectid = btrfs_ino(BTRFS_I(inode)); 2219 ins.offset = file_pos; 2220 ins.type = BTRFS_EXTENT_DATA_KEY; 2221 2222 path->leave_spinning = 1; 2223 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2224 sizeof(*fi)); 2225 if (ret) 2226 goto out; 2227 } 2228 leaf = path->nodes[0]; 2229 fi = btrfs_item_ptr(leaf, path->slots[0], 2230 struct btrfs_file_extent_item); 2231 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2232 btrfs_set_file_extent_type(leaf, fi, extent_type); 2233 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 2234 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 2235 btrfs_set_file_extent_offset(leaf, fi, 0); 2236 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2237 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 2238 btrfs_set_file_extent_compression(leaf, fi, compression); 2239 btrfs_set_file_extent_encryption(leaf, fi, encryption); 2240 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 2241 2242 btrfs_mark_buffer_dirty(leaf); 2243 btrfs_release_path(path); 2244 2245 inode_add_bytes(inode, num_bytes); 2246 2247 ins.objectid = disk_bytenr; 2248 ins.offset = disk_num_bytes; 2249 ins.type = BTRFS_EXTENT_ITEM_KEY; 2250 2251 /* 2252 * Release the reserved range from inode dirty range map, as it is 2253 * already moved into delayed_ref_head 2254 */ 2255 ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); 2256 if (ret < 0) 2257 goto out; 2258 qg_released = ret; 2259 ret = btrfs_alloc_reserved_file_extent(trans, root, 2260 btrfs_ino(BTRFS_I(inode)), 2261 file_pos, qg_released, &ins); 2262 out: 2263 btrfs_free_path(path); 2264 2265 return ret; 2266 } 2267 2268 /* snapshot-aware defrag */ 2269 struct sa_defrag_extent_backref { 2270 struct rb_node node; 2271 struct old_sa_defrag_extent *old; 2272 u64 root_id; 2273 u64 inum; 2274 u64 file_pos; 2275 u64 extent_offset; 2276 u64 num_bytes; 2277 u64 generation; 2278 }; 2279 2280 struct old_sa_defrag_extent { 2281 struct list_head list; 2282 struct new_sa_defrag_extent *new; 2283 2284 u64 extent_offset; 2285 u64 bytenr; 2286 u64 offset; 2287 u64 len; 2288 int count; 2289 }; 2290 2291 struct new_sa_defrag_extent { 2292 struct rb_root root; 2293 struct list_head head; 2294 struct btrfs_path *path; 2295 struct inode *inode; 2296 u64 file_pos; 2297 u64 len; 2298 u64 bytenr; 2299 u64 disk_len; 2300 u8 compress_type; 2301 }; 2302 2303 static int backref_comp(struct sa_defrag_extent_backref *b1, 2304 struct sa_defrag_extent_backref *b2) 2305 { 2306 if (b1->root_id < b2->root_id) 2307 return -1; 2308 else if (b1->root_id > b2->root_id) 2309 return 1; 2310 2311 if (b1->inum < b2->inum) 2312 return -1; 2313 else if (b1->inum > b2->inum) 2314 return 1; 2315 2316 if (b1->file_pos < b2->file_pos) 2317 return -1; 2318 else if (b1->file_pos > b2->file_pos) 2319 return 1; 2320 2321 /* 2322 * [------------------------------] ===> (a range of space) 2323 * |<--->| |<---->| =============> (fs/file tree A) 2324 * |<---------------------------->| ===> (fs/file tree B) 2325 * 2326 * A range of space can refer to two file extents in one tree while 2327 * refer to only one file extent in another tree. 
2328 * 2329 * So we may process a disk offset more than one time(two extents in A) 2330 * and locate at the same extent(one extent in B), then insert two same 2331 * backrefs(both refer to the extent in B). 2332 */ 2333 return 0; 2334 } 2335 2336 static void backref_insert(struct rb_root *root, 2337 struct sa_defrag_extent_backref *backref) 2338 { 2339 struct rb_node **p = &root->rb_node; 2340 struct rb_node *parent = NULL; 2341 struct sa_defrag_extent_backref *entry; 2342 int ret; 2343 2344 while (*p) { 2345 parent = *p; 2346 entry = rb_entry(parent, struct sa_defrag_extent_backref, node); 2347 2348 ret = backref_comp(backref, entry); 2349 if (ret < 0) 2350 p = &(*p)->rb_left; 2351 else 2352 p = &(*p)->rb_right; 2353 } 2354 2355 rb_link_node(&backref->node, parent, p); 2356 rb_insert_color(&backref->node, root); 2357 } 2358 2359 /* 2360 * Note the backref might has changed, and in this case we just return 0. 2361 */ 2362 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, 2363 void *ctx) 2364 { 2365 struct btrfs_file_extent_item *extent; 2366 struct old_sa_defrag_extent *old = ctx; 2367 struct new_sa_defrag_extent *new = old->new; 2368 struct btrfs_path *path = new->path; 2369 struct btrfs_key key; 2370 struct btrfs_root *root; 2371 struct sa_defrag_extent_backref *backref; 2372 struct extent_buffer *leaf; 2373 struct inode *inode = new->inode; 2374 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2375 int slot; 2376 int ret; 2377 u64 extent_offset; 2378 u64 num_bytes; 2379 2380 if (BTRFS_I(inode)->root->root_key.objectid == root_id && 2381 inum == btrfs_ino(BTRFS_I(inode))) 2382 return 0; 2383 2384 key.objectid = root_id; 2385 key.type = BTRFS_ROOT_ITEM_KEY; 2386 key.offset = (u64)-1; 2387 2388 root = btrfs_read_fs_root_no_name(fs_info, &key); 2389 if (IS_ERR(root)) { 2390 if (PTR_ERR(root) == -ENOENT) 2391 return 0; 2392 WARN_ON(1); 2393 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", 2394 inum, offset, root_id); 2395 return PTR_ERR(root); 2396 } 2397 2398 key.objectid = inum; 2399 key.type = BTRFS_EXTENT_DATA_KEY; 2400 if (offset > (u64)-1 << 32) 2401 key.offset = 0; 2402 else 2403 key.offset = offset; 2404 2405 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2406 if (WARN_ON(ret < 0)) 2407 return ret; 2408 ret = 0; 2409 2410 while (1) { 2411 cond_resched(); 2412 2413 leaf = path->nodes[0]; 2414 slot = path->slots[0]; 2415 2416 if (slot >= btrfs_header_nritems(leaf)) { 2417 ret = btrfs_next_leaf(root, path); 2418 if (ret < 0) { 2419 goto out; 2420 } else if (ret > 0) { 2421 ret = 0; 2422 goto out; 2423 } 2424 continue; 2425 } 2426 2427 path->slots[0]++; 2428 2429 btrfs_item_key_to_cpu(leaf, &key, slot); 2430 2431 if (key.objectid > inum) 2432 goto out; 2433 2434 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) 2435 continue; 2436 2437 extent = btrfs_item_ptr(leaf, slot, 2438 struct btrfs_file_extent_item); 2439 2440 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) 2441 continue; 2442 2443 /* 2444 * 'offset' refers to the exact key.offset, 2445 * NOT the 'offset' field in btrfs_extent_data_ref, ie. 2446 * (key.offset - extent_offset). 
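 *
 * A made-up example: a file extent item keyed at file offset 1MiB whose
 * extent_offset is 64KiB has a data ref offset of 1MiB - 64KiB = 960KiB,
 * but the 'offset' we compare against key.offset here is the full 1MiB.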
2447 */ 2448 if (key.offset != offset) 2449 continue; 2450 2451 extent_offset = btrfs_file_extent_offset(leaf, extent); 2452 num_bytes = btrfs_file_extent_num_bytes(leaf, extent); 2453 2454 if (extent_offset >= old->extent_offset + old->offset + 2455 old->len || extent_offset + num_bytes <= 2456 old->extent_offset + old->offset) 2457 continue; 2458 break; 2459 } 2460 2461 backref = kmalloc(sizeof(*backref), GFP_NOFS); 2462 if (!backref) { 2463 ret = -ENOENT; 2464 goto out; 2465 } 2466 2467 backref->root_id = root_id; 2468 backref->inum = inum; 2469 backref->file_pos = offset; 2470 backref->num_bytes = num_bytes; 2471 backref->extent_offset = extent_offset; 2472 backref->generation = btrfs_file_extent_generation(leaf, extent); 2473 backref->old = old; 2474 backref_insert(&new->root, backref); 2475 old->count++; 2476 out: 2477 btrfs_release_path(path); 2478 WARN_ON(ret); 2479 return ret; 2480 } 2481 2482 static noinline bool record_extent_backrefs(struct btrfs_path *path, 2483 struct new_sa_defrag_extent *new) 2484 { 2485 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2486 struct old_sa_defrag_extent *old, *tmp; 2487 int ret; 2488 2489 new->path = path; 2490 2491 list_for_each_entry_safe(old, tmp, &new->head, list) { 2492 ret = iterate_inodes_from_logical(old->bytenr + 2493 old->extent_offset, fs_info, 2494 path, record_one_backref, 2495 old, false); 2496 if (ret < 0 && ret != -ENOENT) 2497 return false; 2498 2499 /* no backref to be processed for this extent */ 2500 if (!old->count) { 2501 list_del(&old->list); 2502 kfree(old); 2503 } 2504 } 2505 2506 if (list_empty(&new->head)) 2507 return false; 2508 2509 return true; 2510 } 2511 2512 static int relink_is_mergable(struct extent_buffer *leaf, 2513 struct btrfs_file_extent_item *fi, 2514 struct new_sa_defrag_extent *new) 2515 { 2516 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) 2517 return 0; 2518 2519 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2520 return 0; 2521 2522 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) 2523 return 0; 2524 2525 if (btrfs_file_extent_encryption(leaf, fi) || 2526 btrfs_file_extent_other_encoding(leaf, fi)) 2527 return 0; 2528 2529 return 1; 2530 } 2531 2532 /* 2533 * Note the backref might has changed, and in this case we just return 0. 
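 *
 * Return value, as consumed by relink_file_extents(): 1 if the backref
 * was relinked (the caller keeps it as 'prev' so the next backref may be
 * merged with it), 0 if it was skipped, negative errno on failure.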
2534 */ 2535 static noinline int relink_extent_backref(struct btrfs_path *path, 2536 struct sa_defrag_extent_backref *prev, 2537 struct sa_defrag_extent_backref *backref) 2538 { 2539 struct btrfs_file_extent_item *extent; 2540 struct btrfs_file_extent_item *item; 2541 struct btrfs_ordered_extent *ordered; 2542 struct btrfs_trans_handle *trans; 2543 struct btrfs_root *root; 2544 struct btrfs_key key; 2545 struct extent_buffer *leaf; 2546 struct old_sa_defrag_extent *old = backref->old; 2547 struct new_sa_defrag_extent *new = old->new; 2548 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2549 struct inode *inode; 2550 struct extent_state *cached = NULL; 2551 int ret = 0; 2552 u64 start; 2553 u64 len; 2554 u64 lock_start; 2555 u64 lock_end; 2556 bool merge = false; 2557 int index; 2558 2559 if (prev && prev->root_id == backref->root_id && 2560 prev->inum == backref->inum && 2561 prev->file_pos + prev->num_bytes == backref->file_pos) 2562 merge = true; 2563 2564 /* step 1: get root */ 2565 key.objectid = backref->root_id; 2566 key.type = BTRFS_ROOT_ITEM_KEY; 2567 key.offset = (u64)-1; 2568 2569 index = srcu_read_lock(&fs_info->subvol_srcu); 2570 2571 root = btrfs_read_fs_root_no_name(fs_info, &key); 2572 if (IS_ERR(root)) { 2573 srcu_read_unlock(&fs_info->subvol_srcu, index); 2574 if (PTR_ERR(root) == -ENOENT) 2575 return 0; 2576 return PTR_ERR(root); 2577 } 2578 2579 if (btrfs_root_readonly(root)) { 2580 srcu_read_unlock(&fs_info->subvol_srcu, index); 2581 return 0; 2582 } 2583 2584 /* step 2: get inode */ 2585 key.objectid = backref->inum; 2586 key.type = BTRFS_INODE_ITEM_KEY; 2587 key.offset = 0; 2588 2589 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 2590 if (IS_ERR(inode)) { 2591 srcu_read_unlock(&fs_info->subvol_srcu, index); 2592 return 0; 2593 } 2594 2595 srcu_read_unlock(&fs_info->subvol_srcu, index); 2596 2597 /* step 3: relink backref */ 2598 lock_start = backref->file_pos; 2599 lock_end = backref->file_pos + backref->num_bytes - 1; 2600 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2601 &cached); 2602 2603 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); 2604 if (ordered) { 2605 btrfs_put_ordered_extent(ordered); 2606 goto out_unlock; 2607 } 2608 2609 trans = btrfs_join_transaction(root); 2610 if (IS_ERR(trans)) { 2611 ret = PTR_ERR(trans); 2612 goto out_unlock; 2613 } 2614 2615 key.objectid = backref->inum; 2616 key.type = BTRFS_EXTENT_DATA_KEY; 2617 key.offset = backref->file_pos; 2618 2619 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2620 if (ret < 0) { 2621 goto out_free_path; 2622 } else if (ret > 0) { 2623 ret = 0; 2624 goto out_free_path; 2625 } 2626 2627 extent = btrfs_item_ptr(path->nodes[0], path->slots[0], 2628 struct btrfs_file_extent_item); 2629 2630 if (btrfs_file_extent_generation(path->nodes[0], extent) != 2631 backref->generation) 2632 goto out_free_path; 2633 2634 btrfs_release_path(path); 2635 2636 start = backref->file_pos; 2637 if (backref->extent_offset < old->extent_offset + old->offset) 2638 start += old->extent_offset + old->offset - 2639 backref->extent_offset; 2640 2641 len = min(backref->extent_offset + backref->num_bytes, 2642 old->extent_offset + old->offset + old->len); 2643 len -= max(backref->extent_offset, old->extent_offset + old->offset); 2644 2645 ret = btrfs_drop_extents(trans, root, inode, start, 2646 start + len, 1); 2647 if (ret) 2648 goto out_free_path; 2649 again: 2650 key.objectid = btrfs_ino(BTRFS_I(inode)); 2651 key.type = BTRFS_EXTENT_DATA_KEY; 2652 key.offset = start; 2653 
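/*
 * If 'merge' is set, first try to extend the file extent item that ends
 * exactly at 'start'.  When that item turns out not to be mergable,
 * 'merge' is cleared, the path is released and we come back to 'again'
 * to insert a fresh file extent item instead.
 */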
2654 path->leave_spinning = 1; 2655 if (merge) { 2656 struct btrfs_file_extent_item *fi; 2657 u64 extent_len; 2658 struct btrfs_key found_key; 2659 2660 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2661 if (ret < 0) 2662 goto out_free_path; 2663 2664 path->slots[0]--; 2665 leaf = path->nodes[0]; 2666 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2667 2668 fi = btrfs_item_ptr(leaf, path->slots[0], 2669 struct btrfs_file_extent_item); 2670 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 2671 2672 if (extent_len + found_key.offset == start && 2673 relink_is_mergable(leaf, fi, new)) { 2674 btrfs_set_file_extent_num_bytes(leaf, fi, 2675 extent_len + len); 2676 btrfs_mark_buffer_dirty(leaf); 2677 inode_add_bytes(inode, len); 2678 2679 ret = 1; 2680 goto out_free_path; 2681 } else { 2682 merge = false; 2683 btrfs_release_path(path); 2684 goto again; 2685 } 2686 } 2687 2688 ret = btrfs_insert_empty_item(trans, root, path, &key, 2689 sizeof(*extent)); 2690 if (ret) { 2691 btrfs_abort_transaction(trans, ret); 2692 goto out_free_path; 2693 } 2694 2695 leaf = path->nodes[0]; 2696 item = btrfs_item_ptr(leaf, path->slots[0], 2697 struct btrfs_file_extent_item); 2698 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); 2699 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); 2700 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); 2701 btrfs_set_file_extent_num_bytes(leaf, item, len); 2702 btrfs_set_file_extent_ram_bytes(leaf, item, new->len); 2703 btrfs_set_file_extent_generation(leaf, item, trans->transid); 2704 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 2705 btrfs_set_file_extent_compression(leaf, item, new->compress_type); 2706 btrfs_set_file_extent_encryption(leaf, item, 0); 2707 btrfs_set_file_extent_other_encoding(leaf, item, 0); 2708 2709 btrfs_mark_buffer_dirty(leaf); 2710 inode_add_bytes(inode, len); 2711 btrfs_release_path(path); 2712 2713 ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2714 new->disk_len, 0, 2715 backref->root_id, backref->inum, 2716 new->file_pos); /* start - extent_offset */ 2717 if (ret) { 2718 btrfs_abort_transaction(trans, ret); 2719 goto out_free_path; 2720 } 2721 2722 ret = 1; 2723 out_free_path: 2724 btrfs_release_path(path); 2725 path->leave_spinning = 0; 2726 btrfs_end_transaction(trans); 2727 out_unlock: 2728 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2729 &cached); 2730 iput(inode); 2731 return ret; 2732 } 2733 2734 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) 2735 { 2736 struct old_sa_defrag_extent *old, *tmp; 2737 2738 if (!new) 2739 return; 2740 2741 list_for_each_entry_safe(old, tmp, &new->head, list) { 2742 kfree(old); 2743 } 2744 kfree(new); 2745 } 2746 2747 static void relink_file_extents(struct new_sa_defrag_extent *new) 2748 { 2749 struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); 2750 struct btrfs_path *path; 2751 struct sa_defrag_extent_backref *backref; 2752 struct sa_defrag_extent_backref *prev = NULL; 2753 struct rb_node *node; 2754 int ret; 2755 2756 path = btrfs_alloc_path(); 2757 if (!path) 2758 return; 2759 2760 if (!record_extent_backrefs(path, new)) { 2761 btrfs_free_path(path); 2762 goto out; 2763 } 2764 btrfs_release_path(path); 2765 2766 while (1) { 2767 node = rb_first(&new->root); 2768 if (!node) 2769 break; 2770 rb_erase(node, &new->root); 2771 2772 backref = rb_entry(node, struct sa_defrag_extent_backref, node); 2773 2774 ret = relink_extent_backref(path, prev, backref); 2775 WARN_ON(ret < 0); 
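/*
 * Free the previous 'prev' (if any) and keep the backref we just
 * processed as the new 'prev' only when it was actually relinked
 * (ret == 1), so the next backref in the tree can be merged with it.
 */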
2776 2777 kfree(prev); 2778 2779 if (ret == 1) 2780 prev = backref; 2781 else 2782 prev = NULL; 2783 cond_resched(); 2784 } 2785 kfree(prev); 2786 2787 btrfs_free_path(path); 2788 out: 2789 free_sa_defrag_extent(new); 2790 2791 atomic_dec(&fs_info->defrag_running); 2792 wake_up(&fs_info->transaction_wait); 2793 } 2794 2795 static struct new_sa_defrag_extent * 2796 record_old_file_extents(struct inode *inode, 2797 struct btrfs_ordered_extent *ordered) 2798 { 2799 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2800 struct btrfs_root *root = BTRFS_I(inode)->root; 2801 struct btrfs_path *path; 2802 struct btrfs_key key; 2803 struct old_sa_defrag_extent *old; 2804 struct new_sa_defrag_extent *new; 2805 int ret; 2806 2807 new = kmalloc(sizeof(*new), GFP_NOFS); 2808 if (!new) 2809 return NULL; 2810 2811 new->inode = inode; 2812 new->file_pos = ordered->file_offset; 2813 new->len = ordered->len; 2814 new->bytenr = ordered->start; 2815 new->disk_len = ordered->disk_len; 2816 new->compress_type = ordered->compress_type; 2817 new->root = RB_ROOT; 2818 INIT_LIST_HEAD(&new->head); 2819 2820 path = btrfs_alloc_path(); 2821 if (!path) 2822 goto out_kfree; 2823 2824 key.objectid = btrfs_ino(BTRFS_I(inode)); 2825 key.type = BTRFS_EXTENT_DATA_KEY; 2826 key.offset = new->file_pos; 2827 2828 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2829 if (ret < 0) 2830 goto out_free_path; 2831 if (ret > 0 && path->slots[0] > 0) 2832 path->slots[0]--; 2833 2834 /* find out all the old extents for the file range */ 2835 while (1) { 2836 struct btrfs_file_extent_item *extent; 2837 struct extent_buffer *l; 2838 int slot; 2839 u64 num_bytes; 2840 u64 offset; 2841 u64 end; 2842 u64 disk_bytenr; 2843 u64 extent_offset; 2844 2845 l = path->nodes[0]; 2846 slot = path->slots[0]; 2847 2848 if (slot >= btrfs_header_nritems(l)) { 2849 ret = btrfs_next_leaf(root, path); 2850 if (ret < 0) 2851 goto out_free_path; 2852 else if (ret > 0) 2853 break; 2854 continue; 2855 } 2856 2857 btrfs_item_key_to_cpu(l, &key, slot); 2858 2859 if (key.objectid != btrfs_ino(BTRFS_I(inode))) 2860 break; 2861 if (key.type != BTRFS_EXTENT_DATA_KEY) 2862 break; 2863 if (key.offset >= new->file_pos + new->len) 2864 break; 2865 2866 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); 2867 2868 num_bytes = btrfs_file_extent_num_bytes(l, extent); 2869 if (key.offset + num_bytes < new->file_pos) 2870 goto next; 2871 2872 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); 2873 if (!disk_bytenr) 2874 goto next; 2875 2876 extent_offset = btrfs_file_extent_offset(l, extent); 2877 2878 old = kmalloc(sizeof(*old), GFP_NOFS); 2879 if (!old) 2880 goto out_free_path; 2881 2882 offset = max(new->file_pos, key.offset); 2883 end = min(new->file_pos + new->len, key.offset + num_bytes); 2884 2885 old->bytenr = disk_bytenr; 2886 old->extent_offset = extent_offset; 2887 old->offset = offset - key.offset; 2888 old->len = end - offset; 2889 old->new = new; 2890 old->count = 0; 2891 list_add_tail(&old->list, &new->head); 2892 next: 2893 path->slots[0]++; 2894 cond_resched(); 2895 } 2896 2897 btrfs_free_path(path); 2898 atomic_inc(&fs_info->defrag_running); 2899 2900 return new; 2901 2902 out_free_path: 2903 btrfs_free_path(path); 2904 out_kfree: 2905 free_sa_defrag_extent(new); 2906 return NULL; 2907 } 2908 2909 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, 2910 u64 start, u64 len) 2911 { 2912 struct btrfs_block_group_cache *cache; 2913 2914 cache = btrfs_lookup_block_group(fs_info, start); 2915 ASSERT(cache); 2916 2917 
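/*
 * Drop the block group's delalloc_bytes accounting for this range;
 * btrfs_finish_ordered_io() below calls this once the file extent item
 * covering the range has been inserted into the fs tree.
 */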
spin_lock(&cache->lock); 2918 cache->delalloc_bytes -= len; 2919 spin_unlock(&cache->lock); 2920 2921 btrfs_put_block_group(cache); 2922 } 2923 2924 /* as ordered data IO finishes, this gets called so we can finish 2925 * an ordered extent if the range of bytes in the file it covers are 2926 * fully written. 2927 */ 2928 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2929 { 2930 struct inode *inode = ordered_extent->inode; 2931 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2932 struct btrfs_root *root = BTRFS_I(inode)->root; 2933 struct btrfs_trans_handle *trans = NULL; 2934 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2935 struct extent_state *cached_state = NULL; 2936 struct new_sa_defrag_extent *new = NULL; 2937 int compress_type = 0; 2938 int ret = 0; 2939 u64 logical_len = ordered_extent->len; 2940 bool nolock; 2941 bool truncated = false; 2942 bool range_locked = false; 2943 bool clear_new_delalloc_bytes = false; 2944 bool clear_reserved_extent = true; 2945 2946 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2947 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 2948 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) 2949 clear_new_delalloc_bytes = true; 2950 2951 nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); 2952 2953 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2954 ret = -EIO; 2955 goto out; 2956 } 2957 2958 btrfs_free_io_failure_record(BTRFS_I(inode), 2959 ordered_extent->file_offset, 2960 ordered_extent->file_offset + 2961 ordered_extent->len - 1); 2962 2963 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2964 truncated = true; 2965 logical_len = ordered_extent->truncated_len; 2966 /* Truncated the entire extent, don't bother adding */ 2967 if (!logical_len) 2968 goto out; 2969 } 2970 2971 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2972 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2973 2974 /* 2975 * For mwrite(mmap + memset to write) case, we still reserve 2976 * space for NOCOW range. 
2977 * As NOCOW won't cause a new delayed ref, just free the space 2978 */ 2979 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, 2980 ordered_extent->len); 2981 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2982 if (nolock) 2983 trans = btrfs_join_transaction_nolock(root); 2984 else 2985 trans = btrfs_join_transaction(root); 2986 if (IS_ERR(trans)) { 2987 ret = PTR_ERR(trans); 2988 trans = NULL; 2989 goto out; 2990 } 2991 trans->block_rsv = &BTRFS_I(inode)->block_rsv; 2992 ret = btrfs_update_inode_fallback(trans, root, inode); 2993 if (ret) /* -ENOMEM or corruption */ 2994 btrfs_abort_transaction(trans, ret); 2995 goto out; 2996 } 2997 2998 range_locked = true; 2999 lock_extent_bits(io_tree, ordered_extent->file_offset, 3000 ordered_extent->file_offset + ordered_extent->len - 1, 3001 &cached_state); 3002 3003 ret = test_range_bit(io_tree, ordered_extent->file_offset, 3004 ordered_extent->file_offset + ordered_extent->len - 1, 3005 EXTENT_DEFRAG, 0, cached_state); 3006 if (ret) { 3007 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 3008 if (0 && last_snapshot >= BTRFS_I(inode)->generation) 3009 /* the inode is shared */ 3010 new = record_old_file_extents(inode, ordered_extent); 3011 3012 clear_extent_bit(io_tree, ordered_extent->file_offset, 3013 ordered_extent->file_offset + ordered_extent->len - 1, 3014 EXTENT_DEFRAG, 0, 0, &cached_state); 3015 } 3016 3017 if (nolock) 3018 trans = btrfs_join_transaction_nolock(root); 3019 else 3020 trans = btrfs_join_transaction(root); 3021 if (IS_ERR(trans)) { 3022 ret = PTR_ERR(trans); 3023 trans = NULL; 3024 goto out; 3025 } 3026 3027 trans->block_rsv = &BTRFS_I(inode)->block_rsv; 3028 3029 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 3030 compress_type = ordered_extent->compress_type; 3031 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 3032 BUG_ON(compress_type); 3033 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, 3034 ordered_extent->len); 3035 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), 3036 ordered_extent->file_offset, 3037 ordered_extent->file_offset + 3038 logical_len); 3039 } else { 3040 BUG_ON(root == fs_info->tree_root); 3041 ret = insert_reserved_file_extent(trans, inode, 3042 ordered_extent->file_offset, 3043 ordered_extent->start, 3044 ordered_extent->disk_len, 3045 logical_len, logical_len, 3046 compress_type, 0, 0, 3047 BTRFS_FILE_EXTENT_REG); 3048 if (!ret) { 3049 clear_reserved_extent = false; 3050 btrfs_release_delalloc_bytes(fs_info, 3051 ordered_extent->start, 3052 ordered_extent->disk_len); 3053 } 3054 } 3055 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 3056 ordered_extent->file_offset, ordered_extent->len, 3057 trans->transid); 3058 if (ret < 0) { 3059 btrfs_abort_transaction(trans, ret); 3060 goto out; 3061 } 3062 3063 ret = add_pending_csums(trans, inode, &ordered_extent->list); 3064 if (ret) { 3065 btrfs_abort_transaction(trans, ret); 3066 goto out; 3067 } 3068 3069 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 3070 ret = btrfs_update_inode_fallback(trans, root, inode); 3071 if (ret) { /* -ENOMEM or corruption */ 3072 btrfs_abort_transaction(trans, ret); 3073 goto out; 3074 } 3075 ret = 0; 3076 out: 3077 if (range_locked || clear_new_delalloc_bytes) { 3078 unsigned int clear_bits = 0; 3079 3080 if (range_locked) 3081 clear_bits |= EXTENT_LOCKED; 3082 if (clear_new_delalloc_bytes) 3083 clear_bits |= EXTENT_DELALLOC_NEW; 3084 clear_extent_bit(&BTRFS_I(inode)->io_tree, 3085 ordered_extent->file_offset, 3086 
ordered_extent->file_offset + 3087 ordered_extent->len - 1, 3088 clear_bits, 3089 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 3090 0, &cached_state); 3091 } 3092 3093 if (trans) 3094 btrfs_end_transaction(trans); 3095 3096 if (ret || truncated) { 3097 u64 start, end; 3098 3099 if (truncated) 3100 start = ordered_extent->file_offset + logical_len; 3101 else 3102 start = ordered_extent->file_offset; 3103 end = ordered_extent->file_offset + ordered_extent->len - 1; 3104 clear_extent_uptodate(io_tree, start, end, NULL); 3105 3106 /* Drop the cache for the part of the extent we didn't write. */ 3107 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3108 3109 /* 3110 * If the ordered extent had an IOERR or something else went 3111 * wrong we need to return the space for this ordered extent 3112 * back to the allocator. We only free the extent in the 3113 * truncated case if we didn't write out the extent at all. 3114 * 3115 * If we made it past insert_reserved_file_extent before we 3116 * errored out then we don't need to do this as the accounting 3117 * has already been done. 3118 */ 3119 if ((ret || !logical_len) && 3120 clear_reserved_extent && 3121 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3122 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 3123 btrfs_free_reserved_extent(fs_info, 3124 ordered_extent->start, 3125 ordered_extent->disk_len, 1); 3126 } 3127 3128 3129 /* 3130 * This needs to be done to make sure anybody waiting knows we are done 3131 * updating everything for this ordered extent. 3132 */ 3133 btrfs_remove_ordered_extent(inode, ordered_extent); 3134 3135 /* for snapshot-aware defrag */ 3136 if (new) { 3137 if (ret) { 3138 free_sa_defrag_extent(new); 3139 atomic_dec(&fs_info->defrag_running); 3140 } else { 3141 relink_file_extents(new); 3142 } 3143 } 3144 3145 /* once for us */ 3146 btrfs_put_ordered_extent(ordered_extent); 3147 /* once for the tree */ 3148 btrfs_put_ordered_extent(ordered_extent); 3149 3150 /* Try to release some metadata so we don't get an OOM but don't wait */ 3151 btrfs_btree_balance_dirty_nodelay(fs_info); 3152 3153 return ret; 3154 } 3155 3156 static void finish_ordered_fn(struct btrfs_work *work) 3157 { 3158 struct btrfs_ordered_extent *ordered_extent; 3159 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 3160 btrfs_finish_ordered_io(ordered_extent); 3161 } 3162 3163 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 3164 struct extent_state *state, int uptodate) 3165 { 3166 struct inode *inode = page->mapping->host; 3167 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3168 struct btrfs_ordered_extent *ordered_extent = NULL; 3169 struct btrfs_workqueue *wq; 3170 btrfs_work_func_t func; 3171 3172 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 3173 3174 ClearPagePrivate2(page); 3175 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 3176 end - start + 1, uptodate)) 3177 return; 3178 3179 if (btrfs_is_free_space_inode(BTRFS_I(inode))) { 3180 wq = fs_info->endio_freespace_worker; 3181 func = btrfs_freespace_write_helper; 3182 } else { 3183 wq = fs_info->endio_write_workers; 3184 func = btrfs_endio_write_helper; 3185 } 3186 3187 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, 3188 NULL); 3189 btrfs_queue_work(wq, &ordered_extent->work); 3190 } 3191 3192 static int __readpage_endio_check(struct inode *inode, 3193 struct btrfs_io_bio *io_bio, 3194 int icsum, struct page *page, 3195 int pgoff, u64 start, size_t len) 3196 { 3197 
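/*
 * Verify the data checksum for one block of the read: the expected value
 * comes from the io_bio->csum array (indexed by icsum), the actual value
 * is computed over the page range with btrfs_csum_data().  On a mismatch
 * the range is filled with a non-zero pattern and -EIO is returned so the
 * code in extent_io.c can go look for a good mirror.
 */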
char *kaddr; 3198 u32 csum_expected; 3199 u32 csum = ~(u32)0; 3200 3201 csum_expected = *(((u32 *)io_bio->csum) + icsum); 3202 3203 kaddr = kmap_atomic(page); 3204 csum = btrfs_csum_data(kaddr + pgoff, csum, len); 3205 btrfs_csum_final(csum, (u8 *)&csum); 3206 if (csum != csum_expected) 3207 goto zeroit; 3208 3209 kunmap_atomic(kaddr); 3210 return 0; 3211 zeroit: 3212 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, 3213 io_bio->mirror_num); 3214 memset(kaddr + pgoff, 1, len); 3215 flush_dcache_page(page); 3216 kunmap_atomic(kaddr); 3217 return -EIO; 3218 } 3219 3220 /* 3221 * when reads are done, we need to check csums to verify the data is correct 3222 * if there's a match, we allow the bio to finish. If not, the code in 3223 * extent_io.c will try to find good copies for us. 3224 */ 3225 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 3226 u64 phy_offset, struct page *page, 3227 u64 start, u64 end, int mirror) 3228 { 3229 size_t offset = start - page_offset(page); 3230 struct inode *inode = page->mapping->host; 3231 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3232 struct btrfs_root *root = BTRFS_I(inode)->root; 3233 3234 if (PageChecked(page)) { 3235 ClearPageChecked(page); 3236 return 0; 3237 } 3238 3239 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3240 return 0; 3241 3242 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 3243 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 3244 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); 3245 return 0; 3246 } 3247 3248 phy_offset >>= inode->i_sb->s_blocksize_bits; 3249 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, 3250 start, (size_t)(end - start + 1)); 3251 } 3252 3253 /* 3254 * btrfs_add_delayed_iput - perform a delayed iput on @inode 3255 * 3256 * @inode: The inode we want to perform iput on 3257 * 3258 * This function uses the generic vfs_inode::i_count to track whether we should 3259 * just decrement it (in case it's > 1) or if this is the last iput then link 3260 * the inode to the delayed iput machinery. Delayed iputs are processed at 3261 * transaction commit time/superblock commit/cleaner kthread. 3262 */ 3263 void btrfs_add_delayed_iput(struct inode *inode) 3264 { 3265 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3266 struct btrfs_inode *binode = BTRFS_I(inode); 3267 3268 if (atomic_add_unless(&inode->i_count, -1, 1)) 3269 return; 3270 3271 spin_lock(&fs_info->delayed_iput_lock); 3272 ASSERT(list_empty(&binode->delayed_iput)); 3273 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3274 spin_unlock(&fs_info->delayed_iput_lock); 3275 } 3276 3277 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3278 { 3279 3280 spin_lock(&fs_info->delayed_iput_lock); 3281 while (!list_empty(&fs_info->delayed_iputs)) { 3282 struct btrfs_inode *inode; 3283 3284 inode = list_first_entry(&fs_info->delayed_iputs, 3285 struct btrfs_inode, delayed_iput); 3286 list_del_init(&inode->delayed_iput); 3287 spin_unlock(&fs_info->delayed_iput_lock); 3288 iput(&inode->vfs_inode); 3289 spin_lock(&fs_info->delayed_iput_lock); 3290 } 3291 spin_unlock(&fs_info->delayed_iput_lock); 3292 } 3293 3294 /* 3295 * This creates an orphan entry for the given inode in case something goes wrong 3296 * in the middle of an unlink. 
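 *
 * An orphan item that already exists (-EEXIST) is not treated as an
 * error here; any other failure aborts the transaction.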
3297 */ 3298 int btrfs_orphan_add(struct btrfs_trans_handle *trans, 3299 struct btrfs_inode *inode) 3300 { 3301 int ret; 3302 3303 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); 3304 if (ret && ret != -EEXIST) { 3305 btrfs_abort_transaction(trans, ret); 3306 return ret; 3307 } 3308 3309 return 0; 3310 } 3311 3312 /* 3313 * We have done the delete so we can go ahead and remove the orphan item for 3314 * this particular inode. 3315 */ 3316 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3317 struct btrfs_inode *inode) 3318 { 3319 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); 3320 } 3321 3322 /* 3323 * this cleans up any orphans that may be left on the list from the last use 3324 * of this root. 3325 */ 3326 int btrfs_orphan_cleanup(struct btrfs_root *root) 3327 { 3328 struct btrfs_fs_info *fs_info = root->fs_info; 3329 struct btrfs_path *path; 3330 struct extent_buffer *leaf; 3331 struct btrfs_key key, found_key; 3332 struct btrfs_trans_handle *trans; 3333 struct inode *inode; 3334 u64 last_objectid = 0; 3335 int ret = 0, nr_unlink = 0; 3336 3337 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3338 return 0; 3339 3340 path = btrfs_alloc_path(); 3341 if (!path) { 3342 ret = -ENOMEM; 3343 goto out; 3344 } 3345 path->reada = READA_BACK; 3346 3347 key.objectid = BTRFS_ORPHAN_OBJECTID; 3348 key.type = BTRFS_ORPHAN_ITEM_KEY; 3349 key.offset = (u64)-1; 3350 3351 while (1) { 3352 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3353 if (ret < 0) 3354 goto out; 3355 3356 /* 3357 * if ret == 0 means we found what we were searching for, which 3358 * is weird, but possible, so only screw with path if we didn't 3359 * find the key and see if we have stuff that matches 3360 */ 3361 if (ret > 0) { 3362 ret = 0; 3363 if (path->slots[0] == 0) 3364 break; 3365 path->slots[0]--; 3366 } 3367 3368 /* pull out the item */ 3369 leaf = path->nodes[0]; 3370 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3371 3372 /* make sure the item matches what we want */ 3373 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3374 break; 3375 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3376 break; 3377 3378 /* release the path since we're done with it */ 3379 btrfs_release_path(path); 3380 3381 /* 3382 * this is where we are basically btrfs_lookup, without the 3383 * crossing root thing. we store the inode number in the 3384 * offset of the orphan item. 3385 */ 3386 3387 if (found_key.offset == last_objectid) { 3388 btrfs_err(fs_info, 3389 "Error removing orphan entry, stopping orphan cleanup"); 3390 ret = -EINVAL; 3391 goto out; 3392 } 3393 3394 last_objectid = found_key.offset; 3395 3396 found_key.objectid = found_key.offset; 3397 found_key.type = BTRFS_INODE_ITEM_KEY; 3398 found_key.offset = 0; 3399 inode = btrfs_iget(fs_info->sb, &found_key, root, NULL); 3400 ret = PTR_ERR_OR_ZERO(inode); 3401 if (ret && ret != -ENOENT) 3402 goto out; 3403 3404 if (ret == -ENOENT && root == fs_info->tree_root) { 3405 struct btrfs_root *dead_root; 3406 struct btrfs_fs_info *fs_info = root->fs_info; 3407 int is_dead_root = 0; 3408 3409 /* 3410 * this is an orphan in the tree root. Currently these 3411 * could come from 2 sources: 3412 * a) a snapshot deletion in progress 3413 * b) a free space cache inode 3414 * We need to distinguish those two, as the snapshot 3415 * orphan must not get deleted. 
3416 * find_dead_roots already ran before us, so if this 3417 * is a snapshot deletion, we should find the root 3418 * in the dead_roots list 3419 */ 3420 spin_lock(&fs_info->trans_lock); 3421 list_for_each_entry(dead_root, &fs_info->dead_roots, 3422 root_list) { 3423 if (dead_root->root_key.objectid == 3424 found_key.objectid) { 3425 is_dead_root = 1; 3426 break; 3427 } 3428 } 3429 spin_unlock(&fs_info->trans_lock); 3430 if (is_dead_root) { 3431 /* prevent this orphan from being found again */ 3432 key.offset = found_key.objectid - 1; 3433 continue; 3434 } 3435 3436 } 3437 3438 /* 3439 * If we have an inode with links, there are a couple of 3440 * possibilities. Old kernels (before v3.12) used to create an 3441 * orphan item for truncate indicating that there were possibly 3442 * extent items past i_size that needed to be deleted. In v3.12, 3443 * truncate was changed to update i_size in sync with the extent 3444 * items, but the (useless) orphan item was still created. Since 3445 * v4.18, we don't create the orphan item for truncate at all. 3446 * 3447 * So, this item could mean that we need to do a truncate, but 3448 * only if this filesystem was last used on a pre-v3.12 kernel 3449 * and was not cleanly unmounted. The odds of that are quite 3450 * slim, and it's a pain to do the truncate now, so just delete 3451 * the orphan item. 3452 * 3453 * It's also possible that this orphan item was supposed to be 3454 * deleted but wasn't. The inode number may have been reused, 3455 * but either way, we can delete the orphan item. 3456 */ 3457 if (ret == -ENOENT || inode->i_nlink) { 3458 if (!ret) 3459 iput(inode); 3460 trans = btrfs_start_transaction(root, 1); 3461 if (IS_ERR(trans)) { 3462 ret = PTR_ERR(trans); 3463 goto out; 3464 } 3465 btrfs_debug(fs_info, "auto deleting %Lu", 3466 found_key.objectid); 3467 ret = btrfs_del_orphan_item(trans, root, 3468 found_key.objectid); 3469 btrfs_end_transaction(trans); 3470 if (ret) 3471 goto out; 3472 continue; 3473 } 3474 3475 nr_unlink++; 3476 3477 /* this will do delete_inode and everything for us */ 3478 iput(inode); 3479 } 3480 /* release the path since we're done with it */ 3481 btrfs_release_path(path); 3482 3483 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3484 3485 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3486 trans = btrfs_join_transaction(root); 3487 if (!IS_ERR(trans)) 3488 btrfs_end_transaction(trans); 3489 } 3490 3491 if (nr_unlink) 3492 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); 3493 3494 out: 3495 if (ret) 3496 btrfs_err(fs_info, "could not do orphan cleanup %d", ret); 3497 btrfs_free_path(path); 3498 return ret; 3499 } 3500 3501 /* 3502 * very simple check to peek ahead in the leaf looking for xattrs. If we 3503 * don't find any xattrs, we know there can't be any acls. 
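 *
 * Returns 1 if the inode may have ACLs (or the scan was inconclusive)
 * and 0 if it definitely has none, in which case the caller can use
 * cache_no_acl().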
3504 * 3505 * slot is the slot the inode is in, objectid is the objectid of the inode 3506 */ 3507 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3508 int slot, u64 objectid, 3509 int *first_xattr_slot) 3510 { 3511 u32 nritems = btrfs_header_nritems(leaf); 3512 struct btrfs_key found_key; 3513 static u64 xattr_access = 0; 3514 static u64 xattr_default = 0; 3515 int scanned = 0; 3516 3517 if (!xattr_access) { 3518 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3519 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3520 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3521 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3522 } 3523 3524 slot++; 3525 *first_xattr_slot = -1; 3526 while (slot < nritems) { 3527 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3528 3529 /* we found a different objectid, there must not be acls */ 3530 if (found_key.objectid != objectid) 3531 return 0; 3532 3533 /* we found an xattr, assume we've got an acl */ 3534 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3535 if (*first_xattr_slot == -1) 3536 *first_xattr_slot = slot; 3537 if (found_key.offset == xattr_access || 3538 found_key.offset == xattr_default) 3539 return 1; 3540 } 3541 3542 /* 3543 * we found a key greater than an xattr key, there can't 3544 * be any acls later on 3545 */ 3546 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3547 return 0; 3548 3549 slot++; 3550 scanned++; 3551 3552 /* 3553 * it goes inode, inode backrefs, xattrs, extents, 3554 * so if there are a ton of hard links to an inode there can 3555 * be a lot of backrefs. Don't waste time searching too hard, 3556 * this is just an optimization 3557 */ 3558 if (scanned >= 8) 3559 break; 3560 } 3561 /* we hit the end of the leaf before we found an xattr or 3562 * something larger than an xattr. 
We have to assume the inode 3563 * has acls 3564 */ 3565 if (*first_xattr_slot == -1) 3566 *first_xattr_slot = slot; 3567 return 1; 3568 } 3569 3570 /* 3571 * read an inode from the btree into the in-memory inode 3572 */ 3573 static int btrfs_read_locked_inode(struct inode *inode) 3574 { 3575 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3576 struct btrfs_path *path; 3577 struct extent_buffer *leaf; 3578 struct btrfs_inode_item *inode_item; 3579 struct btrfs_root *root = BTRFS_I(inode)->root; 3580 struct btrfs_key location; 3581 unsigned long ptr; 3582 int maybe_acls; 3583 u32 rdev; 3584 int ret; 3585 bool filled = false; 3586 int first_xattr_slot; 3587 3588 ret = btrfs_fill_inode(inode, &rdev); 3589 if (!ret) 3590 filled = true; 3591 3592 path = btrfs_alloc_path(); 3593 if (!path) 3594 return -ENOMEM; 3595 3596 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3597 3598 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3599 if (ret) { 3600 btrfs_free_path(path); 3601 return ret; 3602 } 3603 3604 leaf = path->nodes[0]; 3605 3606 if (filled) 3607 goto cache_index; 3608 3609 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3610 struct btrfs_inode_item); 3611 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3612 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3613 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3614 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3615 btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); 3616 3617 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3618 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3619 3620 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3621 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3622 3623 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3624 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3625 3626 BTRFS_I(inode)->i_otime.tv_sec = 3627 btrfs_timespec_sec(leaf, &inode_item->otime); 3628 BTRFS_I(inode)->i_otime.tv_nsec = 3629 btrfs_timespec_nsec(leaf, &inode_item->otime); 3630 3631 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3632 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3633 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3634 3635 inode_set_iversion_queried(inode, 3636 btrfs_inode_sequence(leaf, inode_item)); 3637 inode->i_generation = BTRFS_I(inode)->generation; 3638 inode->i_rdev = 0; 3639 rdev = btrfs_inode_rdev(leaf, inode_item); 3640 3641 BTRFS_I(inode)->index_cnt = (u64)-1; 3642 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3643 3644 cache_index: 3645 /* 3646 * If we were modified in the current generation and evicted from memory 3647 * and then re-read we need to do a full sync since we don't have any 3648 * idea about which extents were modified before we were evicted from 3649 * cache. 3650 * 3651 * This is required for both inode re-read from disk and delayed inode 3652 * in delayed_nodes_tree. 3653 */ 3654 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3655 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3656 &BTRFS_I(inode)->runtime_flags); 3657 3658 /* 3659 * We don't persist the id of the transaction where an unlink operation 3660 * against the inode was last made. 
So here we assume the inode might 3661 * have been evicted, and therefore the exact value of last_unlink_trans 3662 * lost, and set it to last_trans to avoid metadata inconsistencies 3663 * between the inode and its parent if the inode is fsync'ed and the log 3664 * replayed. For example, in the scenario: 3665 * 3666 * touch mydir/foo 3667 * ln mydir/foo mydir/bar 3668 * sync 3669 * unlink mydir/bar 3670 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3671 * xfs_io -c fsync mydir/foo 3672 * <power failure> 3673 * mount fs, triggers fsync log replay 3674 * 3675 * We must make sure that when we fsync our inode foo we also log its 3676 * parent inode, otherwise after log replay the parent still has the 3677 * dentry with the "bar" name but our inode foo has a link count of 1 3678 * and doesn't have an inode ref with the name "bar" anymore. 3679 * 3680 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3681 * but it guarantees correctness at the expense of occasional full 3682 * transaction commits on fsync if our inode is a directory, or if our 3683 * inode is not a directory, logging its parent unnecessarily. 3684 */ 3685 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3686 3687 path->slots[0]++; 3688 if (inode->i_nlink != 1 || 3689 path->slots[0] >= btrfs_header_nritems(leaf)) 3690 goto cache_acl; 3691 3692 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3693 if (location.objectid != btrfs_ino(BTRFS_I(inode))) 3694 goto cache_acl; 3695 3696 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3697 if (location.type == BTRFS_INODE_REF_KEY) { 3698 struct btrfs_inode_ref *ref; 3699 3700 ref = (struct btrfs_inode_ref *)ptr; 3701 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3702 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3703 struct btrfs_inode_extref *extref; 3704 3705 extref = (struct btrfs_inode_extref *)ptr; 3706 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3707 extref); 3708 } 3709 cache_acl: 3710 /* 3711 * try to precache a NULL acl entry for files that don't have 3712 * any xattrs or acls 3713 */ 3714 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3715 btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); 3716 if (first_xattr_slot != -1) { 3717 path->slots[0] = first_xattr_slot; 3718 ret = btrfs_load_inode_props(inode, path); 3719 if (ret) 3720 btrfs_err(fs_info, 3721 "error loading props for ino %llu (root %llu): %d", 3722 btrfs_ino(BTRFS_I(inode)), 3723 root->root_key.objectid, ret); 3724 } 3725 btrfs_free_path(path); 3726 3727 if (!maybe_acls) 3728 cache_no_acl(inode); 3729 3730 switch (inode->i_mode & S_IFMT) { 3731 case S_IFREG: 3732 inode->i_mapping->a_ops = &btrfs_aops; 3733 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3734 inode->i_fop = &btrfs_file_operations; 3735 inode->i_op = &btrfs_file_inode_operations; 3736 break; 3737 case S_IFDIR: 3738 inode->i_fop = &btrfs_dir_file_operations; 3739 inode->i_op = &btrfs_dir_inode_operations; 3740 break; 3741 case S_IFLNK: 3742 inode->i_op = &btrfs_symlink_inode_operations; 3743 inode_nohighmem(inode); 3744 inode->i_mapping->a_ops = &btrfs_aops; 3745 break; 3746 default: 3747 inode->i_op = &btrfs_special_inode_operations; 3748 init_special_inode(inode, inode->i_mode, rdev); 3749 break; 3750 } 3751 3752 btrfs_sync_inode_flags_to_i_flags(inode); 3753 return 0; 3754 } 3755 3756 /* 3757 * given a leaf and an inode, copy the inode fields into the leaf 3758 */ 3759 static void fill_inode_item(struct btrfs_trans_handle *trans, 3760 struct extent_buffer 
*leaf, 3761 struct btrfs_inode_item *item, 3762 struct inode *inode) 3763 { 3764 struct btrfs_map_token token; 3765 3766 btrfs_init_map_token(&token); 3767 3768 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3769 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3770 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, 3771 &token); 3772 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3773 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3774 3775 btrfs_set_token_timespec_sec(leaf, &item->atime, 3776 inode->i_atime.tv_sec, &token); 3777 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3778 inode->i_atime.tv_nsec, &token); 3779 3780 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3781 inode->i_mtime.tv_sec, &token); 3782 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3783 inode->i_mtime.tv_nsec, &token); 3784 3785 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3786 inode->i_ctime.tv_sec, &token); 3787 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3788 inode->i_ctime.tv_nsec, &token); 3789 3790 btrfs_set_token_timespec_sec(leaf, &item->otime, 3791 BTRFS_I(inode)->i_otime.tv_sec, &token); 3792 btrfs_set_token_timespec_nsec(leaf, &item->otime, 3793 BTRFS_I(inode)->i_otime.tv_nsec, &token); 3794 3795 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3796 &token); 3797 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3798 &token); 3799 btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode), 3800 &token); 3801 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3802 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3803 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3804 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3805 } 3806 3807 /* 3808 * copy everything in the in-memory inode into the btree. 3809 */ 3810 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3811 struct btrfs_root *root, struct inode *inode) 3812 { 3813 struct btrfs_inode_item *inode_item; 3814 struct btrfs_path *path; 3815 struct extent_buffer *leaf; 3816 int ret; 3817 3818 path = btrfs_alloc_path(); 3819 if (!path) 3820 return -ENOMEM; 3821 3822 path->leave_spinning = 1; 3823 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 3824 1); 3825 if (ret) { 3826 if (ret > 0) 3827 ret = -ENOENT; 3828 goto failed; 3829 } 3830 3831 leaf = path->nodes[0]; 3832 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3833 struct btrfs_inode_item); 3834 3835 fill_inode_item(trans, leaf, inode_item, inode); 3836 btrfs_mark_buffer_dirty(leaf); 3837 btrfs_set_inode_last_trans(trans, inode); 3838 ret = 0; 3839 failed: 3840 btrfs_free_path(path); 3841 return ret; 3842 } 3843 3844 /* 3845 * copy everything in the in-memory inode into the btree. 3846 */ 3847 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3848 struct btrfs_root *root, struct inode *inode) 3849 { 3850 struct btrfs_fs_info *fs_info = root->fs_info; 3851 int ret; 3852 3853 /* 3854 * If the inode is a free space inode, we can deadlock during commit 3855 * if we put it into the delayed code. 
3856 * 3857 * The data relocation inode should also be directly updated 3858 * without delay 3859 */ 3860 if (!btrfs_is_free_space_inode(BTRFS_I(inode)) 3861 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 3862 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 3863 btrfs_update_root_times(trans, root); 3864 3865 ret = btrfs_delayed_update_inode(trans, root, inode); 3866 if (!ret) 3867 btrfs_set_inode_last_trans(trans, inode); 3868 return ret; 3869 } 3870 3871 return btrfs_update_inode_item(trans, root, inode); 3872 } 3873 3874 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3875 struct btrfs_root *root, 3876 struct inode *inode) 3877 { 3878 int ret; 3879 3880 ret = btrfs_update_inode(trans, root, inode); 3881 if (ret == -ENOSPC) 3882 return btrfs_update_inode_item(trans, root, inode); 3883 return ret; 3884 } 3885 3886 /* 3887 * unlink helper that gets used here in inode.c and in the tree logging 3888 * recovery code. It removes a link in a directory with a given name, and 3889 * also drops the back refs from the inode to the directory. 3890 */ 3891 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3892 struct btrfs_root *root, 3893 struct btrfs_inode *dir, 3894 struct btrfs_inode *inode, 3895 const char *name, int name_len) 3896 { 3897 struct btrfs_fs_info *fs_info = root->fs_info; 3898 struct btrfs_path *path; 3899 int ret = 0; 3900 struct extent_buffer *leaf; 3901 struct btrfs_dir_item *di; 3902 struct btrfs_key key; 3903 u64 index; 3904 u64 ino = btrfs_ino(inode); 3905 u64 dir_ino = btrfs_ino(dir); 3906 3907 path = btrfs_alloc_path(); 3908 if (!path) { 3909 ret = -ENOMEM; 3910 goto out; 3911 } 3912 3913 path->leave_spinning = 1; 3914 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3915 name, name_len, -1); 3916 if (IS_ERR_OR_NULL(di)) { 3917 ret = di ? PTR_ERR(di) : -ENOENT; 3918 goto err; 3919 } 3920 leaf = path->nodes[0]; 3921 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3922 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3923 if (ret) 3924 goto err; 3925 btrfs_release_path(path); 3926 3927 /* 3928 * If we don't have a cached dir index, we have to get it by looking up 3929 * the inode ref; since we already have the inode ref at hand then, we 3930 * remove it directly and there is no need for a delayed deletion. 3931 * 3932 * But if we do have a cached dir index, there is no need to search for 3933 * the inode ref here. Since the inode ref is close to the inode item, 3934 * it is better to delay its deletion and do it when 3935 * we update the inode item.
3936 */ 3937 if (inode->dir_index) { 3938 ret = btrfs_delayed_delete_inode_ref(inode); 3939 if (!ret) { 3940 index = inode->dir_index; 3941 goto skip_backref; 3942 } 3943 } 3944 3945 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 3946 dir_ino, &index); 3947 if (ret) { 3948 btrfs_info(fs_info, 3949 "failed to delete reference to %.*s, inode %llu parent %llu", 3950 name_len, name, ino, dir_ino); 3951 btrfs_abort_transaction(trans, ret); 3952 goto err; 3953 } 3954 skip_backref: 3955 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 3956 if (ret) { 3957 btrfs_abort_transaction(trans, ret); 3958 goto err; 3959 } 3960 3961 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 3962 dir_ino); 3963 if (ret != 0 && ret != -ENOENT) { 3964 btrfs_abort_transaction(trans, ret); 3965 goto err; 3966 } 3967 3968 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, 3969 index); 3970 if (ret == -ENOENT) 3971 ret = 0; 3972 else if (ret) 3973 btrfs_abort_transaction(trans, ret); 3974 err: 3975 btrfs_free_path(path); 3976 if (ret) 3977 goto out; 3978 3979 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); 3980 inode_inc_iversion(&inode->vfs_inode); 3981 inode_inc_iversion(&dir->vfs_inode); 3982 inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = 3983 dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); 3984 ret = btrfs_update_inode(trans, root, &dir->vfs_inode); 3985 out: 3986 return ret; 3987 } 3988 3989 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3990 struct btrfs_root *root, 3991 struct btrfs_inode *dir, struct btrfs_inode *inode, 3992 const char *name, int name_len) 3993 { 3994 int ret; 3995 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 3996 if (!ret) { 3997 drop_nlink(&inode->vfs_inode); 3998 ret = btrfs_update_inode(trans, root, &inode->vfs_inode); 3999 } 4000 return ret; 4001 } 4002 4003 /* 4004 * helper to start transaction for unlink and rmdir. 4005 * 4006 * unlink and rmdir are special in btrfs, they do not always free space, so 4007 * if we cannot make our reservations the normal way try and see if there is 4008 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4009 * allow the unlink to occur. 
4010 */ 4011 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 4012 { 4013 struct btrfs_root *root = BTRFS_I(dir)->root; 4014 4015 /* 4016 * 1 for the possible orphan item 4017 * 1 for the dir item 4018 * 1 for the dir index 4019 * 1 for the inode ref 4020 * 1 for the inode 4021 */ 4022 return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); 4023 } 4024 4025 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4026 { 4027 struct btrfs_root *root = BTRFS_I(dir)->root; 4028 struct btrfs_trans_handle *trans; 4029 struct inode *inode = d_inode(dentry); 4030 int ret; 4031 4032 trans = __unlink_start_trans(dir); 4033 if (IS_ERR(trans)) 4034 return PTR_ERR(trans); 4035 4036 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4037 0); 4038 4039 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 4040 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4041 dentry->d_name.len); 4042 if (ret) 4043 goto out; 4044 4045 if (inode->i_nlink == 0) { 4046 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 4047 if (ret) 4048 goto out; 4049 } 4050 4051 out: 4052 btrfs_end_transaction(trans); 4053 btrfs_btree_balance_dirty(root->fs_info); 4054 return ret; 4055 } 4056 4057 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4058 struct inode *dir, u64 objectid, 4059 const char *name, int name_len) 4060 { 4061 struct btrfs_root *root = BTRFS_I(dir)->root; 4062 struct btrfs_path *path; 4063 struct extent_buffer *leaf; 4064 struct btrfs_dir_item *di; 4065 struct btrfs_key key; 4066 u64 index; 4067 int ret; 4068 u64 dir_ino = btrfs_ino(BTRFS_I(dir)); 4069 4070 path = btrfs_alloc_path(); 4071 if (!path) 4072 return -ENOMEM; 4073 4074 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4075 name, name_len, -1); 4076 if (IS_ERR_OR_NULL(di)) { 4077 ret = di ? PTR_ERR(di) : -ENOENT; 4078 goto out; 4079 } 4080 4081 leaf = path->nodes[0]; 4082 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4083 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4084 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4085 if (ret) { 4086 btrfs_abort_transaction(trans, ret); 4087 goto out; 4088 } 4089 btrfs_release_path(path); 4090 4091 ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, 4092 dir_ino, &index, name, name_len); 4093 if (ret < 0) { 4094 if (ret != -ENOENT) { 4095 btrfs_abort_transaction(trans, ret); 4096 goto out; 4097 } 4098 di = btrfs_search_dir_index_item(root, path, dir_ino, 4099 name, name_len); 4100 if (IS_ERR_OR_NULL(di)) { 4101 if (!di) 4102 ret = -ENOENT; 4103 else 4104 ret = PTR_ERR(di); 4105 btrfs_abort_transaction(trans, ret); 4106 goto out; 4107 } 4108 4109 leaf = path->nodes[0]; 4110 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4111 index = key.offset; 4112 } 4113 btrfs_release_path(path); 4114 4115 ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); 4116 if (ret) { 4117 btrfs_abort_transaction(trans, ret); 4118 goto out; 4119 } 4120 4121 btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); 4122 inode_inc_iversion(dir); 4123 dir->i_mtime = dir->i_ctime = current_time(dir); 4124 ret = btrfs_update_inode_fallback(trans, root, dir); 4125 if (ret) 4126 btrfs_abort_transaction(trans, ret); 4127 out: 4128 btrfs_free_path(path); 4129 return ret; 4130 } 4131 4132 /* 4133 * Helper to check if the subvolume references other subvolumes or if it's 4134 * default. 
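 *
 * Based on the checks below: this returns -EPERM if the subvolume is the
 * default subvolume, -ENOTEMPTY if it still contains other subvolumes,
 * another negative error on lookup failures, and 0 if it is safe to delete.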
4135 */ 4136 static noinline int may_destroy_subvol(struct btrfs_root *root) 4137 { 4138 struct btrfs_fs_info *fs_info = root->fs_info; 4139 struct btrfs_path *path; 4140 struct btrfs_dir_item *di; 4141 struct btrfs_key key; 4142 u64 dir_id; 4143 int ret; 4144 4145 path = btrfs_alloc_path(); 4146 if (!path) 4147 return -ENOMEM; 4148 4149 /* Make sure this root isn't set as the default subvol */ 4150 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4151 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4152 dir_id, "default", 7, 0); 4153 if (di && !IS_ERR(di)) { 4154 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4155 if (key.objectid == root->root_key.objectid) { 4156 ret = -EPERM; 4157 btrfs_err(fs_info, 4158 "deleting default subvolume %llu is not allowed", 4159 key.objectid); 4160 goto out; 4161 } 4162 btrfs_release_path(path); 4163 } 4164 4165 key.objectid = root->root_key.objectid; 4166 key.type = BTRFS_ROOT_REF_KEY; 4167 key.offset = (u64)-1; 4168 4169 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4170 if (ret < 0) 4171 goto out; 4172 BUG_ON(ret == 0); 4173 4174 ret = 0; 4175 if (path->slots[0] > 0) { 4176 path->slots[0]--; 4177 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4178 if (key.objectid == root->root_key.objectid && 4179 key.type == BTRFS_ROOT_REF_KEY) 4180 ret = -ENOTEMPTY; 4181 } 4182 out: 4183 btrfs_free_path(path); 4184 return ret; 4185 } 4186 4187 /* Delete all dentries for inodes belonging to the root */ 4188 static void btrfs_prune_dentries(struct btrfs_root *root) 4189 { 4190 struct btrfs_fs_info *fs_info = root->fs_info; 4191 struct rb_node *node; 4192 struct rb_node *prev; 4193 struct btrfs_inode *entry; 4194 struct inode *inode; 4195 u64 objectid = 0; 4196 4197 if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 4198 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4199 4200 spin_lock(&root->inode_lock); 4201 again: 4202 node = root->inode_tree.rb_node; 4203 prev = NULL; 4204 while (node) { 4205 prev = node; 4206 entry = rb_entry(node, struct btrfs_inode, rb_node); 4207 4208 if (objectid < btrfs_ino(entry)) 4209 node = node->rb_left; 4210 else if (objectid > btrfs_ino(entry)) 4211 node = node->rb_right; 4212 else 4213 break; 4214 } 4215 if (!node) { 4216 while (prev) { 4217 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4218 if (objectid <= btrfs_ino(entry)) { 4219 node = prev; 4220 break; 4221 } 4222 prev = rb_next(prev); 4223 } 4224 } 4225 while (node) { 4226 entry = rb_entry(node, struct btrfs_inode, rb_node); 4227 objectid = btrfs_ino(entry) + 1; 4228 inode = igrab(&entry->vfs_inode); 4229 if (inode) { 4230 spin_unlock(&root->inode_lock); 4231 if (atomic_read(&inode->i_count) > 1) 4232 d_prune_aliases(inode); 4233 /* 4234 * btrfs_drop_inode will have it removed from the inode 4235 * cache when its usage count hits zero. 
4236 */ 4237 iput(inode); 4238 cond_resched(); 4239 spin_lock(&root->inode_lock); 4240 goto again; 4241 } 4242 4243 if (cond_resched_lock(&root->inode_lock)) 4244 goto again; 4245 4246 node = rb_next(node); 4247 } 4248 spin_unlock(&root->inode_lock); 4249 } 4250 4251 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) 4252 { 4253 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 4254 struct btrfs_root *root = BTRFS_I(dir)->root; 4255 struct inode *inode = d_inode(dentry); 4256 struct btrfs_root *dest = BTRFS_I(inode)->root; 4257 struct btrfs_trans_handle *trans; 4258 struct btrfs_block_rsv block_rsv; 4259 u64 root_flags; 4260 int ret; 4261 int err; 4262 4263 /* 4264 * Don't allow to delete a subvolume with send in progress. This is 4265 * inside the inode lock so the error handling that has to drop the bit 4266 * again is not run concurrently. 4267 */ 4268 spin_lock(&dest->root_item_lock); 4269 if (dest->send_in_progress) { 4270 spin_unlock(&dest->root_item_lock); 4271 btrfs_warn(fs_info, 4272 "attempt to delete subvolume %llu during send", 4273 dest->root_key.objectid); 4274 return -EPERM; 4275 } 4276 root_flags = btrfs_root_flags(&dest->root_item); 4277 btrfs_set_root_flags(&dest->root_item, 4278 root_flags | BTRFS_ROOT_SUBVOL_DEAD); 4279 spin_unlock(&dest->root_item_lock); 4280 4281 down_write(&fs_info->subvol_sem); 4282 4283 err = may_destroy_subvol(dest); 4284 if (err) 4285 goto out_up_write; 4286 4287 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 4288 /* 4289 * One for dir inode, 4290 * two for dir entries, 4291 * two for root ref/backref. 4292 */ 4293 err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); 4294 if (err) 4295 goto out_up_write; 4296 4297 trans = btrfs_start_transaction(root, 0); 4298 if (IS_ERR(trans)) { 4299 err = PTR_ERR(trans); 4300 goto out_release; 4301 } 4302 trans->block_rsv = &block_rsv; 4303 trans->bytes_reserved = block_rsv.size; 4304 4305 btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); 4306 4307 ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid, 4308 dentry->d_name.name, dentry->d_name.len); 4309 if (ret) { 4310 err = ret; 4311 btrfs_abort_transaction(trans, ret); 4312 goto out_end_trans; 4313 } 4314 4315 btrfs_record_root_in_trans(trans, dest); 4316 4317 memset(&dest->root_item.drop_progress, 0, 4318 sizeof(dest->root_item.drop_progress)); 4319 dest->root_item.drop_level = 0; 4320 btrfs_set_root_refs(&dest->root_item, 0); 4321 4322 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { 4323 ret = btrfs_insert_orphan_item(trans, 4324 fs_info->tree_root, 4325 dest->root_key.objectid); 4326 if (ret) { 4327 btrfs_abort_transaction(trans, ret); 4328 err = ret; 4329 goto out_end_trans; 4330 } 4331 } 4332 4333 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, 4334 BTRFS_UUID_KEY_SUBVOL, 4335 dest->root_key.objectid); 4336 if (ret && ret != -ENOENT) { 4337 btrfs_abort_transaction(trans, ret); 4338 err = ret; 4339 goto out_end_trans; 4340 } 4341 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { 4342 ret = btrfs_uuid_tree_remove(trans, 4343 dest->root_item.received_uuid, 4344 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4345 dest->root_key.objectid); 4346 if (ret && ret != -ENOENT) { 4347 btrfs_abort_transaction(trans, ret); 4348 err = ret; 4349 goto out_end_trans; 4350 } 4351 } 4352 4353 out_end_trans: 4354 trans->block_rsv = NULL; 4355 trans->bytes_reserved = 0; 4356 ret = btrfs_end_transaction(trans); 4357 if (ret && !err) 4358 err = ret; 4359 inode->i_flags |= S_DEAD; 4360 
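 /*
  * Error unwinding below: out_release drops the temporary block
  * reservation, out_up_write releases subvol_sem, and if anything failed
  * the SUBVOL_DEAD flag is cleared again so the subvolume remains usable.
  */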
out_release: 4361 btrfs_subvolume_release_metadata(fs_info, &block_rsv); 4362 out_up_write: 4363 up_write(&fs_info->subvol_sem); 4364 if (err) { 4365 spin_lock(&dest->root_item_lock); 4366 root_flags = btrfs_root_flags(&dest->root_item); 4367 btrfs_set_root_flags(&dest->root_item, 4368 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); 4369 spin_unlock(&dest->root_item_lock); 4370 } else { 4371 d_invalidate(dentry); 4372 btrfs_prune_dentries(dest); 4373 ASSERT(dest->send_in_progress == 0); 4374 4375 /* the last ref */ 4376 if (dest->ino_cache_inode) { 4377 iput(dest->ino_cache_inode); 4378 dest->ino_cache_inode = NULL; 4379 } 4380 } 4381 4382 return err; 4383 } 4384 4385 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4386 { 4387 struct inode *inode = d_inode(dentry); 4388 int err = 0; 4389 struct btrfs_root *root = BTRFS_I(dir)->root; 4390 struct btrfs_trans_handle *trans; 4391 u64 last_unlink_trans; 4392 4393 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4394 return -ENOTEMPTY; 4395 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) 4396 return btrfs_delete_subvolume(dir, dentry); 4397 4398 trans = __unlink_start_trans(dir); 4399 if (IS_ERR(trans)) 4400 return PTR_ERR(trans); 4401 4402 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4403 err = btrfs_unlink_subvol(trans, dir, 4404 BTRFS_I(inode)->location.objectid, 4405 dentry->d_name.name, 4406 dentry->d_name.len); 4407 goto out; 4408 } 4409 4410 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4411 if (err) 4412 goto out; 4413 4414 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4415 4416 /* now the directory is empty */ 4417 err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 4418 BTRFS_I(d_inode(dentry)), dentry->d_name.name, 4419 dentry->d_name.len); 4420 if (!err) { 4421 btrfs_i_size_write(BTRFS_I(inode), 0); 4422 /* 4423 * Propagate the last_unlink_trans value of the deleted dir to 4424 * its parent directory. This is to prevent an unrecoverable 4425 * log tree in the case we do something like this: 4426 * 1) create dir foo 4427 * 2) create snapshot under dir foo 4428 * 3) delete the snapshot 4429 * 4) rmdir foo 4430 * 5) mkdir foo 4431 * 6) fsync foo or some file inside foo 4432 */ 4433 if (last_unlink_trans >= trans->transid) 4434 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4435 } 4436 out: 4437 btrfs_end_transaction(trans); 4438 btrfs_btree_balance_dirty(root->fs_info); 4439 4440 return err; 4441 } 4442 4443 static int truncate_space_check(struct btrfs_trans_handle *trans, 4444 struct btrfs_root *root, 4445 u64 bytes_deleted) 4446 { 4447 struct btrfs_fs_info *fs_info = root->fs_info; 4448 int ret; 4449 4450 /* 4451 * This is only used to apply pressure to the enospc system, we don't 4452 * intend to use this reservation at all. 4453 */ 4454 bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); 4455 bytes_deleted *= fs_info->nodesize; 4456 ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, 4457 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4458 if (!ret) { 4459 trace_btrfs_space_reservation(fs_info, "transaction", 4460 trans->transid, 4461 bytes_deleted, 1); 4462 trans->bytes_reserved += bytes_deleted; 4463 } 4464 return ret; 4465 4466 } 4467 4468 /* 4469 * Return this if we need to call truncate_block for the last bit of the 4470 * truncate. 4471 */ 4472 #define NEED_TRUNCATE_BLOCK 1 4473 4474 /* 4475 * this can truncate away extent items, csum items and directory items. 
4476 * It starts at a high offset and removes keys until it can't find 4477 * any higher than new_size 4478 * 4479 * csum items that cross the new i_size are truncated to the new size 4480 * as well. 4481 * 4482 * min_type is the minimum key type to truncate down to. If set to 0, this 4483 * will kill all the items on this inode, including the INODE_ITEM_KEY. 4484 */ 4485 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4486 struct btrfs_root *root, 4487 struct inode *inode, 4488 u64 new_size, u32 min_type) 4489 { 4490 struct btrfs_fs_info *fs_info = root->fs_info; 4491 struct btrfs_path *path; 4492 struct extent_buffer *leaf; 4493 struct btrfs_file_extent_item *fi; 4494 struct btrfs_key key; 4495 struct btrfs_key found_key; 4496 u64 extent_start = 0; 4497 u64 extent_num_bytes = 0; 4498 u64 extent_offset = 0; 4499 u64 item_end = 0; 4500 u64 last_size = new_size; 4501 u32 found_type = (u8)-1; 4502 int found_extent; 4503 int del_item; 4504 int pending_del_nr = 0; 4505 int pending_del_slot = 0; 4506 int extent_type = -1; 4507 int ret; 4508 u64 ino = btrfs_ino(BTRFS_I(inode)); 4509 u64 bytes_deleted = 0; 4510 bool be_nice = false; 4511 bool should_throttle = false; 4512 bool should_end = false; 4513 4514 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4515 4516 /* 4517 * for non-free space inodes and ref cows, we want to back off from 4518 * time to time 4519 */ 4520 if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && 4521 test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4522 be_nice = true; 4523 4524 path = btrfs_alloc_path(); 4525 if (!path) 4526 return -ENOMEM; 4527 path->reada = READA_BACK; 4528 4529 /* 4530 * We want to drop from the next block forward in case this new size is 4531 * not block aligned since we will be keeping the last block of the 4532 * extent just the way it is. 4533 */ 4534 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4535 root == fs_info->tree_root) 4536 btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, 4537 fs_info->sectorsize), 4538 (u64)-1, 0); 4539 4540 /* 4541 * This function is also used to drop the items in the log tree before 4542 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4543 * it is used to drop the loged items. So we shouldn't kill the delayed 4544 * items. 4545 */ 4546 if (min_type == 0 && root == BTRFS_I(inode)->root) 4547 btrfs_kill_delayed_inode_items(BTRFS_I(inode)); 4548 4549 key.objectid = ino; 4550 key.offset = (u64)-1; 4551 key.type = (u8)-1; 4552 4553 search_again: 4554 /* 4555 * with a 16K leaf size and 128MB extents, you can actually queue 4556 * up a huge file in a single leaf. 
Most of the time that 4557 * bytes_deleted is > 0, it will be huge by the time we get here 4558 */ 4559 if (be_nice && bytes_deleted > SZ_32M && 4560 btrfs_should_end_transaction(trans)) { 4561 ret = -EAGAIN; 4562 goto out; 4563 } 4564 4565 path->leave_spinning = 1; 4566 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4567 if (ret < 0) 4568 goto out; 4569 4570 if (ret > 0) { 4571 ret = 0; 4572 /* there are no items in the tree for us to truncate, we're 4573 * done 4574 */ 4575 if (path->slots[0] == 0) 4576 goto out; 4577 path->slots[0]--; 4578 } 4579 4580 while (1) { 4581 fi = NULL; 4582 leaf = path->nodes[0]; 4583 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4584 found_type = found_key.type; 4585 4586 if (found_key.objectid != ino) 4587 break; 4588 4589 if (found_type < min_type) 4590 break; 4591 4592 item_end = found_key.offset; 4593 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4594 fi = btrfs_item_ptr(leaf, path->slots[0], 4595 struct btrfs_file_extent_item); 4596 extent_type = btrfs_file_extent_type(leaf, fi); 4597 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4598 item_end += 4599 btrfs_file_extent_num_bytes(leaf, fi); 4600 4601 trace_btrfs_truncate_show_fi_regular( 4602 BTRFS_I(inode), leaf, fi, 4603 found_key.offset); 4604 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4605 item_end += btrfs_file_extent_ram_bytes(leaf, 4606 fi); 4607 4608 trace_btrfs_truncate_show_fi_inline( 4609 BTRFS_I(inode), leaf, fi, path->slots[0], 4610 found_key.offset); 4611 } 4612 item_end--; 4613 } 4614 if (found_type > min_type) { 4615 del_item = 1; 4616 } else { 4617 if (item_end < new_size) 4618 break; 4619 if (found_key.offset >= new_size) 4620 del_item = 1; 4621 else 4622 del_item = 0; 4623 } 4624 found_extent = 0; 4625 /* FIXME, shrink the extent if the ref count is only 1 */ 4626 if (found_type != BTRFS_EXTENT_DATA_KEY) 4627 goto delete; 4628 4629 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4630 u64 num_dec; 4631 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4632 if (!del_item) { 4633 u64 orig_num_bytes = 4634 btrfs_file_extent_num_bytes(leaf, fi); 4635 extent_num_bytes = ALIGN(new_size - 4636 found_key.offset, 4637 fs_info->sectorsize); 4638 btrfs_set_file_extent_num_bytes(leaf, fi, 4639 extent_num_bytes); 4640 num_dec = (orig_num_bytes - 4641 extent_num_bytes); 4642 if (test_bit(BTRFS_ROOT_REF_COWS, 4643 &root->state) && 4644 extent_start != 0) 4645 inode_sub_bytes(inode, num_dec); 4646 btrfs_mark_buffer_dirty(leaf); 4647 } else { 4648 extent_num_bytes = 4649 btrfs_file_extent_disk_num_bytes(leaf, 4650 fi); 4651 extent_offset = found_key.offset - 4652 btrfs_file_extent_offset(leaf, fi); 4653 4654 /* FIXME blocksize != 4096 */ 4655 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4656 if (extent_start != 0) { 4657 found_extent = 1; 4658 if (test_bit(BTRFS_ROOT_REF_COWS, 4659 &root->state)) 4660 inode_sub_bytes(inode, num_dec); 4661 } 4662 } 4663 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4664 /* 4665 * we can't truncate inline items that have had 4666 * special encodings 4667 */ 4668 if (!del_item && 4669 btrfs_file_extent_encryption(leaf, fi) == 0 && 4670 btrfs_file_extent_other_encoding(leaf, fi) == 0 && 4671 btrfs_file_extent_compression(leaf, fi) == 0) { 4672 u32 size = (u32)(new_size - found_key.offset); 4673 4674 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4675 size = btrfs_file_extent_calc_inline_size(size); 4676 btrfs_truncate_item(root->fs_info, path, size, 1); 4677 } else if (!del_item) { 4678 /* 4679 * We have to bail so the 
last_size is set to 4680 * just before this extent. 4681 */ 4682 ret = NEED_TRUNCATE_BLOCK; 4683 break; 4684 } 4685 4686 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4687 inode_sub_bytes(inode, item_end + 1 - new_size); 4688 } 4689 delete: 4690 if (del_item) 4691 last_size = found_key.offset; 4692 else 4693 last_size = new_size; 4694 if (del_item) { 4695 if (!pending_del_nr) { 4696 /* no pending yet, add ourselves */ 4697 pending_del_slot = path->slots[0]; 4698 pending_del_nr = 1; 4699 } else if (pending_del_nr && 4700 path->slots[0] + 1 == pending_del_slot) { 4701 /* hop on the pending chunk */ 4702 pending_del_nr++; 4703 pending_del_slot = path->slots[0]; 4704 } else { 4705 BUG(); 4706 } 4707 } else { 4708 break; 4709 } 4710 should_throttle = false; 4711 4712 if (found_extent && 4713 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4714 root == fs_info->tree_root)) { 4715 btrfs_set_path_blocking(path); 4716 bytes_deleted += extent_num_bytes; 4717 ret = btrfs_free_extent(trans, root, extent_start, 4718 extent_num_bytes, 0, 4719 btrfs_header_owner(leaf), 4720 ino, extent_offset); 4721 if (ret) { 4722 btrfs_abort_transaction(trans, ret); 4723 break; 4724 } 4725 if (btrfs_should_throttle_delayed_refs(trans)) 4726 btrfs_async_run_delayed_refs(fs_info, 4727 trans->delayed_ref_updates * 2, 4728 trans->transid, 0); 4729 if (be_nice) { 4730 if (truncate_space_check(trans, root, 4731 extent_num_bytes)) { 4732 should_end = true; 4733 } 4734 if (btrfs_should_throttle_delayed_refs(trans)) 4735 should_throttle = true; 4736 } 4737 } 4738 4739 if (found_type == BTRFS_INODE_ITEM_KEY) 4740 break; 4741 4742 if (path->slots[0] == 0 || 4743 path->slots[0] != pending_del_slot || 4744 should_throttle || should_end) { 4745 if (pending_del_nr) { 4746 ret = btrfs_del_items(trans, root, path, 4747 pending_del_slot, 4748 pending_del_nr); 4749 if (ret) { 4750 btrfs_abort_transaction(trans, ret); 4751 break; 4752 } 4753 pending_del_nr = 0; 4754 } 4755 btrfs_release_path(path); 4756 if (should_throttle) { 4757 unsigned long updates = trans->delayed_ref_updates; 4758 if (updates) { 4759 trans->delayed_ref_updates = 0; 4760 ret = btrfs_run_delayed_refs(trans, 4761 updates * 2); 4762 if (ret) 4763 break; 4764 } 4765 } 4766 /* 4767 * if we failed to refill our space rsv, bail out 4768 * and let the transaction restart 4769 */ 4770 if (should_end) { 4771 ret = -EAGAIN; 4772 break; 4773 } 4774 goto search_again; 4775 } else { 4776 path->slots[0]--; 4777 } 4778 } 4779 out: 4780 if (ret >= 0 && pending_del_nr) { 4781 int err; 4782 4783 err = btrfs_del_items(trans, root, path, pending_del_slot, 4784 pending_del_nr); 4785 if (err) { 4786 btrfs_abort_transaction(trans, err); 4787 ret = err; 4788 } 4789 } 4790 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4791 ASSERT(last_size >= new_size); 4792 if (!ret && last_size > new_size) 4793 last_size = new_size; 4794 btrfs_ordered_update_i_size(inode, last_size, NULL); 4795 } 4796 4797 btrfs_free_path(path); 4798 4799 if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { 4800 unsigned long updates = trans->delayed_ref_updates; 4801 int err; 4802 4803 if (updates) { 4804 trans->delayed_ref_updates = 0; 4805 err = btrfs_run_delayed_refs(trans, updates * 2); 4806 if (err) 4807 ret = err; 4808 } 4809 } 4810 return ret; 4811 } 4812 4813 /* 4814 * btrfs_truncate_block - read, zero a chunk and write a block 4815 * @inode - inode that we're zeroing 4816 * @from - the offset to start zeroing 4817 * @len - the length to zero, 0 to zero the entire range respective 
to the 4818 * offset 4819 * @front - zero up to the offset instead of from the offset on 4820 * 4821 * This will find the block for the "from" offset and cow the block and zero the 4822 * part we want to zero. This is used with truncate and hole punching. 4823 */ 4824 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, 4825 int front) 4826 { 4827 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4828 struct address_space *mapping = inode->i_mapping; 4829 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4830 struct btrfs_ordered_extent *ordered; 4831 struct extent_state *cached_state = NULL; 4832 struct extent_changeset *data_reserved = NULL; 4833 char *kaddr; 4834 u32 blocksize = fs_info->sectorsize; 4835 pgoff_t index = from >> PAGE_SHIFT; 4836 unsigned offset = from & (blocksize - 1); 4837 struct page *page; 4838 gfp_t mask = btrfs_alloc_write_mask(mapping); 4839 int ret = 0; 4840 u64 block_start; 4841 u64 block_end; 4842 4843 if (IS_ALIGNED(offset, blocksize) && 4844 (!len || IS_ALIGNED(len, blocksize))) 4845 goto out; 4846 4847 block_start = round_down(from, blocksize); 4848 block_end = block_start + blocksize - 1; 4849 4850 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 4851 block_start, blocksize); 4852 if (ret) 4853 goto out; 4854 4855 again: 4856 page = find_or_create_page(mapping, index, mask); 4857 if (!page) { 4858 btrfs_delalloc_release_space(inode, data_reserved, 4859 block_start, blocksize, true); 4860 btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); 4861 ret = -ENOMEM; 4862 goto out; 4863 } 4864 4865 if (!PageUptodate(page)) { 4866 ret = btrfs_readpage(NULL, page); 4867 lock_page(page); 4868 if (page->mapping != mapping) { 4869 unlock_page(page); 4870 put_page(page); 4871 goto again; 4872 } 4873 if (!PageUptodate(page)) { 4874 ret = -EIO; 4875 goto out_unlock; 4876 } 4877 } 4878 wait_on_page_writeback(page); 4879 4880 lock_extent_bits(io_tree, block_start, block_end, &cached_state); 4881 set_page_extent_mapped(page); 4882 4883 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4884 if (ordered) { 4885 unlock_extent_cached(io_tree, block_start, block_end, 4886 &cached_state); 4887 unlock_page(page); 4888 put_page(page); 4889 btrfs_start_ordered_extent(inode, ordered, 1); 4890 btrfs_put_ordered_extent(ordered); 4891 goto again; 4892 } 4893 4894 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, 4895 EXTENT_DIRTY | EXTENT_DELALLOC | 4896 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4897 0, 0, &cached_state); 4898 4899 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, 4900 &cached_state, 0); 4901 if (ret) { 4902 unlock_extent_cached(io_tree, block_start, block_end, 4903 &cached_state); 4904 goto out_unlock; 4905 } 4906 4907 if (offset != blocksize) { 4908 if (!len) 4909 len = blocksize - offset; 4910 kaddr = kmap(page); 4911 if (front) 4912 memset(kaddr + (block_start - page_offset(page)), 4913 0, offset); 4914 else 4915 memset(kaddr + (block_start - page_offset(page)) + offset, 4916 0, len); 4917 flush_dcache_page(page); 4918 kunmap(page); 4919 } 4920 ClearPageChecked(page); 4921 set_page_dirty(page); 4922 unlock_extent_cached(io_tree, block_start, block_end, &cached_state); 4923 4924 out_unlock: 4925 if (ret) 4926 btrfs_delalloc_release_space(inode, data_reserved, block_start, 4927 blocksize, true); 4928 btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); 4929 unlock_page(page); 4930 put_page(page); 4931 out: 4932 extent_changeset_free(data_reserved); 4933 return 
ret; 4934 } 4935 4936 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, 4937 u64 offset, u64 len) 4938 { 4939 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4940 struct btrfs_trans_handle *trans; 4941 int ret; 4942 4943 /* 4944 * Still need to make sure the inode looks like it's been updated so 4945 * that any holes get logged if we fsync. 4946 */ 4947 if (btrfs_fs_incompat(fs_info, NO_HOLES)) { 4948 BTRFS_I(inode)->last_trans = fs_info->generation; 4949 BTRFS_I(inode)->last_sub_trans = root->log_transid; 4950 BTRFS_I(inode)->last_log_commit = root->last_log_commit; 4951 return 0; 4952 } 4953 4954 /* 4955 * 1 - for the one we're dropping 4956 * 1 - for the one we're adding 4957 * 1 - for updating the inode. 4958 */ 4959 trans = btrfs_start_transaction(root, 3); 4960 if (IS_ERR(trans)) 4961 return PTR_ERR(trans); 4962 4963 ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); 4964 if (ret) { 4965 btrfs_abort_transaction(trans, ret); 4966 btrfs_end_transaction(trans); 4967 return ret; 4968 } 4969 4970 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), 4971 offset, 0, 0, len, 0, len, 0, 0, 0); 4972 if (ret) 4973 btrfs_abort_transaction(trans, ret); 4974 else 4975 btrfs_update_inode(trans, root, inode); 4976 btrfs_end_transaction(trans); 4977 return ret; 4978 } 4979 4980 /* 4981 * This function puts in dummy file extents for the area we're creating a hole 4982 * for. So if we are truncating this file to a larger size we need to insert 4983 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4984 * the range between oldsize and size 4985 */ 4986 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 4987 { 4988 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4989 struct btrfs_root *root = BTRFS_I(inode)->root; 4990 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4991 struct extent_map *em = NULL; 4992 struct extent_state *cached_state = NULL; 4993 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4994 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); 4995 u64 block_end = ALIGN(size, fs_info->sectorsize); 4996 u64 last_byte; 4997 u64 cur_offset; 4998 u64 hole_size; 4999 int err = 0; 5000 5001 /* 5002 * If our size started in the middle of a block we need to zero out the 5003 * rest of the block before we expand the i_size, otherwise we could 5004 * expose stale data. 
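 * For example, with a 4K block size, growing a 1000 byte file means bytes
 * 1000-4095 of that first block must be zeroed first, since they may still
 * hold stale data that would otherwise become readable between the old and
 * new i_size.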
5005 */ 5006 err = btrfs_truncate_block(inode, oldsize, 0, 0); 5007 if (err) 5008 return err; 5009 5010 if (size <= hole_start) 5011 return 0; 5012 5013 while (1) { 5014 struct btrfs_ordered_extent *ordered; 5015 5016 lock_extent_bits(io_tree, hole_start, block_end - 1, 5017 &cached_state); 5018 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, 5019 block_end - hole_start); 5020 if (!ordered) 5021 break; 5022 unlock_extent_cached(io_tree, hole_start, block_end - 1, 5023 &cached_state); 5024 btrfs_start_ordered_extent(inode, ordered, 1); 5025 btrfs_put_ordered_extent(ordered); 5026 } 5027 5028 cur_offset = hole_start; 5029 while (1) { 5030 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, 5031 block_end - cur_offset, 0); 5032 if (IS_ERR(em)) { 5033 err = PTR_ERR(em); 5034 em = NULL; 5035 break; 5036 } 5037 last_byte = min(extent_map_end(em), block_end); 5038 last_byte = ALIGN(last_byte, fs_info->sectorsize); 5039 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 5040 struct extent_map *hole_em; 5041 hole_size = last_byte - cur_offset; 5042 5043 err = maybe_insert_hole(root, inode, cur_offset, 5044 hole_size); 5045 if (err) 5046 break; 5047 btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, 5048 cur_offset + hole_size - 1, 0); 5049 hole_em = alloc_extent_map(); 5050 if (!hole_em) { 5051 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5052 &BTRFS_I(inode)->runtime_flags); 5053 goto next; 5054 } 5055 hole_em->start = cur_offset; 5056 hole_em->len = hole_size; 5057 hole_em->orig_start = cur_offset; 5058 5059 hole_em->block_start = EXTENT_MAP_HOLE; 5060 hole_em->block_len = 0; 5061 hole_em->orig_block_len = 0; 5062 hole_em->ram_bytes = hole_size; 5063 hole_em->bdev = fs_info->fs_devices->latest_bdev; 5064 hole_em->compress_type = BTRFS_COMPRESS_NONE; 5065 hole_em->generation = fs_info->generation; 5066 5067 while (1) { 5068 write_lock(&em_tree->lock); 5069 err = add_extent_mapping(em_tree, hole_em, 1); 5070 write_unlock(&em_tree->lock); 5071 if (err != -EEXIST) 5072 break; 5073 btrfs_drop_extent_cache(BTRFS_I(inode), 5074 cur_offset, 5075 cur_offset + 5076 hole_size - 1, 0); 5077 } 5078 free_extent_map(hole_em); 5079 } 5080 next: 5081 free_extent_map(em); 5082 em = NULL; 5083 cur_offset = last_byte; 5084 if (cur_offset >= block_end) 5085 break; 5086 } 5087 free_extent_map(em); 5088 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); 5089 return err; 5090 } 5091 5092 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 5093 { 5094 struct btrfs_root *root = BTRFS_I(inode)->root; 5095 struct btrfs_trans_handle *trans; 5096 loff_t oldsize = i_size_read(inode); 5097 loff_t newsize = attr->ia_size; 5098 int mask = attr->ia_valid; 5099 int ret; 5100 5101 /* 5102 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 5103 * special case where we need to update the times despite not having 5104 * these flags set. For all other operations the VFS set these flags 5105 * explicitly if it wants a timestamp update. 5106 */ 5107 if (newsize != oldsize) { 5108 inode_inc_iversion(inode); 5109 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 5110 inode->i_ctime = inode->i_mtime = 5111 current_time(inode); 5112 } 5113 5114 if (newsize > oldsize) { 5115 /* 5116 * Don't do an expanding truncate while snapshotting is ongoing. 5117 * This is to ensure the snapshot captures a fully consistent 5118 * state of this file - if the snapshot captures this expanding 5119 * truncation, it must capture all writes that happened before 5120 * this truncation. 
5121 */ 5122 btrfs_wait_for_snapshot_creation(root); 5123 ret = btrfs_cont_expand(inode, oldsize, newsize); 5124 if (ret) { 5125 btrfs_end_write_no_snapshotting(root); 5126 return ret; 5127 } 5128 5129 trans = btrfs_start_transaction(root, 1); 5130 if (IS_ERR(trans)) { 5131 btrfs_end_write_no_snapshotting(root); 5132 return PTR_ERR(trans); 5133 } 5134 5135 i_size_write(inode, newsize); 5136 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 5137 pagecache_isize_extended(inode, oldsize, newsize); 5138 ret = btrfs_update_inode(trans, root, inode); 5139 btrfs_end_write_no_snapshotting(root); 5140 btrfs_end_transaction(trans); 5141 } else { 5142 5143 /* 5144 * We're truncating a file that used to have good data down to 5145 * zero. Make sure it gets into the ordered flush list so that 5146 * any new writes get down to disk quickly. 5147 */ 5148 if (newsize == 0) 5149 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 5150 &BTRFS_I(inode)->runtime_flags); 5151 5152 truncate_setsize(inode, newsize); 5153 5154 /* Disable nonlocked read DIO to avoid the end less truncate */ 5155 btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); 5156 inode_dio_wait(inode); 5157 btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); 5158 5159 ret = btrfs_truncate(inode, newsize == oldsize); 5160 if (ret && inode->i_nlink) { 5161 int err; 5162 5163 /* 5164 * Truncate failed, so fix up the in-memory size. We 5165 * adjusted disk_i_size down as we removed extents, so 5166 * wait for disk_i_size to be stable and then update the 5167 * in-memory size to match. 5168 */ 5169 err = btrfs_wait_ordered_range(inode, 0, (u64)-1); 5170 if (err) 5171 return err; 5172 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5173 } 5174 } 5175 5176 return ret; 5177 } 5178 5179 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 5180 { 5181 struct inode *inode = d_inode(dentry); 5182 struct btrfs_root *root = BTRFS_I(inode)->root; 5183 int err; 5184 5185 if (btrfs_root_readonly(root)) 5186 return -EROFS; 5187 5188 err = setattr_prepare(dentry, attr); 5189 if (err) 5190 return err; 5191 5192 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5193 err = btrfs_setsize(inode, attr); 5194 if (err) 5195 return err; 5196 } 5197 5198 if (attr->ia_valid) { 5199 setattr_copy(inode, attr); 5200 inode_inc_iversion(inode); 5201 err = btrfs_dirty_inode(inode); 5202 5203 if (!err && attr->ia_valid & ATTR_MODE) 5204 err = posix_acl_chmod(inode, inode->i_mode); 5205 } 5206 5207 return err; 5208 } 5209 5210 /* 5211 * While truncating the inode pages during eviction, we get the VFS calling 5212 * btrfs_invalidatepage() against each page of the inode. This is slow because 5213 * the calls to btrfs_invalidatepage() result in a huge amount of calls to 5214 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting 5215 * extent_state structures over and over, wasting lots of time. 5216 * 5217 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all 5218 * those expensive operations on a per page basis and do only the ordered io 5219 * finishing, while we release here the extent_map and extent_state structures, 5220 * without the excessive merging and splitting. 
5221 */ 5222 static void evict_inode_truncate_pages(struct inode *inode) 5223 { 5224 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5225 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5226 struct rb_node *node; 5227 5228 ASSERT(inode->i_state & I_FREEING); 5229 truncate_inode_pages_final(&inode->i_data); 5230 5231 write_lock(&map_tree->lock); 5232 while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { 5233 struct extent_map *em; 5234 5235 node = rb_first_cached(&map_tree->map); 5236 em = rb_entry(node, struct extent_map, rb_node); 5237 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5238 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5239 remove_extent_mapping(map_tree, em); 5240 free_extent_map(em); 5241 if (need_resched()) { 5242 write_unlock(&map_tree->lock); 5243 cond_resched(); 5244 write_lock(&map_tree->lock); 5245 } 5246 } 5247 write_unlock(&map_tree->lock); 5248 5249 /* 5250 * Keep looping until we have no more ranges in the io tree. 5251 * We can have ongoing bios started by readpages (called from readahead) 5252 * that have their endio callback (extent_io.c:end_bio_extent_readpage) 5253 * still in progress (they unlocked the pages in the bio but did not yet 5254 * unlock the ranges in the io tree). This means some ranges can still be 5255 * locked while eviction has started, because before submitting those 5256 * bios, which are executed by a separate task (a work queue kthread), 5257 * no inode references (inode->i_count) were taken 5258 * (they would be dropped in the end io callback of each bio). 5259 * So here we effectively end up waiting for those bios and for 5260 * anyone else holding locked ranges without having bumped the inode's 5261 * reference count - if we don't wait, by the time they access the inode's 5262 * io_tree to unlock a range it may be too late, leading to a 5263 * use-after-free issue. 5264 */ 5265 spin_lock(&io_tree->lock); 5266 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5267 struct extent_state *state; 5268 struct extent_state *cached_state = NULL; 5269 u64 start; 5270 u64 end; 5271 unsigned state_flags; 5272 5273 node = rb_first(&io_tree->state); 5274 state = rb_entry(node, struct extent_state, rb_node); 5275 start = state->start; 5276 end = state->end; 5277 state_flags = state->state; 5278 spin_unlock(&io_tree->lock); 5279 5280 lock_extent_bits(io_tree, start, end, &cached_state); 5281 5282 /* 5283 * If the range still has the DELALLOC flag, the extent never reached 5284 * disk and its reserved space won't be freed by a delayed ref, so we 5285 * need to free it here. 5286 * (Refer to the comment in btrfs_invalidatepage, case 2) 5287 * 5288 * Note, end is the bytenr of the last byte, so we need + 1 here.
5289 */ 5290 if (state_flags & EXTENT_DELALLOC) 5291 btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); 5292 5293 clear_extent_bit(io_tree, start, end, 5294 EXTENT_LOCKED | EXTENT_DIRTY | 5295 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5296 EXTENT_DEFRAG, 1, 1, &cached_state); 5297 5298 cond_resched(); 5299 spin_lock(&io_tree->lock); 5300 } 5301 spin_unlock(&io_tree->lock); 5302 } 5303 5304 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, 5305 struct btrfs_block_rsv *rsv) 5306 { 5307 struct btrfs_fs_info *fs_info = root->fs_info; 5308 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5309 int failures = 0; 5310 5311 for (;;) { 5312 struct btrfs_trans_handle *trans; 5313 int ret; 5314 5315 ret = btrfs_block_rsv_refill(root, rsv, rsv->size, 5316 BTRFS_RESERVE_FLUSH_LIMIT); 5317 5318 if (ret && ++failures > 2) { 5319 btrfs_warn(fs_info, 5320 "could not allocate space for a delete; will truncate on mount"); 5321 return ERR_PTR(-ENOSPC); 5322 } 5323 5324 trans = btrfs_join_transaction(root); 5325 if (IS_ERR(trans) || !ret) 5326 return trans; 5327 5328 /* 5329 * Try to steal from the global reserve if there is space for 5330 * it. 5331 */ 5332 if (!btrfs_check_space_for_delayed_refs(trans) && 5333 !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) 5334 return trans; 5335 5336 /* If not, commit and try again. */ 5337 ret = btrfs_commit_transaction(trans); 5338 if (ret) 5339 return ERR_PTR(ret); 5340 } 5341 } 5342 5343 void btrfs_evict_inode(struct inode *inode) 5344 { 5345 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5346 struct btrfs_trans_handle *trans; 5347 struct btrfs_root *root = BTRFS_I(inode)->root; 5348 struct btrfs_block_rsv *rsv; 5349 int ret; 5350 5351 trace_btrfs_inode_evict(inode); 5352 5353 if (!root) { 5354 clear_inode(inode); 5355 return; 5356 } 5357 5358 evict_inode_truncate_pages(inode); 5359 5360 if (inode->i_nlink && 5361 ((btrfs_root_refs(&root->root_item) != 0 && 5362 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5363 btrfs_is_free_space_inode(BTRFS_I(inode)))) 5364 goto no_delete; 5365 5366 if (is_bad_inode(inode)) 5367 goto no_delete; 5368 5369 btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); 5370 5371 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 5372 goto no_delete; 5373 5374 if (inode->i_nlink > 0) { 5375 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5376 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5377 goto no_delete; 5378 } 5379 5380 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); 5381 if (ret) 5382 goto no_delete; 5383 5384 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 5385 if (!rsv) 5386 goto no_delete; 5387 rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1); 5388 rsv->failfast = 1; 5389 5390 btrfs_i_size_write(BTRFS_I(inode), 0); 5391 5392 while (1) { 5393 trans = evict_refill_and_join(root, rsv); 5394 if (IS_ERR(trans)) 5395 goto free_rsv; 5396 5397 trans->block_rsv = rsv; 5398 5399 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 5400 trans->block_rsv = &fs_info->trans_block_rsv; 5401 btrfs_end_transaction(trans); 5402 btrfs_btree_balance_dirty(fs_info); 5403 if (ret && ret != -ENOSPC && ret != -EAGAIN) 5404 goto free_rsv; 5405 else if (!ret) 5406 break; 5407 } 5408 5409 /* 5410 * Errors here aren't a big deal, it just means we leave orphan items in 5411 * the tree. They will be cleaned up on the next mount. 
If the inode 5412 * number gets reused, cleanup deletes the orphan item without doing 5413 * anything, and unlink reuses the existing orphan item. 5414 * 5415 * If it turns out that we are dropping too many of these, we might want 5416 * to add a mechanism for retrying these after a commit. 5417 */ 5418 trans = evict_refill_and_join(root, rsv); 5419 if (!IS_ERR(trans)) { 5420 trans->block_rsv = rsv; 5421 btrfs_orphan_del(trans, BTRFS_I(inode)); 5422 trans->block_rsv = &fs_info->trans_block_rsv; 5423 btrfs_end_transaction(trans); 5424 } 5425 5426 if (!(root == fs_info->tree_root || 5427 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 5428 btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); 5429 5430 free_rsv: 5431 btrfs_free_block_rsv(fs_info, rsv); 5432 no_delete: 5433 /* 5434 * If we didn't successfully delete, the orphan item will still be in 5435 * the tree and we'll retry on the next mount. Again, we might also want 5436 * to retry these periodically in the future. 5437 */ 5438 btrfs_remove_delayed_node(BTRFS_I(inode)); 5439 clear_inode(inode); 5440 } 5441 5442 /* 5443 * this returns the key found in the dir entry in the location pointer. 5444 * If no dir entries were found, returns -ENOENT. 5445 * If found a corrupted location in dir entry, returns -EUCLEAN. 5446 */ 5447 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5448 struct btrfs_key *location) 5449 { 5450 const char *name = dentry->d_name.name; 5451 int namelen = dentry->d_name.len; 5452 struct btrfs_dir_item *di; 5453 struct btrfs_path *path; 5454 struct btrfs_root *root = BTRFS_I(dir)->root; 5455 int ret = 0; 5456 5457 path = btrfs_alloc_path(); 5458 if (!path) 5459 return -ENOMEM; 5460 5461 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), 5462 name, namelen, 0); 5463 if (IS_ERR_OR_NULL(di)) { 5464 ret = di ? PTR_ERR(di) : -ENOENT; 5465 goto out; 5466 } 5467 5468 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5469 if (location->type != BTRFS_INODE_ITEM_KEY && 5470 location->type != BTRFS_ROOT_ITEM_KEY) { 5471 ret = -EUCLEAN; 5472 btrfs_warn(root->fs_info, 5473 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", 5474 __func__, name, btrfs_ino(BTRFS_I(dir)), 5475 location->objectid, location->type, location->offset); 5476 } 5477 out: 5478 btrfs_free_path(path); 5479 return ret; 5480 } 5481 5482 /* 5483 * when we hit a tree root in a directory, the btrfs part of the inode 5484 * needs to be changed to reflect the root directory of the tree root. This 5485 * is kind of like crossing a mount point. 
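 * To do that, the code below looks up the ROOT_REF item for the target
 * subvolume in the tree of tree roots and verifies that its parent dirid
 * and name match this dentry before following the reference.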
5486 */ 5487 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, 5488 struct inode *dir, 5489 struct dentry *dentry, 5490 struct btrfs_key *location, 5491 struct btrfs_root **sub_root) 5492 { 5493 struct btrfs_path *path; 5494 struct btrfs_root *new_root; 5495 struct btrfs_root_ref *ref; 5496 struct extent_buffer *leaf; 5497 struct btrfs_key key; 5498 int ret; 5499 int err = 0; 5500 5501 path = btrfs_alloc_path(); 5502 if (!path) { 5503 err = -ENOMEM; 5504 goto out; 5505 } 5506 5507 err = -ENOENT; 5508 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5509 key.type = BTRFS_ROOT_REF_KEY; 5510 key.offset = location->objectid; 5511 5512 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 5513 if (ret) { 5514 if (ret < 0) 5515 err = ret; 5516 goto out; 5517 } 5518 5519 leaf = path->nodes[0]; 5520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5521 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || 5522 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5523 goto out; 5524 5525 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5526 (unsigned long)(ref + 1), 5527 dentry->d_name.len); 5528 if (ret) 5529 goto out; 5530 5531 btrfs_release_path(path); 5532 5533 new_root = btrfs_read_fs_root_no_name(fs_info, location); 5534 if (IS_ERR(new_root)) { 5535 err = PTR_ERR(new_root); 5536 goto out; 5537 } 5538 5539 *sub_root = new_root; 5540 location->objectid = btrfs_root_dirid(&new_root->root_item); 5541 location->type = BTRFS_INODE_ITEM_KEY; 5542 location->offset = 0; 5543 err = 0; 5544 out: 5545 btrfs_free_path(path); 5546 return err; 5547 } 5548 5549 static void inode_tree_add(struct inode *inode) 5550 { 5551 struct btrfs_root *root = BTRFS_I(inode)->root; 5552 struct btrfs_inode *entry; 5553 struct rb_node **p; 5554 struct rb_node *parent; 5555 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5556 u64 ino = btrfs_ino(BTRFS_I(inode)); 5557 5558 if (inode_unhashed(inode)) 5559 return; 5560 parent = NULL; 5561 spin_lock(&root->inode_lock); 5562 p = &root->inode_tree.rb_node; 5563 while (*p) { 5564 parent = *p; 5565 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5566 5567 if (ino < btrfs_ino(entry)) 5568 p = &parent->rb_left; 5569 else if (ino > btrfs_ino(entry)) 5570 p = &parent->rb_right; 5571 else { 5572 WARN_ON(!(entry->vfs_inode.i_state & 5573 (I_WILL_FREE | I_FREEING))); 5574 rb_replace_node(parent, new, &root->inode_tree); 5575 RB_CLEAR_NODE(parent); 5576 spin_unlock(&root->inode_lock); 5577 return; 5578 } 5579 } 5580 rb_link_node(new, parent, p); 5581 rb_insert_color(new, &root->inode_tree); 5582 spin_unlock(&root->inode_lock); 5583 } 5584 5585 static void inode_tree_del(struct inode *inode) 5586 { 5587 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5588 struct btrfs_root *root = BTRFS_I(inode)->root; 5589 int empty = 0; 5590 5591 spin_lock(&root->inode_lock); 5592 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 5593 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 5594 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 5595 empty = RB_EMPTY_ROOT(&root->inode_tree); 5596 } 5597 spin_unlock(&root->inode_lock); 5598 5599 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5600 synchronize_srcu(&fs_info->subvol_srcu); 5601 spin_lock(&root->inode_lock); 5602 empty = RB_EMPTY_ROOT(&root->inode_tree); 5603 spin_unlock(&root->inode_lock); 5604 if (empty) 5605 btrfs_add_dead_root(root); 5606 } 5607 } 5608 5609 5610 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5611 { 5612 struct 
btrfs_iget_args *args = p; 5613 inode->i_ino = args->location->objectid; 5614 memcpy(&BTRFS_I(inode)->location, args->location, 5615 sizeof(*args->location)); 5616 BTRFS_I(inode)->root = args->root; 5617 return 0; 5618 } 5619 5620 static int btrfs_find_actor(struct inode *inode, void *opaque) 5621 { 5622 struct btrfs_iget_args *args = opaque; 5623 return args->location->objectid == BTRFS_I(inode)->location.objectid && 5624 args->root == BTRFS_I(inode)->root; 5625 } 5626 5627 static struct inode *btrfs_iget_locked(struct super_block *s, 5628 struct btrfs_key *location, 5629 struct btrfs_root *root) 5630 { 5631 struct inode *inode; 5632 struct btrfs_iget_args args; 5633 unsigned long hashval = btrfs_inode_hash(location->objectid, root); 5634 5635 args.location = location; 5636 args.root = root; 5637 5638 inode = iget5_locked(s, hashval, btrfs_find_actor, 5639 btrfs_init_locked_inode, 5640 (void *)&args); 5641 return inode; 5642 } 5643 5644 /* Get an inode object given its location and corresponding root. 5645 * Returns in *is_new if the inode was read from disk 5646 */ 5647 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5648 struct btrfs_root *root, int *new) 5649 { 5650 struct inode *inode; 5651 5652 inode = btrfs_iget_locked(s, location, root); 5653 if (!inode) 5654 return ERR_PTR(-ENOMEM); 5655 5656 if (inode->i_state & I_NEW) { 5657 int ret; 5658 5659 ret = btrfs_read_locked_inode(inode); 5660 if (!ret) { 5661 inode_tree_add(inode); 5662 unlock_new_inode(inode); 5663 if (new) 5664 *new = 1; 5665 } else { 5666 iget_failed(inode); 5667 /* 5668 * ret > 0 can come from btrfs_search_slot called by 5669 * btrfs_read_locked_inode, this means the inode item 5670 * was not found. 5671 */ 5672 if (ret > 0) 5673 ret = -ENOENT; 5674 inode = ERR_PTR(ret); 5675 } 5676 } 5677 5678 return inode; 5679 } 5680 5681 static struct inode *new_simple_dir(struct super_block *s, 5682 struct btrfs_key *key, 5683 struct btrfs_root *root) 5684 { 5685 struct inode *inode = new_inode(s); 5686 5687 if (!inode) 5688 return ERR_PTR(-ENOMEM); 5689 5690 BTRFS_I(inode)->root = root; 5691 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 5692 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5693 5694 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 5695 inode->i_op = &btrfs_dir_ro_inode_operations; 5696 inode->i_opflags &= ~IOP_XATTR; 5697 inode->i_fop = &simple_dir_operations; 5698 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5699 inode->i_mtime = current_time(inode); 5700 inode->i_atime = inode->i_mtime; 5701 inode->i_ctime = inode->i_mtime; 5702 BTRFS_I(inode)->i_otime = inode->i_mtime; 5703 5704 return inode; 5705 } 5706 5707 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5708 { 5709 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 5710 struct inode *inode; 5711 struct btrfs_root *root = BTRFS_I(dir)->root; 5712 struct btrfs_root *sub_root = root; 5713 struct btrfs_key location; 5714 int index; 5715 int ret = 0; 5716 5717 if (dentry->d_name.len > BTRFS_NAME_LEN) 5718 return ERR_PTR(-ENAMETOOLONG); 5719 5720 ret = btrfs_inode_by_name(dir, dentry, &location); 5721 if (ret < 0) 5722 return ERR_PTR(ret); 5723 5724 if (location.type == BTRFS_INODE_ITEM_KEY) { 5725 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 5726 return inode; 5727 } 5728 5729 index = srcu_read_lock(&fs_info->subvol_srcu); 5730 ret = fixup_tree_root_location(fs_info, dir, dentry, 5731 &location, &sub_root); 5732 if (ret < 0) { 5733 if (ret != -ENOENT) 5734 inode = 
ERR_PTR(ret); 5735 else 5736 inode = new_simple_dir(dir->i_sb, &location, sub_root); 5737 } else { 5738 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 5739 } 5740 srcu_read_unlock(&fs_info->subvol_srcu, index); 5741 5742 if (!IS_ERR(inode) && root != sub_root) { 5743 down_read(&fs_info->cleanup_work_sem); 5744 if (!sb_rdonly(inode->i_sb)) 5745 ret = btrfs_orphan_cleanup(sub_root); 5746 up_read(&fs_info->cleanup_work_sem); 5747 if (ret) { 5748 iput(inode); 5749 inode = ERR_PTR(ret); 5750 } 5751 } 5752 5753 return inode; 5754 } 5755 5756 static int btrfs_dentry_delete(const struct dentry *dentry) 5757 { 5758 struct btrfs_root *root; 5759 struct inode *inode = d_inode(dentry); 5760 5761 if (!inode && !IS_ROOT(dentry)) 5762 inode = d_inode(dentry->d_parent); 5763 5764 if (inode) { 5765 root = BTRFS_I(inode)->root; 5766 if (btrfs_root_refs(&root->root_item) == 0) 5767 return 1; 5768 5769 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5770 return 1; 5771 } 5772 return 0; 5773 } 5774 5775 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5776 unsigned int flags) 5777 { 5778 struct inode *inode = btrfs_lookup_dentry(dir, dentry); 5779 5780 if (inode == ERR_PTR(-ENOENT)) 5781 inode = NULL; 5782 return d_splice_alias(inode, dentry); 5783 } 5784 5785 unsigned char btrfs_filetype_table[] = { 5786 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 5787 }; 5788 5789 /* 5790 * All this infrastructure exists because dir_emit can fault, and we are holding 5791 * the tree lock when doing readdir. For now just allocate a buffer and copy 5792 * our information into that, and then dir_emit from the buffer. This is 5793 * similar to what NFS does, only we don't keep the buffer around in pagecache 5794 * because I'm afraid I'll mess that up. Long term we need to make filldir do 5795 * copy_to_user_inatomic so we don't have to worry about page faulting under the 5796 * tree lock. 
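* The staging buffer is a single page (filldir_buf, allocated in btrfs_opendir()); each entry is packed as a struct dir_entry immediately followed by its name bytes, and btrfs_filldir() replays the whole buffer into dir_emit() after the btree path has been released.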
5797 */ 5798 static int btrfs_opendir(struct inode *inode, struct file *file) 5799 { 5800 struct btrfs_file_private *private; 5801 5802 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); 5803 if (!private) 5804 return -ENOMEM; 5805 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); 5806 if (!private->filldir_buf) { 5807 kfree(private); 5808 return -ENOMEM; 5809 } 5810 file->private_data = private; 5811 return 0; 5812 } 5813 5814 struct dir_entry { 5815 u64 ino; 5816 u64 offset; 5817 unsigned type; 5818 int name_len; 5819 }; 5820 5821 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) 5822 { 5823 while (entries--) { 5824 struct dir_entry *entry = addr; 5825 char *name = (char *)(entry + 1); 5826 5827 ctx->pos = get_unaligned(&entry->offset); 5828 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), 5829 get_unaligned(&entry->ino), 5830 get_unaligned(&entry->type))) 5831 return 1; 5832 addr += sizeof(struct dir_entry) + 5833 get_unaligned(&entry->name_len); 5834 ctx->pos++; 5835 } 5836 return 0; 5837 } 5838 5839 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 5840 { 5841 struct inode *inode = file_inode(file); 5842 struct btrfs_root *root = BTRFS_I(inode)->root; 5843 struct btrfs_file_private *private = file->private_data; 5844 struct btrfs_dir_item *di; 5845 struct btrfs_key key; 5846 struct btrfs_key found_key; 5847 struct btrfs_path *path; 5848 void *addr; 5849 struct list_head ins_list; 5850 struct list_head del_list; 5851 int ret; 5852 struct extent_buffer *leaf; 5853 int slot; 5854 char *name_ptr; 5855 int name_len; 5856 int entries = 0; 5857 int total_len = 0; 5858 bool put = false; 5859 struct btrfs_key location; 5860 5861 if (!dir_emit_dots(file, ctx)) 5862 return 0; 5863 5864 path = btrfs_alloc_path(); 5865 if (!path) 5866 return -ENOMEM; 5867 5868 addr = private->filldir_buf; 5869 path->reada = READA_FORWARD; 5870 5871 INIT_LIST_HEAD(&ins_list); 5872 INIT_LIST_HEAD(&del_list); 5873 put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); 5874 5875 again: 5876 key.type = BTRFS_DIR_INDEX_KEY; 5877 key.offset = ctx->pos; 5878 key.objectid = btrfs_ino(BTRFS_I(inode)); 5879 5880 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5881 if (ret < 0) 5882 goto err; 5883 5884 while (1) { 5885 struct dir_entry *entry; 5886 5887 leaf = path->nodes[0]; 5888 slot = path->slots[0]; 5889 if (slot >= btrfs_header_nritems(leaf)) { 5890 ret = btrfs_next_leaf(root, path); 5891 if (ret < 0) 5892 goto err; 5893 else if (ret > 0) 5894 break; 5895 continue; 5896 } 5897 5898 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5899 5900 if (found_key.objectid != key.objectid) 5901 break; 5902 if (found_key.type != BTRFS_DIR_INDEX_KEY) 5903 break; 5904 if (found_key.offset < ctx->pos) 5905 goto next; 5906 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 5907 goto next; 5908 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5909 name_len = btrfs_dir_name_len(leaf, di); 5910 if ((total_len + sizeof(struct dir_entry) + name_len) >= 5911 PAGE_SIZE) { 5912 btrfs_release_path(path); 5913 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5914 if (ret) 5915 goto nopos; 5916 addr = private->filldir_buf; 5917 entries = 0; 5918 total_len = 0; 5919 goto again; 5920 } 5921 5922 entry = addr; 5923 put_unaligned(name_len, &entry->name_len); 5924 name_ptr = (char *)(entry + 1); 5925 read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), 5926 name_len); 5927 
put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], 5928 &entry->type); 5929 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5930 put_unaligned(location.objectid, &entry->ino); 5931 put_unaligned(found_key.offset, &entry->offset); 5932 entries++; 5933 addr += sizeof(struct dir_entry) + name_len; 5934 total_len += sizeof(struct dir_entry) + name_len; 5935 next: 5936 path->slots[0]++; 5937 } 5938 btrfs_release_path(path); 5939 5940 ret = btrfs_filldir(private->filldir_buf, entries, ctx); 5941 if (ret) 5942 goto nopos; 5943 5944 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 5945 if (ret) 5946 goto nopos; 5947 5948 /* 5949 * Stop new entries from being returned after we return the last 5950 * entry. 5951 * 5952 * New directory entries are assigned a strictly increasing 5953 * offset. This means that new entries created during readdir 5954 * are *guaranteed* to be seen in the future by that readdir. 5955 * This has broken buggy programs which operate on names as 5956 * they're returned by readdir. Until we re-use freed offsets 5957 * we have this hack to stop new entries from being returned 5958 * under the assumption that they'll never reach this huge 5959 * offset. 5960 * 5961 * This is being careful not to overflow 32bit loff_t unless the 5962 * last entry requires it because doing so has broken 32bit apps 5963 * in the past. 5964 */ 5965 if (ctx->pos >= INT_MAX) 5966 ctx->pos = LLONG_MAX; 5967 else 5968 ctx->pos = INT_MAX; 5969 nopos: 5970 ret = 0; 5971 err: 5972 if (put) 5973 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); 5974 btrfs_free_path(path); 5975 return ret; 5976 } 5977 5978 /* 5979 * This is somewhat expensive, updating the tree every time the 5980 * inode changes. But, it is most likely to find the inode in cache. 5981 * FIXME, needs more benchmarking...there are no reasons other than performance 5982 * to keep or drop this code. 5983 */ 5984 static int btrfs_dirty_inode(struct inode *inode) 5985 { 5986 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5987 struct btrfs_root *root = BTRFS_I(inode)->root; 5988 struct btrfs_trans_handle *trans; 5989 int ret; 5990 5991 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5992 return 0; 5993 5994 trans = btrfs_join_transaction(root); 5995 if (IS_ERR(trans)) 5996 return PTR_ERR(trans); 5997 5998 ret = btrfs_update_inode(trans, root, inode); 5999 if (ret && ret == -ENOSPC) { 6000 /* whoops, lets try again with the full transaction */ 6001 btrfs_end_transaction(trans); 6002 trans = btrfs_start_transaction(root, 1); 6003 if (IS_ERR(trans)) 6004 return PTR_ERR(trans); 6005 6006 ret = btrfs_update_inode(trans, root, inode); 6007 } 6008 btrfs_end_transaction(trans); 6009 if (BTRFS_I(inode)->delayed_node) 6010 btrfs_balance_delayed_items(fs_info); 6011 6012 return ret; 6013 } 6014 6015 /* 6016 * This is a copy of file_update_time. We need this so we can return error on 6017 * ENOSPC for updating the inode in the case of file write and mmap writes. 6018 */ 6019 static int btrfs_update_time(struct inode *inode, struct timespec64 *now, 6020 int flags) 6021 { 6022 struct btrfs_root *root = BTRFS_I(inode)->root; 6023 bool dirty = flags & ~S_VERSION; 6024 6025 if (btrfs_root_readonly(root)) 6026 return -EROFS; 6027 6028 if (flags & S_VERSION) 6029 dirty |= inode_maybe_inc_iversion(inode, dirty); 6030 if (flags & S_CTIME) 6031 inode->i_ctime = *now; 6032 if (flags & S_MTIME) 6033 inode->i_mtime = *now; 6034 if (flags & S_ATIME) 6035 inode->i_atime = *now; 6036 return dirty ? 
btrfs_dirty_inode(inode) : 0; 6037 } 6038 6039 /* 6040 * find the highest existing sequence number in a directory 6041 * and then set the in-memory index_cnt variable to reflect 6042 * free sequence numbers 6043 */ 6044 static int btrfs_set_inode_index_count(struct btrfs_inode *inode) 6045 { 6046 struct btrfs_root *root = inode->root; 6047 struct btrfs_key key, found_key; 6048 struct btrfs_path *path; 6049 struct extent_buffer *leaf; 6050 int ret; 6051 6052 key.objectid = btrfs_ino(inode); 6053 key.type = BTRFS_DIR_INDEX_KEY; 6054 key.offset = (u64)-1; 6055 6056 path = btrfs_alloc_path(); 6057 if (!path) 6058 return -ENOMEM; 6059 6060 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6061 if (ret < 0) 6062 goto out; 6063 /* FIXME: we should be able to handle this */ 6064 if (ret == 0) 6065 goto out; 6066 ret = 0; 6067 6068 /* 6069 * MAGIC NUMBER EXPLANATION: 6070 * since we search a directory based on f_pos we have to start at 2 6071 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 6072 * else has to start at 2 6073 */ 6074 if (path->slots[0] == 0) { 6075 inode->index_cnt = 2; 6076 goto out; 6077 } 6078 6079 path->slots[0]--; 6080 6081 leaf = path->nodes[0]; 6082 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6083 6084 if (found_key.objectid != btrfs_ino(inode) || 6085 found_key.type != BTRFS_DIR_INDEX_KEY) { 6086 inode->index_cnt = 2; 6087 goto out; 6088 } 6089 6090 inode->index_cnt = found_key.offset + 1; 6091 out: 6092 btrfs_free_path(path); 6093 return ret; 6094 } 6095 6096 /* 6097 * helper to find a free sequence number in a given directory. This current 6098 * code is very simple, later versions will do smarter things in the btree 6099 */ 6100 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) 6101 { 6102 int ret = 0; 6103 6104 if (dir->index_cnt == (u64)-1) { 6105 ret = btrfs_inode_delayed_dir_index_count(dir); 6106 if (ret) { 6107 ret = btrfs_set_inode_index_count(dir); 6108 if (ret) 6109 return ret; 6110 } 6111 } 6112 6113 *index = dir->index_cnt; 6114 dir->index_cnt++; 6115 6116 return ret; 6117 } 6118 6119 static int btrfs_insert_inode_locked(struct inode *inode) 6120 { 6121 struct btrfs_iget_args args; 6122 args.location = &BTRFS_I(inode)->location; 6123 args.root = BTRFS_I(inode)->root; 6124 6125 return insert_inode_locked4(inode, 6126 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6127 btrfs_find_actor, &args); 6128 } 6129 6130 /* 6131 * Inherit flags from the parent inode. 6132 * 6133 * Currently only the compression flags and the cow flags are inherited. 
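* That is, BTRFS_INODE_COMPRESS/NOCOMPRESS and BTRFS_INODE_NODATACOW; for regular files, inheriting NODATACOW also sets NODATASUM, as done below.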
6134 */ 6135 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 6136 { 6137 unsigned int flags; 6138 6139 if (!dir) 6140 return; 6141 6142 flags = BTRFS_I(dir)->flags; 6143 6144 if (flags & BTRFS_INODE_NOCOMPRESS) { 6145 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 6146 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 6147 } else if (flags & BTRFS_INODE_COMPRESS) { 6148 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; 6149 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 6150 } 6151 6152 if (flags & BTRFS_INODE_NODATACOW) { 6153 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 6154 if (S_ISREG(inode->i_mode)) 6155 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6156 } 6157 6158 btrfs_sync_inode_flags_to_i_flags(inode); 6159 } 6160 6161 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6162 struct btrfs_root *root, 6163 struct inode *dir, 6164 const char *name, int name_len, 6165 u64 ref_objectid, u64 objectid, 6166 umode_t mode, u64 *index) 6167 { 6168 struct btrfs_fs_info *fs_info = root->fs_info; 6169 struct inode *inode; 6170 struct btrfs_inode_item *inode_item; 6171 struct btrfs_key *location; 6172 struct btrfs_path *path; 6173 struct btrfs_inode_ref *ref; 6174 struct btrfs_key key[2]; 6175 u32 sizes[2]; 6176 int nitems = name ? 2 : 1; 6177 unsigned long ptr; 6178 int ret; 6179 6180 path = btrfs_alloc_path(); 6181 if (!path) 6182 return ERR_PTR(-ENOMEM); 6183 6184 inode = new_inode(fs_info->sb); 6185 if (!inode) { 6186 btrfs_free_path(path); 6187 return ERR_PTR(-ENOMEM); 6188 } 6189 6190 /* 6191 * O_TMPFILE, set link count to 0, so that after this point, 6192 * we fill in an inode item with the correct link count. 6193 */ 6194 if (!name) 6195 set_nlink(inode, 0); 6196 6197 /* 6198 * we have to initialize this early, so we can reclaim the inode 6199 * number if we fail afterwards in this function. 6200 */ 6201 inode->i_ino = objectid; 6202 6203 if (dir && name) { 6204 trace_btrfs_inode_request(dir); 6205 6206 ret = btrfs_set_inode_index(BTRFS_I(dir), index); 6207 if (ret) { 6208 btrfs_free_path(path); 6209 iput(inode); 6210 return ERR_PTR(ret); 6211 } 6212 } else if (dir) { 6213 *index = 0; 6214 } 6215 /* 6216 * index_cnt is ignored for everything but a dir, 6217 * btrfs_set_inode_index_count has an explanation for the magic 6218 * number 6219 */ 6220 BTRFS_I(inode)->index_cnt = 2; 6221 BTRFS_I(inode)->dir_index = *index; 6222 BTRFS_I(inode)->root = root; 6223 BTRFS_I(inode)->generation = trans->transid; 6224 inode->i_generation = BTRFS_I(inode)->generation; 6225 6226 /* 6227 * We could have gotten an inode number from somebody who was fsynced 6228 * and then removed in this same transaction, so let's just set full 6229 * sync since it will be a full sync anyway and this will blow away the 6230 * old info in the log. 6231 */ 6232 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6233 6234 key[0].objectid = objectid; 6235 key[0].type = BTRFS_INODE_ITEM_KEY; 6236 key[0].offset = 0; 6237 6238 sizes[0] = sizeof(struct btrfs_inode_item); 6239 6240 if (name) { 6241 /* 6242 * Start new inodes with an inode_ref. This is slightly more 6243 * efficient for small numbers of hard links since they will 6244 * be packed into one item. Extended refs will kick in if we 6245 * add more hard links than can fit in the ref item. 
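* (sizes[1] below reserves name_len + sizeof(*ref) bytes for this initial ref.)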
6246 */ 6247 key[1].objectid = objectid; 6248 key[1].type = BTRFS_INODE_REF_KEY; 6249 key[1].offset = ref_objectid; 6250 6251 sizes[1] = name_len + sizeof(*ref); 6252 } 6253 6254 location = &BTRFS_I(inode)->location; 6255 location->objectid = objectid; 6256 location->offset = 0; 6257 location->type = BTRFS_INODE_ITEM_KEY; 6258 6259 ret = btrfs_insert_inode_locked(inode); 6260 if (ret < 0) { 6261 iput(inode); 6262 goto fail; 6263 } 6264 6265 path->leave_spinning = 1; 6266 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); 6267 if (ret != 0) 6268 goto fail_unlock; 6269
6270 inode_init_owner(inode, dir, mode); 6271 inode_set_bytes(inode, 0); 6272 6273 inode->i_mtime = current_time(inode); 6274 inode->i_atime = inode->i_mtime; 6275 inode->i_ctime = inode->i_mtime; 6276 BTRFS_I(inode)->i_otime = inode->i_mtime; 6277 6278 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6279 struct btrfs_inode_item); 6280 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, 6281 sizeof(*inode_item)); 6282 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6283 6284 if (name) { 6285 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6286 struct btrfs_inode_ref); 6287 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6288 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 6289 ptr = (unsigned long)(ref + 1); 6290 write_extent_buffer(path->nodes[0], name, ptr, name_len); 6291 } 6292
6293 btrfs_mark_buffer_dirty(path->nodes[0]); 6294 btrfs_free_path(path); 6295 6296 btrfs_inherit_iflags(inode, dir); 6297 6298 if (S_ISREG(mode)) { 6299 if (btrfs_test_opt(fs_info, NODATASUM)) 6300 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6301 if (btrfs_test_opt(fs_info, NODATACOW)) 6302 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6303 BTRFS_INODE_NODATASUM; 6304 } 6305 6306 inode_tree_add(inode); 6307 6308 trace_btrfs_inode_new(inode); 6309 btrfs_set_inode_last_trans(trans, inode); 6310 6311 btrfs_update_root_times(trans, root); 6312 6313 ret = btrfs_inode_inherit_props(trans, inode, dir); 6314 if (ret) 6315 btrfs_err(fs_info, 6316 "error inheriting props for ino %llu (root %llu): %d", 6317 btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); 6318 6319 return inode; 6320
6321 fail_unlock: 6322 discard_new_inode(inode); 6323 fail: 6324 if (dir && name) 6325 BTRFS_I(dir)->index_cnt--; 6326 btrfs_free_path(path); 6327 return ERR_PTR(ret); 6328 } 6329 6330 static inline u8 btrfs_inode_type(struct inode *inode) 6331 { 6332 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 6333 } 6334 6335 /* 6336 * utility function to add 'inode' into 'parent_inode' with 6337 * a given name and a given sequence number. 6338 * if 'add_backref' is true, also insert a backref from the 6339 * inode to the parent directory.
6340 */ 6341 int btrfs_add_link(struct btrfs_trans_handle *trans, 6342 struct btrfs_inode *parent_inode, struct btrfs_inode *inode, 6343 const char *name, int name_len, int add_backref, u64 index) 6344 { 6345 int ret = 0; 6346 struct btrfs_key key; 6347 struct btrfs_root *root = parent_inode->root; 6348 u64 ino = btrfs_ino(inode); 6349 u64 parent_ino = btrfs_ino(parent_inode); 6350 6351 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6352 memcpy(&key, &inode->root->root_key, sizeof(key)); 6353 } else { 6354 key.objectid = ino; 6355 key.type = BTRFS_INODE_ITEM_KEY; 6356 key.offset = 0; 6357 } 6358 6359 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6360 ret = btrfs_add_root_ref(trans, key.objectid, 6361 root->root_key.objectid, parent_ino, 6362 index, name, name_len); 6363 } else if (add_backref) { 6364 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 6365 parent_ino, index); 6366 } 6367 6368 /* Nothing to clean up yet */ 6369 if (ret) 6370 return ret; 6371 6372 ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, 6373 btrfs_inode_type(&inode->vfs_inode), index); 6374 if (ret == -EEXIST || ret == -EOVERFLOW) 6375 goto fail_dir_item; 6376 else if (ret) { 6377 btrfs_abort_transaction(trans, ret); 6378 return ret; 6379 } 6380 6381 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6382 name_len * 2); 6383 inode_inc_iversion(&parent_inode->vfs_inode); 6384 parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = 6385 current_time(&parent_inode->vfs_inode); 6386 ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); 6387 if (ret) 6388 btrfs_abort_transaction(trans, ret); 6389 return ret; 6390 6391 fail_dir_item: 6392 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6393 u64 local_index; 6394 int err; 6395 err = btrfs_del_root_ref(trans, key.objectid, 6396 root->root_key.objectid, parent_ino, 6397 &local_index, name, name_len); 6398 6399 } else if (add_backref) { 6400 u64 local_index; 6401 int err; 6402 6403 err = btrfs_del_inode_ref(trans, root, name, name_len, 6404 ino, parent_ino, &local_index); 6405 } 6406 return ret; 6407 } 6408 6409 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6410 struct btrfs_inode *dir, struct dentry *dentry, 6411 struct btrfs_inode *inode, int backref, u64 index) 6412 { 6413 int err = btrfs_add_link(trans, dir, inode, 6414 dentry->d_name.name, dentry->d_name.len, 6415 backref, index); 6416 if (err > 0) 6417 err = -EEXIST; 6418 return err; 6419 } 6420 6421 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 6422 umode_t mode, dev_t rdev) 6423 { 6424 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6425 struct btrfs_trans_handle *trans; 6426 struct btrfs_root *root = BTRFS_I(dir)->root; 6427 struct inode *inode = NULL; 6428 int err; 6429 u64 objectid; 6430 u64 index = 0; 6431 6432 /* 6433 * 2 for inode item and ref 6434 * 2 for dir items 6435 * 1 for xattr if selinux is on 6436 */ 6437 trans = btrfs_start_transaction(root, 5); 6438 if (IS_ERR(trans)) 6439 return PTR_ERR(trans); 6440 6441 err = btrfs_find_free_ino(root, &objectid); 6442 if (err) 6443 goto out_unlock; 6444 6445 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6446 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6447 mode, &index); 6448 if (IS_ERR(inode)) { 6449 err = PTR_ERR(inode); 6450 inode = NULL; 6451 goto out_unlock; 6452 } 6453 6454 /* 6455 * If the active LSM wants to access the inode during 6456 * d_instantiate it needs these. 
Smack checks to see 6457 * if the filesystem supports xattrs by looking at the 6458 * ops vector. 6459 */ 6460 inode->i_op = &btrfs_special_inode_operations; 6461 init_special_inode(inode, inode->i_mode, rdev); 6462 6463 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6464 if (err) 6465 goto out_unlock; 6466 6467 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6468 0, index); 6469 if (err) 6470 goto out_unlock; 6471 6472 btrfs_update_inode(trans, root, inode); 6473 d_instantiate_new(dentry, inode); 6474 6475 out_unlock: 6476 btrfs_end_transaction(trans); 6477 btrfs_btree_balance_dirty(fs_info); 6478 if (err && inode) { 6479 inode_dec_link_count(inode); 6480 discard_new_inode(inode); 6481 } 6482 return err; 6483 } 6484 6485 static int btrfs_create(struct inode *dir, struct dentry *dentry, 6486 umode_t mode, bool excl) 6487 { 6488 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6489 struct btrfs_trans_handle *trans; 6490 struct btrfs_root *root = BTRFS_I(dir)->root; 6491 struct inode *inode = NULL; 6492 int err; 6493 u64 objectid; 6494 u64 index = 0; 6495 6496 /* 6497 * 2 for inode item and ref 6498 * 2 for dir items 6499 * 1 for xattr if selinux is on 6500 */ 6501 trans = btrfs_start_transaction(root, 5); 6502 if (IS_ERR(trans)) 6503 return PTR_ERR(trans); 6504 6505 err = btrfs_find_free_ino(root, &objectid); 6506 if (err) 6507 goto out_unlock; 6508 6509 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6510 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6511 mode, &index); 6512 if (IS_ERR(inode)) { 6513 err = PTR_ERR(inode); 6514 inode = NULL; 6515 goto out_unlock; 6516 } 6517 /* 6518 * If the active LSM wants to access the inode during 6519 * d_instantiate it needs these. Smack checks to see 6520 * if the filesystem supports xattrs by looking at the 6521 * ops vector. 
6522 */ 6523 inode->i_fop = &btrfs_file_operations; 6524 inode->i_op = &btrfs_file_inode_operations; 6525 inode->i_mapping->a_ops = &btrfs_aops; 6526 6527 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6528 if (err) 6529 goto out_unlock; 6530 6531 err = btrfs_update_inode(trans, root, inode); 6532 if (err) 6533 goto out_unlock; 6534 6535 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6536 0, index); 6537 if (err) 6538 goto out_unlock; 6539 6540 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 6541 d_instantiate_new(dentry, inode); 6542 6543 out_unlock: 6544 btrfs_end_transaction(trans); 6545 if (err && inode) { 6546 inode_dec_link_count(inode); 6547 discard_new_inode(inode); 6548 } 6549 btrfs_btree_balance_dirty(fs_info); 6550 return err; 6551 } 6552 6553 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6554 struct dentry *dentry) 6555 { 6556 struct btrfs_trans_handle *trans = NULL; 6557 struct btrfs_root *root = BTRFS_I(dir)->root; 6558 struct inode *inode = d_inode(old_dentry); 6559 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6560 u64 index; 6561 int err; 6562 int drop_inode = 0; 6563 6564 /* do not allow sys_link's with other subvols of the same device */ 6565 if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) 6566 return -EXDEV; 6567 6568 if (inode->i_nlink >= BTRFS_LINK_MAX) 6569 return -EMLINK; 6570 6571 err = btrfs_set_inode_index(BTRFS_I(dir), &index); 6572 if (err) 6573 goto fail; 6574 6575 /* 6576 * 2 items for inode and inode ref 6577 * 2 items for dir items 6578 * 1 item for parent inode 6579 * 1 item for orphan item deletion if O_TMPFILE 6580 */ 6581 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); 6582 if (IS_ERR(trans)) { 6583 err = PTR_ERR(trans); 6584 trans = NULL; 6585 goto fail; 6586 } 6587 6588 /* There are several dir indexes for this inode, clear the cache. */ 6589 BTRFS_I(inode)->dir_index = 0ULL; 6590 inc_nlink(inode); 6591 inode_inc_iversion(inode); 6592 inode->i_ctime = current_time(inode); 6593 ihold(inode); 6594 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6595 6596 err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6597 1, index); 6598 6599 if (err) { 6600 drop_inode = 1; 6601 } else { 6602 struct dentry *parent = dentry->d_parent; 6603 int ret; 6604 6605 err = btrfs_update_inode(trans, root, inode); 6606 if (err) 6607 goto fail; 6608 if (inode->i_nlink == 1) { 6609 /* 6610 * If new hard link count is 1, it's a file created 6611 * with open(2) O_TMPFILE flag. 
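* Linking raised i_nlink from 0 to 1, so the orphan item that was tracking the tmpfile must be removed now (btrfs_orphan_del() below).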
6612 */ 6613 err = btrfs_orphan_del(trans, BTRFS_I(inode)); 6614 if (err) 6615 goto fail; 6616 } 6617 d_instantiate(dentry, inode); 6618 ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, 6619 true, NULL); 6620 if (ret == BTRFS_NEED_TRANS_COMMIT) { 6621 err = btrfs_commit_transaction(trans); 6622 trans = NULL; 6623 } 6624 } 6625 6626 fail: 6627 if (trans) 6628 btrfs_end_transaction(trans); 6629 if (drop_inode) { 6630 inode_dec_link_count(inode); 6631 iput(inode); 6632 } 6633 btrfs_btree_balance_dirty(fs_info); 6634 return err; 6635 } 6636 6637 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 6638 { 6639 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6640 struct inode *inode = NULL; 6641 struct btrfs_trans_handle *trans; 6642 struct btrfs_root *root = BTRFS_I(dir)->root; 6643 int err = 0; 6644 int drop_on_err = 0; 6645 u64 objectid = 0; 6646 u64 index = 0; 6647 6648 /* 6649 * 2 items for inode and ref 6650 * 2 items for dir items 6651 * 1 for xattr if selinux is on 6652 */ 6653 trans = btrfs_start_transaction(root, 5); 6654 if (IS_ERR(trans)) 6655 return PTR_ERR(trans); 6656 6657 err = btrfs_find_free_ino(root, &objectid); 6658 if (err) 6659 goto out_fail; 6660 6661 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6662 dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, 6663 S_IFDIR | mode, &index); 6664 if (IS_ERR(inode)) { 6665 err = PTR_ERR(inode); 6666 inode = NULL; 6667 goto out_fail; 6668 } 6669 6670 drop_on_err = 1; 6671 /* these must be set before we unlock the inode */ 6672 inode->i_op = &btrfs_dir_inode_operations; 6673 inode->i_fop = &btrfs_dir_file_operations; 6674 6675 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6676 if (err) 6677 goto out_fail; 6678 6679 btrfs_i_size_write(BTRFS_I(inode), 0); 6680 err = btrfs_update_inode(trans, root, inode); 6681 if (err) 6682 goto out_fail; 6683 6684 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6685 dentry->d_name.name, 6686 dentry->d_name.len, 0, index); 6687 if (err) 6688 goto out_fail; 6689 6690 d_instantiate_new(dentry, inode); 6691 drop_on_err = 0; 6692 6693 out_fail: 6694 btrfs_end_transaction(trans); 6695 if (err && inode) { 6696 inode_dec_link_count(inode); 6697 discard_new_inode(inode); 6698 } 6699 btrfs_btree_balance_dirty(fs_info); 6700 return err; 6701 } 6702 6703 static noinline int uncompress_inline(struct btrfs_path *path, 6704 struct page *page, 6705 size_t pg_offset, u64 extent_offset, 6706 struct btrfs_file_extent_item *item) 6707 { 6708 int ret; 6709 struct extent_buffer *leaf = path->nodes[0]; 6710 char *tmp; 6711 size_t max_size; 6712 unsigned long inline_size; 6713 unsigned long ptr; 6714 int compress_type; 6715 6716 WARN_ON(pg_offset != 0); 6717 compress_type = btrfs_file_extent_compression(leaf, item); 6718 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6719 inline_size = btrfs_file_extent_inline_item_len(leaf, 6720 btrfs_item_nr(path->slots[0])); 6721 tmp = kmalloc(inline_size, GFP_NOFS); 6722 if (!tmp) 6723 return -ENOMEM; 6724 ptr = btrfs_file_extent_inline_start(item); 6725 6726 read_extent_buffer(leaf, tmp, ptr, inline_size); 6727 6728 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6729 ret = btrfs_decompress(compress_type, tmp, page, 6730 extent_offset, inline_size, max_size); 6731 6732 /* 6733 * decompression code contains a memset to fill in any space between the end 6734 * of the uncompressed data and the end of max_size in case the decompressed 6735 * data ends up shorter than ram_bytes. 
That doesn't cover the hole between 6736 * the end of an inline extent and the beginning of the next block, so we 6737 * cover that region here. 6738 */ 6739 6740 if (max_size + pg_offset < PAGE_SIZE) { 6741 char *map = kmap(page); 6742 memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); 6743 kunmap(page); 6744 } 6745 kfree(tmp); 6746 return ret; 6747 } 6748 6749 /* 6750 * a bit scary, this does extent mapping from logical file offset to the disk. 6751 * the ugly parts come from merging extents from the disk with the in-ram 6752 * representation. This gets more complex because of the data=ordered code, 6753 * where the in-ram extents might be locked pending data=ordered completion. 6754 * 6755 * This also copies inline extents directly into the page. 6756 */ 6757 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, 6758 struct page *page, 6759 size_t pg_offset, u64 start, u64 len, 6760 int create) 6761 { 6762 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6763 int ret; 6764 int err = 0; 6765 u64 extent_start = 0; 6766 u64 extent_end = 0; 6767 u64 objectid = btrfs_ino(inode); 6768 u32 found_type; 6769 struct btrfs_path *path = NULL; 6770 struct btrfs_root *root = inode->root; 6771 struct btrfs_file_extent_item *item; 6772 struct extent_buffer *leaf; 6773 struct btrfs_key found_key; 6774 struct extent_map *em = NULL; 6775 struct extent_map_tree *em_tree = &inode->extent_tree; 6776 struct extent_io_tree *io_tree = &inode->io_tree; 6777 const bool new_inline = !page || create; 6778 6779 read_lock(&em_tree->lock); 6780 em = lookup_extent_mapping(em_tree, start, len); 6781 if (em) 6782 em->bdev = fs_info->fs_devices->latest_bdev; 6783 read_unlock(&em_tree->lock); 6784 6785 if (em) { 6786 if (em->start > start || em->start + em->len <= start) 6787 free_extent_map(em); 6788 else if (em->block_start == EXTENT_MAP_INLINE && page) 6789 free_extent_map(em); 6790 else 6791 goto out; 6792 } 6793 em = alloc_extent_map(); 6794 if (!em) { 6795 err = -ENOMEM; 6796 goto out; 6797 } 6798 em->bdev = fs_info->fs_devices->latest_bdev; 6799 em->start = EXTENT_MAP_HOLE; 6800 em->orig_start = EXTENT_MAP_HOLE; 6801 em->len = (u64)-1; 6802 em->block_len = (u64)-1; 6803 6804 path = btrfs_alloc_path(); 6805 if (!path) { 6806 err = -ENOMEM; 6807 goto out; 6808 } 6809 6810 /* Chances are we'll be called again, so go ahead and do readahead */ 6811 path->reada = READA_FORWARD; 6812 6813 /* 6814 * Unless we're going to uncompress the inline extent, no sleep would 6815 * happen. 6816 */ 6817 path->leave_spinning = 1; 6818 6819 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); 6820 if (ret < 0) { 6821 err = ret; 6822 goto out; 6823 } 6824 6825 if (ret != 0) { 6826 if (path->slots[0] == 0) 6827 goto not_found; 6828 path->slots[0]--; 6829 } 6830 6831 leaf = path->nodes[0]; 6832 item = btrfs_item_ptr(leaf, path->slots[0], 6833 struct btrfs_file_extent_item); 6834 /* are we inside the extent that was found? */ 6835 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6836 found_type = found_key.type; 6837 if (found_key.objectid != objectid || 6838 found_type != BTRFS_EXTENT_DATA_KEY) { 6839 /* 6840 * If we backup past the first extent we want to move forward 6841 * and see if there is an extent in front of us, otherwise we'll 6842 * say there is a hole for our whole search range which can 6843 * cause problems. 
6844 */ 6845 extent_end = start; 6846 goto next; 6847 } 6848 6849 found_type = btrfs_file_extent_type(leaf, item); 6850 extent_start = found_key.offset; 6851 if (found_type == BTRFS_FILE_EXTENT_REG || 6852 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6853 extent_end = extent_start + 6854 btrfs_file_extent_num_bytes(leaf, item); 6855 6856 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, 6857 extent_start); 6858 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6859 size_t size; 6860 6861 size = btrfs_file_extent_ram_bytes(leaf, item); 6862 extent_end = ALIGN(extent_start + size, 6863 fs_info->sectorsize); 6864 6865 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, 6866 path->slots[0], 6867 extent_start); 6868 } 6869 next: 6870 if (start >= extent_end) { 6871 path->slots[0]++; 6872 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6873 ret = btrfs_next_leaf(root, path); 6874 if (ret < 0) { 6875 err = ret; 6876 goto out; 6877 } 6878 if (ret > 0) 6879 goto not_found; 6880 leaf = path->nodes[0]; 6881 } 6882 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6883 if (found_key.objectid != objectid || 6884 found_key.type != BTRFS_EXTENT_DATA_KEY) 6885 goto not_found; 6886 if (start + len <= found_key.offset) 6887 goto not_found; 6888 if (start > found_key.offset) 6889 goto next; 6890 em->start = start; 6891 em->orig_start = start; 6892 em->len = found_key.offset - start; 6893 goto not_found_em; 6894 } 6895 6896 btrfs_extent_item_to_extent_map(inode, path, item, 6897 new_inline, em); 6898 6899 if (found_type == BTRFS_FILE_EXTENT_REG || 6900 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6901 goto insert; 6902 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6903 unsigned long ptr; 6904 char *map; 6905 size_t size; 6906 size_t extent_offset; 6907 size_t copy_size; 6908 6909 if (new_inline) 6910 goto out; 6911 6912 size = btrfs_file_extent_ram_bytes(leaf, item); 6913 extent_offset = page_offset(page) + pg_offset - extent_start; 6914 copy_size = min_t(u64, PAGE_SIZE - pg_offset, 6915 size - extent_offset); 6916 em->start = extent_start + extent_offset; 6917 em->len = ALIGN(copy_size, fs_info->sectorsize); 6918 em->orig_block_len = em->len; 6919 em->orig_start = em->start; 6920 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6921 6922 btrfs_set_path_blocking(path); 6923 if (!PageUptodate(page)) { 6924 if (btrfs_file_extent_compression(leaf, item) != 6925 BTRFS_COMPRESS_NONE) { 6926 ret = uncompress_inline(path, page, pg_offset, 6927 extent_offset, item); 6928 if (ret) { 6929 err = ret; 6930 goto out; 6931 } 6932 } else { 6933 map = kmap(page); 6934 read_extent_buffer(leaf, map + pg_offset, ptr, 6935 copy_size); 6936 if (pg_offset + copy_size < PAGE_SIZE) { 6937 memset(map + pg_offset + copy_size, 0, 6938 PAGE_SIZE - pg_offset - 6939 copy_size); 6940 } 6941 kunmap(page); 6942 } 6943 flush_dcache_page(page); 6944 } 6945 set_extent_uptodate(io_tree, em->start, 6946 extent_map_end(em) - 1, NULL, GFP_NOFS); 6947 goto insert; 6948 } 6949 not_found: 6950 em->start = start; 6951 em->orig_start = start; 6952 em->len = len; 6953 not_found_em: 6954 em->block_start = EXTENT_MAP_HOLE; 6955 insert: 6956 btrfs_release_path(path); 6957 if (em->start > start || extent_map_end(em) <= start) { 6958 btrfs_err(fs_info, 6959 "bad extent! 
em: [%llu %llu] passed [%llu %llu]", 6960 em->start, em->len, start, len); 6961 err = -EIO; 6962 goto out; 6963 } 6964 6965 err = 0; 6966 write_lock(&em_tree->lock); 6967 err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 6968 write_unlock(&em_tree->lock); 6969 out: 6970 btrfs_free_path(path); 6971 6972 trace_btrfs_get_extent(root, inode, em); 6973 6974 if (err) { 6975 free_extent_map(em); 6976 return ERR_PTR(err); 6977 } 6978 BUG_ON(!em); /* Error is always set */ 6979 return em; 6980 } 6981 6982 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, 6983 struct page *page, 6984 size_t pg_offset, u64 start, u64 len, 6985 int create) 6986 { 6987 struct extent_map *em; 6988 struct extent_map *hole_em = NULL; 6989 u64 range_start = start; 6990 u64 end; 6991 u64 found; 6992 u64 found_end; 6993 int err = 0; 6994 6995 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 6996 if (IS_ERR(em)) 6997 return em; 6998 /* 6999 * If our em maps to: 7000 * - a hole or 7001 * - a pre-alloc extent, 7002 * there might actually be delalloc bytes behind it. 7003 */ 7004 if (em->block_start != EXTENT_MAP_HOLE && 7005 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7006 return em; 7007 else 7008 hole_em = em; 7009 7010 /* check to see if we've wrapped (len == -1 or similar) */ 7011 end = start + len; 7012 if (end < start) 7013 end = (u64)-1; 7014 else 7015 end -= 1; 7016 7017 em = NULL; 7018 7019 /* ok, we didn't find anything, lets look for delalloc */ 7020 found = count_range_bits(&inode->io_tree, &range_start, 7021 end, len, EXTENT_DELALLOC, 1); 7022 found_end = range_start + found; 7023 if (found_end < range_start) 7024 found_end = (u64)-1; 7025 7026 /* 7027 * we didn't find anything useful, return 7028 * the original results from get_extent() 7029 */ 7030 if (range_start > end || found_end <= start) { 7031 em = hole_em; 7032 hole_em = NULL; 7033 goto out; 7034 } 7035 7036 /* adjust the range_start to make sure it doesn't 7037 * go backwards from the start they passed in 7038 */ 7039 range_start = max(start, range_start); 7040 found = found_end - range_start; 7041 7042 if (found > 0) { 7043 u64 hole_start = start; 7044 u64 hole_len = len; 7045 7046 em = alloc_extent_map(); 7047 if (!em) { 7048 err = -ENOMEM; 7049 goto out; 7050 } 7051 /* 7052 * when btrfs_get_extent can't find anything it 7053 * returns one huge hole 7054 * 7055 * make sure what it found really fits our range, and 7056 * adjust to make sure it is based on the start from 7057 * the caller 7058 */ 7059 if (hole_em) { 7060 u64 calc_end = extent_map_end(hole_em); 7061 7062 if (calc_end <= start || (hole_em->start > end)) { 7063 free_extent_map(hole_em); 7064 hole_em = NULL; 7065 } else { 7066 hole_start = max(hole_em->start, start); 7067 hole_len = calc_end - hole_start; 7068 } 7069 } 7070 em->bdev = NULL; 7071 if (hole_em && range_start > hole_start) { 7072 /* our hole starts before our delalloc, so we 7073 * have to return just the parts of the hole 7074 * that go until the delalloc starts 7075 */ 7076 em->len = min(hole_len, 7077 range_start - hole_start); 7078 em->start = hole_start; 7079 em->orig_start = hole_start; 7080 /* 7081 * don't adjust block start at all, 7082 * it is fixed at EXTENT_MAP_HOLE 7083 */ 7084 em->block_start = hole_em->block_start; 7085 em->block_len = hole_len; 7086 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7087 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7088 } else { 7089 em->start = range_start; 7090 em->len = found; 7091 em->orig_start = range_start; 7092 
em->block_start = EXTENT_MAP_DELALLOC; 7093 em->block_len = found; 7094 } 7095 } else { 7096 return hole_em; 7097 } 7098 out: 7099 7100 free_extent_map(hole_em); 7101 if (err) { 7102 free_extent_map(em); 7103 return ERR_PTR(err); 7104 } 7105 return em; 7106 } 7107 7108 static struct extent_map *btrfs_create_dio_extent(struct inode *inode, 7109 const u64 start, 7110 const u64 len, 7111 const u64 orig_start, 7112 const u64 block_start, 7113 const u64 block_len, 7114 const u64 orig_block_len, 7115 const u64 ram_bytes, 7116 const int type) 7117 { 7118 struct extent_map *em = NULL; 7119 int ret; 7120 7121 if (type != BTRFS_ORDERED_NOCOW) { 7122 em = create_io_em(inode, start, len, orig_start, 7123 block_start, block_len, orig_block_len, 7124 ram_bytes, 7125 BTRFS_COMPRESS_NONE, /* compress_type */ 7126 type); 7127 if (IS_ERR(em)) 7128 goto out; 7129 } 7130 ret = btrfs_add_ordered_extent_dio(inode, start, block_start, 7131 len, block_len, type); 7132 if (ret) { 7133 if (em) { 7134 free_extent_map(em); 7135 btrfs_drop_extent_cache(BTRFS_I(inode), start, 7136 start + len - 1, 0); 7137 } 7138 em = ERR_PTR(ret); 7139 } 7140 out: 7141 7142 return em; 7143 } 7144 7145 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 7146 u64 start, u64 len) 7147 { 7148 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7149 struct btrfs_root *root = BTRFS_I(inode)->root; 7150 struct extent_map *em; 7151 struct btrfs_key ins; 7152 u64 alloc_hint; 7153 int ret; 7154 7155 alloc_hint = get_extent_allocation_hint(inode, start, len); 7156 ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 7157 0, alloc_hint, &ins, 1, 1); 7158 if (ret) 7159 return ERR_PTR(ret); 7160 7161 em = btrfs_create_dio_extent(inode, start, ins.offset, start, 7162 ins.objectid, ins.offset, ins.offset, 7163 ins.offset, BTRFS_ORDERED_REGULAR); 7164 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 7165 if (IS_ERR(em)) 7166 btrfs_free_reserved_extent(fs_info, ins.objectid, 7167 ins.offset, 1); 7168 7169 return em; 7170 } 7171 7172 /* 7173 * returns 1 when the nocow is safe, < 1 on error, 0 if the 7174 * block must be cow'd 7175 */ 7176 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7177 u64 *orig_start, u64 *orig_block_len, 7178 u64 *ram_bytes) 7179 { 7180 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7181 struct btrfs_path *path; 7182 int ret; 7183 struct extent_buffer *leaf; 7184 struct btrfs_root *root = BTRFS_I(inode)->root; 7185 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7186 struct btrfs_file_extent_item *fi; 7187 struct btrfs_key key; 7188 u64 disk_bytenr; 7189 u64 backref_offset; 7190 u64 extent_end; 7191 u64 num_bytes; 7192 int slot; 7193 int found_type; 7194 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7195 7196 path = btrfs_alloc_path(); 7197 if (!path) 7198 return -ENOMEM; 7199 7200 ret = btrfs_lookup_file_extent(NULL, root, path, 7201 btrfs_ino(BTRFS_I(inode)), offset, 0); 7202 if (ret < 0) 7203 goto out; 7204 7205 slot = path->slots[0]; 7206 if (ret == 1) { 7207 if (slot == 0) { 7208 /* can't find the item, must cow */ 7209 ret = 0; 7210 goto out; 7211 } 7212 slot--; 7213 } 7214 ret = 0; 7215 leaf = path->nodes[0]; 7216 btrfs_item_key_to_cpu(leaf, &key, slot); 7217 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7218 key.type != BTRFS_EXTENT_DATA_KEY) { 7219 /* not our file or wrong item type, must cow */ 7220 goto out; 7221 } 7222 7223 if (key.offset > offset) { 7224 /* Wrong offset, must cow */ 7225 goto out; 7226 } 7227 
7228 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7229 found_type = btrfs_file_extent_type(leaf, fi); 7230 if (found_type != BTRFS_FILE_EXTENT_REG && 7231 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7232 /* not a regular extent, must cow */ 7233 goto out; 7234 } 7235 7236 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7237 goto out; 7238 7239 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7240 if (extent_end <= offset) 7241 goto out; 7242 7243 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7244 if (disk_bytenr == 0) 7245 goto out; 7246 7247 if (btrfs_file_extent_compression(leaf, fi) || 7248 btrfs_file_extent_encryption(leaf, fi) || 7249 btrfs_file_extent_other_encoding(leaf, fi)) 7250 goto out; 7251 7252 /* 7253 * Do the same check as in btrfs_cross_ref_exist but without the 7254 * unnecessary search. 7255 */ 7256 if (btrfs_file_extent_generation(leaf, fi) <= 7257 btrfs_root_last_snapshot(&root->root_item)) 7258 goto out; 7259 7260 backref_offset = btrfs_file_extent_offset(leaf, fi); 7261 7262 if (orig_start) { 7263 *orig_start = key.offset - backref_offset; 7264 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7265 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7266 } 7267 7268 if (btrfs_extent_readonly(fs_info, disk_bytenr)) 7269 goto out; 7270 7271 num_bytes = min(offset + *len, extent_end) - offset; 7272 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7273 u64 range_end; 7274 7275 range_end = round_up(offset + num_bytes, 7276 root->fs_info->sectorsize) - 1; 7277 ret = test_range_bit(io_tree, offset, range_end, 7278 EXTENT_DELALLOC, 0, NULL); 7279 if (ret) { 7280 ret = -EAGAIN; 7281 goto out; 7282 } 7283 } 7284 7285 btrfs_release_path(path); 7286 7287 /* 7288 * look for other files referencing this extent, if we 7289 * find any we must cow 7290 */ 7291 7292 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), 7293 key.offset - backref_offset, disk_bytenr); 7294 if (ret) { 7295 ret = 0; 7296 goto out; 7297 } 7298 7299 /* 7300 * adjust disk_bytenr and num_bytes to cover just the bytes 7301 * in this extent we are about to write. If there 7302 * are any csums in that range we have to cow in order 7303 * to keep the csums correct 7304 */ 7305 disk_bytenr += backref_offset; 7306 disk_bytenr += offset - key.offset; 7307 if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) 7308 goto out; 7309 /* 7310 * all of the above have passed, it is safe to overwrite this extent 7311 * without cow 7312 */ 7313 *len = num_bytes; 7314 ret = 1; 7315 out: 7316 btrfs_free_path(path); 7317 return ret; 7318 } 7319 7320 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7321 struct extent_state **cached_state, int writing) 7322 { 7323 struct btrfs_ordered_extent *ordered; 7324 int ret = 0; 7325 7326 while (1) { 7327 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7328 cached_state); 7329 /* 7330 * We're concerned with the entire range that we're going to be 7331 * doing DIO to, so we need to make sure there's no ordered 7332 * extents in this range. 7333 */ 7334 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, 7335 lockend - lockstart + 1); 7336 7337 /* 7338 * We need to make sure there are no buffered pages in this 7339 * range either, we could have raced between the invalidate in 7340 * generic_file_direct_write and locking the extent. The 7341 * invalidate needs to happen so that reads after a write do not 7342 * get stale data. 
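* (For writes, filemap_range_has_page() below is what detects such leftover pages.)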
7343 */ 7344 if (!ordered && 7345 (!writing || !filemap_range_has_page(inode->i_mapping, 7346 lockstart, lockend))) 7347 break; 7348 7349 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7350 cached_state); 7351 7352 if (ordered) { 7353 /* 7354 * If we are doing a DIO read and the ordered extent we 7355 * found is for a buffered write, we cannot wait for it 7356 * to complete and retry, because if we do so we can 7357 * deadlock with concurrent buffered writes on page 7358 * locks. This happens only if our DIO read covers more 7359 * than one extent map, we have at this point already 7360 * created an ordered extent for a previous extent map 7361 * and locked its range in the inode's io tree, and a 7362 * concurrent write against that previous extent map's 7363 * range and this range has started (we unlock the ranges 7364 * in the io tree only when the bios complete and 7365 * buffered writes always lock pages before attempting 7366 * to lock a range in the io tree). 7367 */ 7368 if (writing || 7369 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7370 btrfs_start_ordered_extent(inode, ordered, 1); 7371 else 7372 ret = -ENOTBLK; 7373 btrfs_put_ordered_extent(ordered); 7374 } else { 7375 /* 7376 * We could trigger writeback for this range (and wait 7377 * for it to complete) and then invalidate the pages for 7378 * this range (through invalidate_inode_pages2_range()), 7379 * but that can lead us to a deadlock with a concurrent 7380 * call to readpages() (a buffered read or a defrag call 7381 * triggering a readahead) on a page lock, due to an 7382 * ordered dio extent we created before but did not yet 7383 * have a corresponding bio submitted (hence it cannot 7384 * complete), which makes readpages() wait for that 7385 * ordered extent to complete while holding a lock on 7386 * that page.
7387 */ 7388 ret = -ENOTBLK; 7389 } 7390 7391 if (ret) 7392 break; 7393 7394 cond_resched(); 7395 } 7396 7397 return ret; 7398 } 7399 7400 /* The callers of this must take lock_extent() */ 7401 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, 7402 u64 orig_start, u64 block_start, 7403 u64 block_len, u64 orig_block_len, 7404 u64 ram_bytes, int compress_type, 7405 int type) 7406 { 7407 struct extent_map_tree *em_tree; 7408 struct extent_map *em; 7409 struct btrfs_root *root = BTRFS_I(inode)->root; 7410 int ret; 7411 7412 ASSERT(type == BTRFS_ORDERED_PREALLOC || 7413 type == BTRFS_ORDERED_COMPRESSED || 7414 type == BTRFS_ORDERED_NOCOW || 7415 type == BTRFS_ORDERED_REGULAR); 7416 7417 em_tree = &BTRFS_I(inode)->extent_tree; 7418 em = alloc_extent_map(); 7419 if (!em) 7420 return ERR_PTR(-ENOMEM); 7421 7422 em->start = start; 7423 em->orig_start = orig_start; 7424 em->len = len; 7425 em->block_len = block_len; 7426 em->block_start = block_start; 7427 em->bdev = root->fs_info->fs_devices->latest_bdev; 7428 em->orig_block_len = orig_block_len; 7429 em->ram_bytes = ram_bytes; 7430 em->generation = -1; 7431 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7432 if (type == BTRFS_ORDERED_PREALLOC) { 7433 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7434 } else if (type == BTRFS_ORDERED_COMPRESSED) { 7435 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 7436 em->compress_type = compress_type; 7437 } 7438 7439 do { 7440 btrfs_drop_extent_cache(BTRFS_I(inode), em->start, 7441 em->start + em->len - 1, 0); 7442 write_lock(&em_tree->lock); 7443 ret = add_extent_mapping(em_tree, em, 1); 7444 write_unlock(&em_tree->lock); 7445 /* 7446 * The caller has taken lock_extent(), who could race with us 7447 * to add em? 7448 */ 7449 } while (ret == -EEXIST); 7450 7451 if (ret) { 7452 free_extent_map(em); 7453 return ERR_PTR(ret); 7454 } 7455 7456 /* em got 2 refs now, callers needs to do free_extent_map once. */ 7457 return em; 7458 } 7459 7460 7461 static int btrfs_get_blocks_direct_read(struct extent_map *em, 7462 struct buffer_head *bh_result, 7463 struct inode *inode, 7464 u64 start, u64 len) 7465 { 7466 if (em->block_start == EXTENT_MAP_HOLE || 7467 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7468 return -ENOENT; 7469 7470 len = min(len, em->len - (start - em->start)); 7471 7472 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 7473 inode->i_blkbits; 7474 bh_result->b_size = len; 7475 bh_result->b_bdev = em->bdev; 7476 set_buffer_mapped(bh_result); 7477 7478 return 0; 7479 } 7480 7481 static int btrfs_get_blocks_direct_write(struct extent_map **map, 7482 struct buffer_head *bh_result, 7483 struct inode *inode, 7484 struct btrfs_dio_data *dio_data, 7485 u64 start, u64 len) 7486 { 7487 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7488 struct extent_map *em = *map; 7489 int ret = 0; 7490 7491 /* 7492 * We don't allocate a new extent in the following cases 7493 * 7494 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7495 * existing extent. 7496 * 2) The extent is marked as PREALLOC. We're good to go here and can 7497 * just use the extent. 
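* In both cases an ordered extent (BTRFS_ORDERED_NOCOW or BTRFS_ORDERED_PREALLOC) is still created via btrfs_create_dio_extent() below.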
7498 * 7499 */ 7500 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7501 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7502 em->block_start != EXTENT_MAP_HOLE)) { 7503 int type; 7504 u64 block_start, orig_start, orig_block_len, ram_bytes; 7505 7506 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7507 type = BTRFS_ORDERED_PREALLOC; 7508 else 7509 type = BTRFS_ORDERED_NOCOW; 7510 len = min(len, em->len - (start - em->start)); 7511 block_start = em->block_start + (start - em->start); 7512 7513 if (can_nocow_extent(inode, start, &len, &orig_start, 7514 &orig_block_len, &ram_bytes) == 1 && 7515 btrfs_inc_nocow_writers(fs_info, block_start)) { 7516 struct extent_map *em2; 7517 7518 em2 = btrfs_create_dio_extent(inode, start, len, 7519 orig_start, block_start, 7520 len, orig_block_len, 7521 ram_bytes, type); 7522 btrfs_dec_nocow_writers(fs_info, block_start); 7523 if (type == BTRFS_ORDERED_PREALLOC) { 7524 free_extent_map(em); 7525 *map = em = em2; 7526 } 7527 7528 if (em2 && IS_ERR(em2)) { 7529 ret = PTR_ERR(em2); 7530 goto out; 7531 } 7532 /* 7533 * For inode marked NODATACOW or extent marked PREALLOC, 7534 * use the existing or preallocated extent, so does not 7535 * need to adjust btrfs_space_info's bytes_may_use. 7536 */ 7537 btrfs_free_reserved_data_space_noquota(inode, start, 7538 len); 7539 goto skip_cow; 7540 } 7541 } 7542 7543 /* this will cow the extent */ 7544 len = bh_result->b_size; 7545 free_extent_map(em); 7546 *map = em = btrfs_new_extent_direct(inode, start, len); 7547 if (IS_ERR(em)) { 7548 ret = PTR_ERR(em); 7549 goto out; 7550 } 7551 7552 len = min(len, em->len - (start - em->start)); 7553 7554 skip_cow: 7555 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 7556 inode->i_blkbits; 7557 bh_result->b_size = len; 7558 bh_result->b_bdev = em->bdev; 7559 set_buffer_mapped(bh_result); 7560 7561 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7562 set_buffer_new(bh_result); 7563 7564 /* 7565 * Need to update the i_size under the extent lock so buffered 7566 * readers will get the updated i_size when we unlock. 7567 */ 7568 if (!dio_data->overwrite && start + len > i_size_read(inode)) 7569 i_size_write(inode, start + len); 7570 7571 WARN_ON(dio_data->reserve < len); 7572 dio_data->reserve -= len; 7573 dio_data->unsubmitted_oe_range_end = start + len; 7574 current->journal_info = dio_data; 7575 out: 7576 return ret; 7577 } 7578 7579 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7580 struct buffer_head *bh_result, int create) 7581 { 7582 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7583 struct extent_map *em; 7584 struct extent_state *cached_state = NULL; 7585 struct btrfs_dio_data *dio_data = NULL; 7586 u64 start = iblock << inode->i_blkbits; 7587 u64 lockstart, lockend; 7588 u64 len = bh_result->b_size; 7589 int unlock_bits = EXTENT_LOCKED; 7590 int ret = 0; 7591 7592 if (create) 7593 unlock_bits |= EXTENT_DIRTY; 7594 else 7595 len = min_t(u64, len, fs_info->sectorsize); 7596 7597 lockstart = start; 7598 lockend = start + len - 1; 7599 7600 if (current->journal_info) { 7601 /* 7602 * Need to pull our outstanding extents and set journal_info to NULL so 7603 * that anything that needs to check if there's a transaction doesn't get 7604 * confused. 7605 */ 7606 dio_data = current->journal_info; 7607 current->journal_info = NULL; 7608 } 7609 7610 /* 7611 * If this errors out it's because we couldn't invalidate pagecache for 7612 * this range and we need to fallback to buffered. 
7613 */ 7614 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, 7615 create)) { 7616 ret = -ENOTBLK; 7617 goto err; 7618 } 7619 7620 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); 7621 if (IS_ERR(em)) { 7622 ret = PTR_ERR(em); 7623 goto unlock_err; 7624 } 7625 7626 /* 7627 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7628 * io. INLINE is special, and we could probably kludge it in here, but 7629 * it's still buffered so for safety lets just fall back to the generic 7630 * buffered path. 7631 * 7632 * For COMPRESSED we _have_ to read the entire extent in so we can 7633 * decompress it, so there will be buffering required no matter what we 7634 * do, so go ahead and fallback to buffered. 7635 * 7636 * We return -ENOTBLK because that's what makes DIO go ahead and go back 7637 * to buffered IO. Don't blame me, this is the price we pay for using 7638 * the generic code. 7639 */ 7640 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7641 em->block_start == EXTENT_MAP_INLINE) { 7642 free_extent_map(em); 7643 ret = -ENOTBLK; 7644 goto unlock_err; 7645 } 7646 7647 if (create) { 7648 ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, 7649 dio_data, start, len); 7650 if (ret < 0) 7651 goto unlock_err; 7652 7653 /* clear and unlock the entire range */ 7654 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7655 unlock_bits, 1, 0, &cached_state); 7656 } else { 7657 ret = btrfs_get_blocks_direct_read(em, bh_result, inode, 7658 start, len); 7659 /* Can be negative only if we read from a hole */ 7660 if (ret < 0) { 7661 ret = 0; 7662 free_extent_map(em); 7663 goto unlock_err; 7664 } 7665 /* 7666 * We need to unlock only the end area that we aren't using. 7667 * The rest is going to be unlocked by the endio routine. 7668 */ 7669 lockstart = start + bh_result->b_size; 7670 if (lockstart < lockend) { 7671 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7672 lockend, unlock_bits, 1, 0, 7673 &cached_state); 7674 } else { 7675 free_extent_state(cached_state); 7676 } 7677 } 7678 7679 free_extent_map(em); 7680 7681 return 0; 7682 7683 unlock_err: 7684 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7685 unlock_bits, 1, 0, &cached_state); 7686 err: 7687 if (dio_data) 7688 current->journal_info = dio_data; 7689 return ret; 7690 } 7691 7692 static inline blk_status_t submit_dio_repair_bio(struct inode *inode, 7693 struct bio *bio, 7694 int mirror_num) 7695 { 7696 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7697 blk_status_t ret; 7698 7699 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 7700 7701 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); 7702 if (ret) 7703 return ret; 7704 7705 ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); 7706 7707 return ret; 7708 } 7709 7710 static int btrfs_check_dio_repairable(struct inode *inode, 7711 struct bio *failed_bio, 7712 struct io_failure_record *failrec, 7713 int failed_mirror) 7714 { 7715 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7716 int num_copies; 7717 7718 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 7719 if (num_copies == 1) { 7720 /* 7721 * we only have a single copy of the data, so don't bother with 7722 * all the retry and error correction code that follows. no 7723 * matter what the error is, it is very likely to persist. 
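* (num_copies == 1 means there is no second mirror to read from, e.g. data stored with the SINGLE or RAID0 profiles.)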
7724 */ 7725 btrfs_debug(fs_info, 7726 "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 7727 num_copies, failrec->this_mirror, failed_mirror); 7728 return 0; 7729 } 7730 7731 failrec->failed_mirror = failed_mirror; 7732 failrec->this_mirror++; 7733 if (failrec->this_mirror == failed_mirror) 7734 failrec->this_mirror++; 7735 7736 if (failrec->this_mirror > num_copies) { 7737 btrfs_debug(fs_info, 7738 "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 7739 num_copies, failrec->this_mirror, failed_mirror); 7740 return 0; 7741 } 7742 7743 return 1; 7744 } 7745 7746 static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, 7747 struct page *page, unsigned int pgoff, 7748 u64 start, u64 end, int failed_mirror, 7749 bio_end_io_t *repair_endio, void *repair_arg) 7750 { 7751 struct io_failure_record *failrec; 7752 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7753 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 7754 struct bio *bio; 7755 int isector; 7756 unsigned int read_mode = 0; 7757 int segs; 7758 int ret; 7759 blk_status_t status; 7760 struct bio_vec bvec; 7761 7762 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 7763 7764 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 7765 if (ret) 7766 return errno_to_blk_status(ret); 7767 7768 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, 7769 failed_mirror); 7770 if (!ret) { 7771 free_io_failure(failure_tree, io_tree, failrec); 7772 return BLK_STS_IOERR; 7773 } 7774 7775 segs = bio_segments(failed_bio); 7776 bio_get_first_bvec(failed_bio, &bvec); 7777 if (segs > 1 || 7778 (bvec.bv_len > btrfs_inode_sectorsize(inode))) 7779 read_mode |= REQ_FAILFAST_DEV; 7780 7781 isector = start - btrfs_io_bio(failed_bio)->logical; 7782 isector >>= inode->i_sb->s_blocksize_bits; 7783 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7784 pgoff, isector, repair_endio, repair_arg); 7785 bio->bi_opf = REQ_OP_READ | read_mode; 7786 7787 btrfs_debug(BTRFS_I(inode)->root->fs_info, 7788 "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", 7789 read_mode, failrec->this_mirror, failrec->in_validation); 7790 7791 status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); 7792 if (status) { 7793 free_io_failure(failure_tree, io_tree, failrec); 7794 bio_put(bio); 7795 } 7796 7797 return status; 7798 } 7799 7800 struct btrfs_retry_complete { 7801 struct completion done; 7802 struct inode *inode; 7803 u64 start; 7804 int uptodate; 7805 }; 7806 7807 static void btrfs_retry_endio_nocsum(struct bio *bio) 7808 { 7809 struct btrfs_retry_complete *done = bio->bi_private; 7810 struct inode *inode = done->inode; 7811 struct bio_vec *bvec; 7812 struct extent_io_tree *io_tree, *failure_tree; 7813 int i; 7814 7815 if (bio->bi_status) 7816 goto end; 7817 7818 ASSERT(bio->bi_vcnt == 1); 7819 io_tree = &BTRFS_I(inode)->io_tree; 7820 failure_tree = &BTRFS_I(inode)->io_failure_tree; 7821 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); 7822 7823 done->uptodate = 1; 7824 ASSERT(!bio_flagged(bio, BIO_CLONED)); 7825 bio_for_each_segment_all(bvec, bio, i) 7826 clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, 7827 io_tree, done->start, bvec->bv_page, 7828 btrfs_ino(BTRFS_I(inode)), 0); 7829 end: 7830 complete(&done->done); 7831 bio_put(bio); 7832 } 7833 7834 static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, 7835 struct btrfs_io_bio *io_bio) 
7836 { 7837 struct btrfs_fs_info *fs_info; 7838 struct bio_vec bvec; 7839 struct bvec_iter iter; 7840 struct btrfs_retry_complete done; 7841 u64 start; 7842 unsigned int pgoff; 7843 u32 sectorsize; 7844 int nr_sectors; 7845 blk_status_t ret; 7846 blk_status_t err = BLK_STS_OK; 7847 7848 fs_info = BTRFS_I(inode)->root->fs_info; 7849 sectorsize = fs_info->sectorsize; 7850 7851 start = io_bio->logical; 7852 done.inode = inode; 7853 io_bio->bio.bi_iter = io_bio->iter; 7854 7855 bio_for_each_segment(bvec, &io_bio->bio, iter) { 7856 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); 7857 pgoff = bvec.bv_offset; 7858 7859 next_block_or_try_again: 7860 done.uptodate = 0; 7861 done.start = start; 7862 init_completion(&done.done); 7863 7864 ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, 7865 pgoff, start, start + sectorsize - 1, 7866 io_bio->mirror_num, 7867 btrfs_retry_endio_nocsum, &done); 7868 if (ret) { 7869 err = ret; 7870 goto next; 7871 } 7872 7873 wait_for_completion_io(&done.done); 7874 7875 if (!done.uptodate) { 7876 /* We might have another mirror, so try again */ 7877 goto next_block_or_try_again; 7878 } 7879 7880 next: 7881 start += sectorsize; 7882 7883 nr_sectors--; 7884 if (nr_sectors) { 7885 pgoff += sectorsize; 7886 ASSERT(pgoff < PAGE_SIZE); 7887 goto next_block_or_try_again; 7888 } 7889 } 7890 7891 return err; 7892 } 7893 7894 static void btrfs_retry_endio(struct bio *bio) 7895 { 7896 struct btrfs_retry_complete *done = bio->bi_private; 7897 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7898 struct extent_io_tree *io_tree, *failure_tree; 7899 struct inode *inode = done->inode; 7900 struct bio_vec *bvec; 7901 int uptodate; 7902 int ret; 7903 int i; 7904 7905 if (bio->bi_status) 7906 goto end; 7907 7908 uptodate = 1; 7909 7910 ASSERT(bio->bi_vcnt == 1); 7911 ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); 7912 7913 io_tree = &BTRFS_I(inode)->io_tree; 7914 failure_tree = &BTRFS_I(inode)->io_failure_tree; 7915 7916 ASSERT(!bio_flagged(bio, BIO_CLONED)); 7917 bio_for_each_segment_all(bvec, bio, i) { 7918 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 7919 bvec->bv_offset, done->start, 7920 bvec->bv_len); 7921 if (!ret) 7922 clean_io_failure(BTRFS_I(inode)->root->fs_info, 7923 failure_tree, io_tree, done->start, 7924 bvec->bv_page, 7925 btrfs_ino(BTRFS_I(inode)), 7926 bvec->bv_offset); 7927 else 7928 uptodate = 0; 7929 } 7930 7931 done->uptodate = uptodate; 7932 end: 7933 complete(&done->done); 7934 bio_put(bio); 7935 } 7936 7937 static blk_status_t __btrfs_subio_endio_read(struct inode *inode, 7938 struct btrfs_io_bio *io_bio, blk_status_t err) 7939 { 7940 struct btrfs_fs_info *fs_info; 7941 struct bio_vec bvec; 7942 struct bvec_iter iter; 7943 struct btrfs_retry_complete done; 7944 u64 start; 7945 u64 offset = 0; 7946 u32 sectorsize; 7947 int nr_sectors; 7948 unsigned int pgoff; 7949 int csum_pos; 7950 bool uptodate = (err == 0); 7951 int ret; 7952 blk_status_t status; 7953 7954 fs_info = BTRFS_I(inode)->root->fs_info; 7955 sectorsize = fs_info->sectorsize; 7956 7957 err = BLK_STS_OK; 7958 start = io_bio->logical; 7959 done.inode = inode; 7960 io_bio->bio.bi_iter = io_bio->iter; 7961 7962 bio_for_each_segment(bvec, &io_bio->bio, iter) { 7963 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); 7964 7965 pgoff = bvec.bv_offset; 7966 next_block: 7967 if (uptodate) { 7968 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); 7969 ret = __readpage_endio_check(inode, io_bio, csum_pos, 7970 bvec.bv_page, pgoff, start, 
sectorsize); 7971 if (likely(!ret)) 7972 goto next; 7973 } 7974 try_again: 7975 done.uptodate = 0; 7976 done.start = start; 7977 init_completion(&done.done); 7978 7979 status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, 7980 pgoff, start, start + sectorsize - 1, 7981 io_bio->mirror_num, btrfs_retry_endio, 7982 &done); 7983 if (status) { 7984 err = status; 7985 goto next; 7986 } 7987 7988 wait_for_completion_io(&done.done); 7989 7990 if (!done.uptodate) { 7991 /* We might have another mirror, so try again */ 7992 goto try_again; 7993 } 7994 next: 7995 offset += sectorsize; 7996 start += sectorsize; 7997 7998 ASSERT(nr_sectors); 7999 8000 nr_sectors--; 8001 if (nr_sectors) { 8002 pgoff += sectorsize; 8003 ASSERT(pgoff < PAGE_SIZE); 8004 goto next_block; 8005 } 8006 } 8007 8008 return err; 8009 } 8010 8011 static blk_status_t btrfs_subio_endio_read(struct inode *inode, 8012 struct btrfs_io_bio *io_bio, blk_status_t err) 8013 { 8014 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 8015 8016 if (skip_csum) { 8017 if (unlikely(err)) 8018 return __btrfs_correct_data_nocsum(inode, io_bio); 8019 else 8020 return BLK_STS_OK; 8021 } else { 8022 return __btrfs_subio_endio_read(inode, io_bio, err); 8023 } 8024 } 8025 8026 static void btrfs_endio_direct_read(struct bio *bio) 8027 { 8028 struct btrfs_dio_private *dip = bio->bi_private; 8029 struct inode *inode = dip->inode; 8030 struct bio *dio_bio; 8031 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8032 blk_status_t err = bio->bi_status; 8033 8034 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) 8035 err = btrfs_subio_endio_read(inode, io_bio, err); 8036 8037 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 8038 dip->logical_offset + dip->bytes - 1); 8039 dio_bio = dip->dio_bio; 8040 8041 kfree(dip); 8042 8043 dio_bio->bi_status = err; 8044 dio_end_io(dio_bio); 8045 8046 if (io_bio->end_io) 8047 io_bio->end_io(io_bio, blk_status_to_errno(err)); 8048 bio_put(bio); 8049 } 8050 8051 static void __endio_write_update_ordered(struct inode *inode, 8052 const u64 offset, const u64 bytes, 8053 const bool uptodate) 8054 { 8055 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8056 struct btrfs_ordered_extent *ordered = NULL; 8057 struct btrfs_workqueue *wq; 8058 btrfs_work_func_t func; 8059 u64 ordered_offset = offset; 8060 u64 ordered_bytes = bytes; 8061 u64 last_offset; 8062 8063 if (btrfs_is_free_space_inode(BTRFS_I(inode))) { 8064 wq = fs_info->endio_freespace_worker; 8065 func = btrfs_freespace_write_helper; 8066 } else { 8067 wq = fs_info->endio_write_workers; 8068 func = btrfs_endio_write_helper; 8069 } 8070 8071 while (ordered_offset < offset + bytes) { 8072 last_offset = ordered_offset; 8073 if (btrfs_dec_test_first_ordered_pending(inode, &ordered, 8074 &ordered_offset, 8075 ordered_bytes, 8076 uptodate)) { 8077 btrfs_init_work(&ordered->work, func, 8078 finish_ordered_fn, 8079 NULL, NULL); 8080 btrfs_queue_work(wq, &ordered->work); 8081 } 8082 /* 8083 * If btrfs_dec_test_ordered_pending does not find any ordered 8084 * extent in the range, we can exit. 8085 */ 8086 if (ordered_offset == last_offset) 8087 return; 8088 /* 8089 * Our bio might span multiple ordered extents. In this case 8090 * we keep goin until we have accounted the whole dio. 
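*
* As an illustration (arbitrary numbers): a 256K direct write at
* offset 0 that was split across two 128K ordered extents is accounted
* in two passes here - the first pass advances ordered_offset to 128K,
* then ordered_bytes is trimmed to the remaining 128K and the loop
* runs once more for the second extent.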
8091 */ 8092 if (ordered_offset < offset + bytes) { 8093 ordered_bytes = offset + bytes - ordered_offset; 8094 ordered = NULL; 8095 } 8096 } 8097 } 8098 8099 static void btrfs_endio_direct_write(struct bio *bio) 8100 { 8101 struct btrfs_dio_private *dip = bio->bi_private; 8102 struct bio *dio_bio = dip->dio_bio; 8103 8104 __endio_write_update_ordered(dip->inode, dip->logical_offset, 8105 dip->bytes, !bio->bi_status); 8106 8107 kfree(dip); 8108 8109 dio_bio->bi_status = bio->bi_status; 8110 dio_end_io(dio_bio); 8111 bio_put(bio); 8112 } 8113 8114 static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, 8115 struct bio *bio, u64 offset) 8116 { 8117 struct inode *inode = private_data; 8118 blk_status_t ret; 8119 ret = btrfs_csum_one_bio(inode, bio, offset, 1); 8120 BUG_ON(ret); /* -ENOMEM */ 8121 return 0; 8122 } 8123 8124 static void btrfs_end_dio_bio(struct bio *bio) 8125 { 8126 struct btrfs_dio_private *dip = bio->bi_private; 8127 blk_status_t err = bio->bi_status; 8128 8129 if (err) 8130 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 8131 "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", 8132 btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), 8133 bio->bi_opf, 8134 (unsigned long long)bio->bi_iter.bi_sector, 8135 bio->bi_iter.bi_size, err); 8136 8137 if (dip->subio_endio) 8138 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); 8139 8140 if (err) { 8141 /* 8142 * We want to perceive the errors flag being set before 8143 * decrementing the reference count. We don't need a barrier 8144 * since atomic operations with a return value are fully 8145 * ordered as per atomic_t.txt 8146 */ 8147 dip->errors = 1; 8148 } 8149 8150 /* if there are more bios still pending for this dio, just exit */ 8151 if (!atomic_dec_and_test(&dip->pending_bios)) 8152 goto out; 8153 8154 if (dip->errors) { 8155 bio_io_error(dip->orig_bio); 8156 } else { 8157 dip->dio_bio->bi_status = BLK_STS_OK; 8158 bio_endio(dip->orig_bio); 8159 } 8160 out: 8161 bio_put(bio); 8162 } 8163 8164 static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, 8165 struct btrfs_dio_private *dip, 8166 struct bio *bio, 8167 u64 file_offset) 8168 { 8169 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8170 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); 8171 blk_status_t ret; 8172 8173 /* 8174 * We load all the csum data we need when we submit 8175 * the first bio to reduce the csum tree search and 8176 * contention. 8177 */ 8178 if (dip->logical_offset == file_offset) { 8179 ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, 8180 file_offset); 8181 if (ret) 8182 return ret; 8183 } 8184 8185 if (bio == dip->orig_bio) 8186 return 0; 8187 8188 file_offset -= dip->logical_offset; 8189 file_offset >>= inode->i_sb->s_blocksize_bits; 8190 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); 8191 8192 return 0; 8193 } 8194 8195 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, 8196 struct inode *inode, u64 file_offset, int async_submit) 8197 { 8198 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8199 struct btrfs_dio_private *dip = bio->bi_private; 8200 bool write = bio_op(bio) == REQ_OP_WRITE; 8201 blk_status_t ret; 8202 8203 /* Check btrfs_submit_bio_hook() for rules about async submit. 
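* In short: only writes use the async path here, and it is skipped
* while the inode has synchronous writers (the sync_writers check
* below), in which case the checksum is computed inline via
* btrfs_csum_one_bio() instead of being handed off to the csum
* workers.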
*/ 8204 if (async_submit) 8205 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 8206 8207 if (!write) { 8208 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 8209 if (ret) 8210 goto err; 8211 } 8212 8213 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 8214 goto map; 8215 8216 if (write && async_submit) { 8217 ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, 8218 file_offset, inode, 8219 btrfs_submit_bio_start_direct_io); 8220 goto err; 8221 } else if (write) { 8222 /* 8223 * If we aren't doing async submit, calculate the csum of the 8224 * bio now. 8225 */ 8226 ret = btrfs_csum_one_bio(inode, bio, file_offset, 1); 8227 if (ret) 8228 goto err; 8229 } else { 8230 ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, 8231 file_offset); 8232 if (ret) 8233 goto err; 8234 } 8235 map: 8236 ret = btrfs_map_bio(fs_info, bio, 0, 0); 8237 err: 8238 return ret; 8239 } 8240 8241 static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) 8242 { 8243 struct inode *inode = dip->inode; 8244 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8245 struct bio *bio; 8246 struct bio *orig_bio = dip->orig_bio; 8247 u64 start_sector = orig_bio->bi_iter.bi_sector; 8248 u64 file_offset = dip->logical_offset; 8249 u64 map_length; 8250 int async_submit = 0; 8251 u64 submit_len; 8252 int clone_offset = 0; 8253 int clone_len; 8254 int ret; 8255 blk_status_t status; 8256 8257 map_length = orig_bio->bi_iter.bi_size; 8258 submit_len = map_length; 8259 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, 8260 &map_length, NULL, 0); 8261 if (ret) 8262 return -EIO; 8263 8264 if (map_length >= submit_len) { 8265 bio = orig_bio; 8266 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 8267 goto submit; 8268 } 8269 8270 /* async crcs make it difficult to collect full stripe writes. */ 8271 if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) 8272 async_submit = 0; 8273 else 8274 async_submit = 1; 8275 8276 /* bio split */ 8277 ASSERT(map_length <= INT_MAX); 8278 atomic_inc(&dip->pending_bios); 8279 do { 8280 clone_len = min_t(int, submit_len, map_length); 8281 8282 /* 8283 * This will never fail as it's passing GPF_NOFS and 8284 * the allocation is backed by btrfs_bioset. 8285 */ 8286 bio = btrfs_bio_clone_partial(orig_bio, clone_offset, 8287 clone_len); 8288 bio->bi_private = dip; 8289 bio->bi_end_io = btrfs_end_dio_bio; 8290 btrfs_io_bio(bio)->logical = file_offset; 8291 8292 ASSERT(submit_len >= clone_len); 8293 submit_len -= clone_len; 8294 if (submit_len == 0) 8295 break; 8296 8297 /* 8298 * Increase the count before we submit the bio so we know 8299 * the end IO handler won't happen before we increase the 8300 * count. Otherwise, the dip might get freed before we're 8301 * done setting it up. 
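*
* Illustrative ordering without this: we submit clone A and only then
* bump pending_bios; A completes so quickly that its end_io sees the
* old count, drops it to zero and frees the dip while we are still
* cloning the next bio. Taking the reference before submitting closes
* that window.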
8302 */ 8303 atomic_inc(&dip->pending_bios); 8304 8305 status = btrfs_submit_dio_bio(bio, inode, file_offset, 8306 async_submit); 8307 if (status) { 8308 bio_put(bio); 8309 atomic_dec(&dip->pending_bios); 8310 goto out_err; 8311 } 8312 8313 clone_offset += clone_len; 8314 start_sector += clone_len >> 9; 8315 file_offset += clone_len; 8316 8317 map_length = submit_len; 8318 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), 8319 start_sector << 9, &map_length, NULL, 0); 8320 if (ret) 8321 goto out_err; 8322 } while (submit_len > 0); 8323 8324 submit: 8325 status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); 8326 if (!status) 8327 return 0; 8328 8329 bio_put(bio); 8330 out_err: 8331 dip->errors = 1; 8332 /* 8333 * Before atomic variable goto zero, we must make sure dip->errors is 8334 * perceived to be set. This ordering is ensured by the fact that an 8335 * atomic operations with a return value are fully ordered as per 8336 * atomic_t.txt 8337 */ 8338 if (atomic_dec_and_test(&dip->pending_bios)) 8339 bio_io_error(dip->orig_bio); 8340 8341 /* bio_end_io() will handle error, so we needn't return it */ 8342 return 0; 8343 } 8344 8345 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, 8346 loff_t file_offset) 8347 { 8348 struct btrfs_dio_private *dip = NULL; 8349 struct bio *bio = NULL; 8350 struct btrfs_io_bio *io_bio; 8351 bool write = (bio_op(dio_bio) == REQ_OP_WRITE); 8352 int ret = 0; 8353 8354 bio = btrfs_bio_clone(dio_bio); 8355 8356 dip = kzalloc(sizeof(*dip), GFP_NOFS); 8357 if (!dip) { 8358 ret = -ENOMEM; 8359 goto free_ordered; 8360 } 8361 8362 dip->private = dio_bio->bi_private; 8363 dip->inode = inode; 8364 dip->logical_offset = file_offset; 8365 dip->bytes = dio_bio->bi_iter.bi_size; 8366 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 8367 bio->bi_private = dip; 8368 dip->orig_bio = bio; 8369 dip->dio_bio = dio_bio; 8370 atomic_set(&dip->pending_bios, 0); 8371 io_bio = btrfs_io_bio(bio); 8372 io_bio->logical = file_offset; 8373 8374 if (write) { 8375 bio->bi_end_io = btrfs_endio_direct_write; 8376 } else { 8377 bio->bi_end_io = btrfs_endio_direct_read; 8378 dip->subio_endio = btrfs_subio_endio_read; 8379 } 8380 8381 /* 8382 * Reset the range for unsubmitted ordered extents (to a 0 length range) 8383 * even if we fail to submit a bio, because in such case we do the 8384 * corresponding error handling below and it must not be done a second 8385 * time by btrfs_direct_IO(). 8386 */ 8387 if (write) { 8388 struct btrfs_dio_data *dio_data = current->journal_info; 8389 8390 dio_data->unsubmitted_oe_range_end = dip->logical_offset + 8391 dip->bytes; 8392 dio_data->unsubmitted_oe_range_start = 8393 dio_data->unsubmitted_oe_range_end; 8394 } 8395 8396 ret = btrfs_submit_direct_hook(dip); 8397 if (!ret) 8398 return; 8399 8400 if (io_bio->end_io) 8401 io_bio->end_io(io_bio, ret); 8402 8403 free_ordered: 8404 /* 8405 * If we arrived here it means either we failed to submit the dip 8406 * or we either failed to clone the dio_bio or failed to allocate the 8407 * dip. If we cloned the dio_bio and allocated the dip, we can just 8408 * call bio_endio against our io_bio so that we get proper resource 8409 * cleanup if we fail to submit the dip, otherwise, we must do the 8410 * same as btrfs_endio_direct_[write|read] because we can't call these 8411 * callbacks - they require an allocated dip and a clone of dio_bio. 
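*
* Concretely: if btrfs_submit_direct_hook() failed (for example the
* very first btrfs_map_block() call returned an error), both bio and
* dip exist and the first branch below lets the normal end_io
* callbacks tear everything down; if the kzalloc of the dip failed,
* only the cloned bio exists and we take the second branch and unwind
* by hand.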
8412 */ 8413 if (bio && dip) { 8414 bio_io_error(bio); 8415 /* 8416 * The end io callbacks free our dip, do the final put on bio 8417 * and all the cleanup and final put for dio_bio (through 8418 * dio_end_io()). 8419 */ 8420 dip = NULL; 8421 bio = NULL; 8422 } else { 8423 if (write) 8424 __endio_write_update_ordered(inode, 8425 file_offset, 8426 dio_bio->bi_iter.bi_size, 8427 false); 8428 else 8429 unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 8430 file_offset + dio_bio->bi_iter.bi_size - 1); 8431 8432 dio_bio->bi_status = BLK_STS_IOERR; 8433 /* 8434 * Releases and cleans up our dio_bio, no need to bio_put() 8435 * nor bio_endio()/bio_io_error() against dio_bio. 8436 */ 8437 dio_end_io(dio_bio); 8438 } 8439 if (bio) 8440 bio_put(bio); 8441 kfree(dip); 8442 } 8443 8444 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 8445 const struct iov_iter *iter, loff_t offset) 8446 { 8447 int seg; 8448 int i; 8449 unsigned int blocksize_mask = fs_info->sectorsize - 1; 8450 ssize_t retval = -EINVAL; 8451 8452 if (offset & blocksize_mask) 8453 goto out; 8454 8455 if (iov_iter_alignment(iter) & blocksize_mask) 8456 goto out; 8457 8458 /* If this is a write we don't need to check anymore */ 8459 if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) 8460 return 0; 8461 /* 8462 * Check to make sure we don't have duplicate iov_base's in this 8463 * iovec, if so return EINVAL, otherwise we'll get csum errors 8464 * when reading back. 8465 */ 8466 for (seg = 0; seg < iter->nr_segs; seg++) { 8467 for (i = seg + 1; i < iter->nr_segs; i++) { 8468 if (iter->iov[seg].iov_base == iter->iov[i].iov_base) 8469 goto out; 8470 } 8471 } 8472 retval = 0; 8473 out: 8474 return retval; 8475 } 8476 8477 static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 8478 { 8479 struct file *file = iocb->ki_filp; 8480 struct inode *inode = file->f_mapping->host; 8481 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8482 struct btrfs_dio_data dio_data = { 0 }; 8483 struct extent_changeset *data_reserved = NULL; 8484 loff_t offset = iocb->ki_pos; 8485 size_t count = 0; 8486 int flags = 0; 8487 bool wakeup = true; 8488 bool relock = false; 8489 ssize_t ret; 8490 8491 if (check_direct_IO(fs_info, iter, offset)) 8492 return 0; 8493 8494 inode_dio_begin(inode); 8495 8496 /* 8497 * The generic stuff only does filemap_write_and_wait_range, which 8498 * isn't enough if we've written compressed pages to this area, so 8499 * we need to flush the dirty pages again to make absolutely sure 8500 * that any outstanding dirty pages are on disk. 8501 */ 8502 count = iov_iter_count(iter); 8503 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 8504 &BTRFS_I(inode)->runtime_flags)) 8505 filemap_fdatawrite_range(inode->i_mapping, offset, 8506 offset + count - 1); 8507 8508 if (iov_iter_rw(iter) == WRITE) { 8509 /* 8510 * If the write DIO is beyond the EOF, we need update 8511 * the isize, but it is protected by i_mutex. So we can 8512 * not unlock the i_mutex at this case. 8513 */ 8514 if (offset + count <= inode->i_size) { 8515 dio_data.overwrite = 1; 8516 inode_unlock(inode); 8517 relock = true; 8518 } else if (iocb->ki_flags & IOCB_NOWAIT) { 8519 ret = -EAGAIN; 8520 goto out; 8521 } 8522 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 8523 offset, count); 8524 if (ret) 8525 goto out; 8526 8527 /* 8528 * We need to know how many extents we reserved so that we can 8529 * do the accounting properly if we go over the number we 8530 * originally calculated. Abuse current->journal_info for this. 
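*
* Worked example (illustrative values): with a 4K sectorsize, a 6000
* byte write reserves round_up(6000, 4096) = 8192 bytes here. Each
* extent mapped in btrfs_get_blocks_direct_write() then subtracts its
* length from dio_data.reserve, and on error whatever is left over is
* handed back through btrfs_delalloc_release_space() further below.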
8531 */
8532 dio_data.reserve = round_up(count,
8533 fs_info->sectorsize);
8534 dio_data.unsubmitted_oe_range_start = (u64)offset;
8535 dio_data.unsubmitted_oe_range_end = (u64)offset;
8536 current->journal_info = &dio_data;
8537 down_read(&BTRFS_I(inode)->dio_sem);
8538 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8539 &BTRFS_I(inode)->runtime_flags)) {
8540 inode_dio_end(inode);
8541 flags = DIO_LOCKING | DIO_SKIP_HOLES;
8542 wakeup = false;
8543 }
8544
8545 ret = __blockdev_direct_IO(iocb, inode,
8546 fs_info->fs_devices->latest_bdev,
8547 iter, btrfs_get_blocks_direct, NULL,
8548 btrfs_submit_direct, flags);
8549 if (iov_iter_rw(iter) == WRITE) {
8550 up_read(&BTRFS_I(inode)->dio_sem);
8551 current->journal_info = NULL;
8552 if (ret < 0 && ret != -EIOCBQUEUED) {
8553 if (dio_data.reserve)
8554 btrfs_delalloc_release_space(inode, data_reserved,
8555 offset, dio_data.reserve, true);
8556 /*
8557 * On error we might have left some ordered extents
8558 * without submitting corresponding bios for them, so
8559 * clean them up to avoid other tasks getting them
8560 * and waiting for them to complete forever.
8561 */
8562 if (dio_data.unsubmitted_oe_range_start <
8563 dio_data.unsubmitted_oe_range_end)
8564 __endio_write_update_ordered(inode,
8565 dio_data.unsubmitted_oe_range_start,
8566 dio_data.unsubmitted_oe_range_end -
8567 dio_data.unsubmitted_oe_range_start,
8568 false);
8569 } else if (ret >= 0 && (size_t)ret < count)
8570 btrfs_delalloc_release_space(inode, data_reserved,
8571 offset, count - (size_t)ret, true);
8572 btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
8573 }
8574 out:
8575 if (wakeup)
8576 inode_dio_end(inode);
8577 if (relock)
8578 inode_lock(inode);
8579
8580 extent_changeset_free(data_reserved);
8581 return ret;
8582 }
8583
8584 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
8585
8586 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8587 __u64 start, __u64 len)
8588 {
8589 int ret;
8590
8591 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8592 if (ret)
8593 return ret;
8594
8595 return extent_fiemap(inode, fieinfo, start, len);
8596 }
8597
8598 int btrfs_readpage(struct file *file, struct page *page)
8599 {
8600 struct extent_io_tree *tree;
8601 tree = &BTRFS_I(page->mapping->host)->io_tree;
8602 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8603 }
8604
8605 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8606 {
8607 struct inode *inode = page->mapping->host;
8608 int ret;
8609
8610 if (current->flags & PF_MEMALLOC) {
8611 redirty_page_for_writepage(wbc, page);
8612 unlock_page(page);
8613 return 0;
8614 }
8615
8616 /*
8617 * If we are under memory pressure we will call this directly from the
8618 * VM; we need to make sure we have the inode referenced for the ordered
8619 * extent. If not, just return like we didn't do anything.
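* (igrab() only fails when the inode is already being torn down; in
* that case redirtying the page and returning AOP_WRITEPAGE_ACTIVATE
* simply asks the VM to skip it for now.)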
8620 */ 8621 if (!igrab(inode)) { 8622 redirty_page_for_writepage(wbc, page); 8623 return AOP_WRITEPAGE_ACTIVATE; 8624 } 8625 ret = extent_write_full_page(page, wbc); 8626 btrfs_add_delayed_iput(inode); 8627 return ret; 8628 } 8629 8630 static int btrfs_writepages(struct address_space *mapping, 8631 struct writeback_control *wbc) 8632 { 8633 return extent_writepages(mapping, wbc); 8634 } 8635 8636 static int 8637 btrfs_readpages(struct file *file, struct address_space *mapping, 8638 struct list_head *pages, unsigned nr_pages) 8639 { 8640 return extent_readpages(mapping, pages, nr_pages); 8641 } 8642 8643 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8644 { 8645 int ret = try_release_extent_mapping(page, gfp_flags); 8646 if (ret == 1) { 8647 ClearPagePrivate(page); 8648 set_page_private(page, 0); 8649 put_page(page); 8650 } 8651 return ret; 8652 } 8653 8654 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8655 { 8656 if (PageWriteback(page) || PageDirty(page)) 8657 return 0; 8658 return __btrfs_releasepage(page, gfp_flags); 8659 } 8660 8661 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 8662 unsigned int length) 8663 { 8664 struct inode *inode = page->mapping->host; 8665 struct extent_io_tree *tree; 8666 struct btrfs_ordered_extent *ordered; 8667 struct extent_state *cached_state = NULL; 8668 u64 page_start = page_offset(page); 8669 u64 page_end = page_start + PAGE_SIZE - 1; 8670 u64 start; 8671 u64 end; 8672 int inode_evicting = inode->i_state & I_FREEING; 8673 8674 /* 8675 * we have the page locked, so new writeback can't start, 8676 * and the dirty bit won't be cleared while we are here. 8677 * 8678 * Wait for IO on this page so that we can safely clear 8679 * the PagePrivate2 bit and do ordered accounting 8680 */ 8681 wait_on_page_writeback(page); 8682 8683 tree = &BTRFS_I(inode)->io_tree; 8684 if (offset) { 8685 btrfs_releasepage(page, GFP_NOFS); 8686 return; 8687 } 8688 8689 if (!inode_evicting) 8690 lock_extent_bits(tree, page_start, page_end, &cached_state); 8691 again: 8692 start = page_start; 8693 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 8694 page_end - start + 1); 8695 if (ordered) { 8696 end = min(page_end, ordered->file_offset + ordered->len - 1); 8697 /* 8698 * IO on this page will never be started, so we need 8699 * to account for any ordered extents now 8700 */ 8701 if (!inode_evicting) 8702 clear_extent_bit(tree, start, end, 8703 EXTENT_DIRTY | EXTENT_DELALLOC | 8704 EXTENT_DELALLOC_NEW | 8705 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8706 EXTENT_DEFRAG, 1, 0, &cached_state); 8707 /* 8708 * whoever cleared the private bit is responsible 8709 * for the finish_ordered_io 8710 */ 8711 if (TestClearPagePrivate2(page)) { 8712 struct btrfs_ordered_inode_tree *tree; 8713 u64 new_len; 8714 8715 tree = &BTRFS_I(inode)->ordered_tree; 8716 8717 spin_lock_irq(&tree->lock); 8718 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8719 new_len = start - ordered->file_offset; 8720 if (new_len < ordered->truncated_len) 8721 ordered->truncated_len = new_len; 8722 spin_unlock_irq(&tree->lock); 8723 8724 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8725 start, 8726 end - start + 1, 1)) 8727 btrfs_finish_ordered_io(ordered); 8728 } 8729 btrfs_put_ordered_extent(ordered); 8730 if (!inode_evicting) { 8731 cached_state = NULL; 8732 lock_extent_bits(tree, start, end, 8733 &cached_state); 8734 } 8735 8736 start = end + 1; 8737 if (start < page_end) 8738 goto again; 8739 } 8740 8741 /* 8742 * Qgroup reserved space 
handler
8743 * Page here will be either
8744 * 1) Already written to disk
8745 * In this case, its reserved space is released from data rsv map
8746 * and will eventually be freed by the delayed_ref handler.
8747 * So even if we call qgroup_free_data(), it won't decrease the reserved
8748 * space.
8749 * 2) Not written to disk
8750 * This means the reserved space should be freed here. However,
8751 * if a truncate invalidates the page (by clearing PageDirty)
8752 * and the page is accounted for while allocating the extent
8753 * in btrfs_check_data_free_space(), we let the delayed_ref handler
8754 * free the entire extent.
8755 */
8756 if (PageDirty(page))
8757 btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
8758 if (!inode_evicting) {
8759 clear_extent_bit(tree, page_start, page_end,
8760 EXTENT_LOCKED | EXTENT_DIRTY |
8761 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
8762 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
8763 &cached_state);
8764
8765 __btrfs_releasepage(page, GFP_NOFS);
8766 }
8767
8768 ClearPageChecked(page);
8769 if (PagePrivate(page)) {
8770 ClearPagePrivate(page);
8771 set_page_private(page, 0);
8772 put_page(page);
8773 }
8774 }
8775
8776 /*
8777 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8778 * called from a page fault handler when a page is first dirtied. Hence we must
8779 * be careful to check for EOF conditions here. We set the page up correctly
8780 * for a written page which means we get ENOSPC checking when writing into
8781 * holes and correct delalloc and unwritten extent mapping on filesystems that
8782 * support these features.
8783 *
8784 * We are not allowed to take the i_mutex here so we have to play games to
8785 * protect against truncate races as the page could now be beyond EOF. Because
8786 * truncate_setsize() writes the inode size before removing pages, once we have
8787 * the page lock we can determine safely if the page is beyond EOF. If it is not
8788 * beyond EOF, then the page is guaranteed safe against truncation until we
8789 * unlock the page.
8790 */
8791 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8792 {
8793 struct page *page = vmf->page;
8794 struct inode *inode = file_inode(vmf->vma->vm_file);
8795 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8796 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8797 struct btrfs_ordered_extent *ordered;
8798 struct extent_state *cached_state = NULL;
8799 struct extent_changeset *data_reserved = NULL;
8800 char *kaddr;
8801 unsigned long zero_start;
8802 loff_t size;
8803 vm_fault_t ret;
8804 int ret2;
8805 int reserved = 0;
8806 u64 reserved_space;
8807 u64 page_start;
8808 u64 page_end;
8809 u64 end;
8810
8811 reserved_space = PAGE_SIZE;
8812
8813 sb_start_pagefault(inode->i_sb);
8814 page_start = page_offset(page);
8815 page_end = page_start + PAGE_SIZE - 1;
8816 end = page_end;
8817
8818 /*
8819 * Reserving delalloc space after obtaining the page lock can lead to
8820 * deadlock. For example, if a dirty page is locked by this function
8821 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8822 * dirty page write out, then the btrfs_writepage() function could
8823 * end up waiting indefinitely to get a lock on the page currently
8824 * being processed by the btrfs_page_mkwrite() function.
8825 */ 8826 ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, 8827 reserved_space); 8828 if (!ret2) { 8829 ret2 = file_update_time(vmf->vma->vm_file); 8830 reserved = 1; 8831 } 8832 if (ret2) { 8833 ret = vmf_error(ret2); 8834 if (reserved) 8835 goto out; 8836 goto out_noreserve; 8837 } 8838 8839 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 8840 again: 8841 lock_page(page); 8842 size = i_size_read(inode); 8843 8844 if ((page->mapping != inode->i_mapping) || 8845 (page_start >= size)) { 8846 /* page got truncated out from underneath us */ 8847 goto out_unlock; 8848 } 8849 wait_on_page_writeback(page); 8850 8851 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 8852 set_page_extent_mapped(page); 8853 8854 /* 8855 * we can't set the delalloc bits if there are pending ordered 8856 * extents. Drop our locks and wait for them to finish 8857 */ 8858 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, 8859 PAGE_SIZE); 8860 if (ordered) { 8861 unlock_extent_cached(io_tree, page_start, page_end, 8862 &cached_state); 8863 unlock_page(page); 8864 btrfs_start_ordered_extent(inode, ordered, 1); 8865 btrfs_put_ordered_extent(ordered); 8866 goto again; 8867 } 8868 8869 if (page->index == ((size - 1) >> PAGE_SHIFT)) { 8870 reserved_space = round_up(size - page_start, 8871 fs_info->sectorsize); 8872 if (reserved_space < PAGE_SIZE) { 8873 end = page_start + reserved_space - 1; 8874 btrfs_delalloc_release_space(inode, data_reserved, 8875 page_start, PAGE_SIZE - reserved_space, 8876 true); 8877 } 8878 } 8879 8880 /* 8881 * page_mkwrite gets called when the page is firstly dirtied after it's 8882 * faulted in, but write(2) could also dirty a page and set delalloc 8883 * bits, thus in this case for space account reason, we still need to 8884 * clear any delalloc bits within this page range since we have to 8885 * reserve data&meta space before lock_page() (see above comments). 
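*
* Illustrative case: a buffered write(2) already marked this page
* delalloc (with its own reservation) before the fault; we reserved
* another PAGE_SIZE above, so without dropping the old bits first the
* range would end up accounted twice when it is re-marked delalloc
* right below.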
8886 */ 8887 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 8888 EXTENT_DIRTY | EXTENT_DELALLOC | 8889 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 8890 0, 0, &cached_state); 8891 8892 ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, 8893 &cached_state, 0); 8894 if (ret2) { 8895 unlock_extent_cached(io_tree, page_start, page_end, 8896 &cached_state); 8897 ret = VM_FAULT_SIGBUS; 8898 goto out_unlock; 8899 } 8900 ret2 = 0; 8901 8902 /* page is wholly or partially inside EOF */ 8903 if (page_start + PAGE_SIZE > size) 8904 zero_start = size & ~PAGE_MASK; 8905 else 8906 zero_start = PAGE_SIZE; 8907 8908 if (zero_start != PAGE_SIZE) { 8909 kaddr = kmap(page); 8910 memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); 8911 flush_dcache_page(page); 8912 kunmap(page); 8913 } 8914 ClearPageChecked(page); 8915 set_page_dirty(page); 8916 SetPageUptodate(page); 8917 8918 BTRFS_I(inode)->last_trans = fs_info->generation; 8919 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 8920 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 8921 8922 unlock_extent_cached(io_tree, page_start, page_end, &cached_state); 8923 8924 if (!ret2) { 8925 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); 8926 sb_end_pagefault(inode->i_sb); 8927 extent_changeset_free(data_reserved); 8928 return VM_FAULT_LOCKED; 8929 } 8930 8931 out_unlock: 8932 unlock_page(page); 8933 out: 8934 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); 8935 btrfs_delalloc_release_space(inode, data_reserved, page_start, 8936 reserved_space, (ret != 0)); 8937 out_noreserve: 8938 sb_end_pagefault(inode->i_sb); 8939 extent_changeset_free(data_reserved); 8940 return ret; 8941 } 8942 8943 static int btrfs_truncate(struct inode *inode, bool skip_writeback) 8944 { 8945 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8946 struct btrfs_root *root = BTRFS_I(inode)->root; 8947 struct btrfs_block_rsv *rsv; 8948 int ret; 8949 struct btrfs_trans_handle *trans; 8950 u64 mask = fs_info->sectorsize - 1; 8951 u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); 8952 8953 if (!skip_writeback) { 8954 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), 8955 (u64)-1); 8956 if (ret) 8957 return ret; 8958 } 8959 8960 /* 8961 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of 8962 * things going on here: 8963 * 8964 * 1) We need to reserve space to update our inode. 8965 * 8966 * 2) We need to have something to cache all the space that is going to 8967 * be free'd up by the truncate operation, but also have some slack 8968 * space reserved in case it uses space during the truncate (thank you 8969 * very much snapshotting). 8970 * 8971 * And we need these to be separate. The fact is we can use a lot of 8972 * space doing the truncate, and we have no earthly idea how much space 8973 * we will use, so we need the truncate reservation to be separate so it 8974 * doesn't end up using space reserved for updating the inode. We also 8975 * need to be able to stop the transaction and start a new one, which 8976 * means we need to be able to update the inode several times, and we 8977 * have no idea of knowing how many times that will be, so we can't just 8978 * reserve 1 item for the entirety of the operation, so that has to be 8979 * done separately as well. 8980 * 8981 * So that leaves us with 8982 * 8983 * 1) rsv - for the truncate reservation, which we will steal from the 8984 * transaction reservation. 
8985 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for 8986 * updating the inode. 8987 */ 8988 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 8989 if (!rsv) 8990 return -ENOMEM; 8991 rsv->size = min_size; 8992 rsv->failfast = 1; 8993 8994 /* 8995 * 1 for the truncate slack space 8996 * 1 for updating the inode. 8997 */ 8998 trans = btrfs_start_transaction(root, 2); 8999 if (IS_ERR(trans)) { 9000 ret = PTR_ERR(trans); 9001 goto out; 9002 } 9003 9004 /* Migrate the slack space for the truncate to our reserve */ 9005 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 9006 min_size, false); 9007 BUG_ON(ret); 9008 9009 /* 9010 * So if we truncate and then write and fsync we normally would just 9011 * write the extents that changed, which is a problem if we need to 9012 * first truncate that entire inode. So set this flag so we write out 9013 * all of the extents in the inode to the sync log so we're completely 9014 * safe. 9015 */ 9016 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 9017 trans->block_rsv = rsv; 9018 9019 while (1) { 9020 ret = btrfs_truncate_inode_items(trans, root, inode, 9021 inode->i_size, 9022 BTRFS_EXTENT_DATA_KEY); 9023 trans->block_rsv = &fs_info->trans_block_rsv; 9024 if (ret != -ENOSPC && ret != -EAGAIN) 9025 break; 9026 9027 ret = btrfs_update_inode(trans, root, inode); 9028 if (ret) 9029 break; 9030 9031 btrfs_end_transaction(trans); 9032 btrfs_btree_balance_dirty(fs_info); 9033 9034 trans = btrfs_start_transaction(root, 2); 9035 if (IS_ERR(trans)) { 9036 ret = PTR_ERR(trans); 9037 trans = NULL; 9038 break; 9039 } 9040 9041 btrfs_block_rsv_release(fs_info, rsv, -1); 9042 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 9043 rsv, min_size, false); 9044 BUG_ON(ret); /* shouldn't happen */ 9045 trans->block_rsv = rsv; 9046 } 9047 9048 /* 9049 * We can't call btrfs_truncate_block inside a trans handle as we could 9050 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know 9051 * we've truncated everything except the last little bit, and can do 9052 * btrfs_truncate_block and then update the disk_i_size. 9053 */ 9054 if (ret == NEED_TRUNCATE_BLOCK) { 9055 btrfs_end_transaction(trans); 9056 btrfs_btree_balance_dirty(fs_info); 9057 9058 ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); 9059 if (ret) 9060 goto out; 9061 trans = btrfs_start_transaction(root, 1); 9062 if (IS_ERR(trans)) { 9063 ret = PTR_ERR(trans); 9064 goto out; 9065 } 9066 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 9067 } 9068 9069 if (trans) { 9070 int ret2; 9071 9072 trans->block_rsv = &fs_info->trans_block_rsv; 9073 ret2 = btrfs_update_inode(trans, root, inode); 9074 if (ret2 && !ret) 9075 ret = ret2; 9076 9077 ret2 = btrfs_end_transaction(trans); 9078 if (ret2 && !ret) 9079 ret = ret2; 9080 btrfs_btree_balance_dirty(fs_info); 9081 } 9082 out: 9083 btrfs_free_block_rsv(fs_info, rsv); 9084 9085 return ret; 9086 } 9087 9088 /* 9089 * create a new subvolume directory/inode (helper for the ioctl). 
9090 */ 9091 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 9092 struct btrfs_root *new_root, 9093 struct btrfs_root *parent_root, 9094 u64 new_dirid) 9095 { 9096 struct inode *inode; 9097 int err; 9098 u64 index = 0; 9099 9100 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 9101 new_dirid, new_dirid, 9102 S_IFDIR | (~current_umask() & S_IRWXUGO), 9103 &index); 9104 if (IS_ERR(inode)) 9105 return PTR_ERR(inode); 9106 inode->i_op = &btrfs_dir_inode_operations; 9107 inode->i_fop = &btrfs_dir_file_operations; 9108 9109 set_nlink(inode, 1); 9110 btrfs_i_size_write(BTRFS_I(inode), 0); 9111 unlock_new_inode(inode); 9112 9113 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 9114 if (err) 9115 btrfs_err(new_root->fs_info, 9116 "error inheriting subvolume %llu properties: %d", 9117 new_root->root_key.objectid, err); 9118 9119 err = btrfs_update_inode(trans, new_root, inode); 9120 9121 iput(inode); 9122 return err; 9123 } 9124 9125 struct inode *btrfs_alloc_inode(struct super_block *sb) 9126 { 9127 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 9128 struct btrfs_inode *ei; 9129 struct inode *inode; 9130 9131 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); 9132 if (!ei) 9133 return NULL; 9134 9135 ei->root = NULL; 9136 ei->generation = 0; 9137 ei->last_trans = 0; 9138 ei->last_sub_trans = 0; 9139 ei->logged_trans = 0; 9140 ei->delalloc_bytes = 0; 9141 ei->new_delalloc_bytes = 0; 9142 ei->defrag_bytes = 0; 9143 ei->disk_i_size = 0; 9144 ei->flags = 0; 9145 ei->csum_bytes = 0; 9146 ei->index_cnt = (u64)-1; 9147 ei->dir_index = 0; 9148 ei->last_unlink_trans = 0; 9149 ei->last_log_commit = 0; 9150 9151 spin_lock_init(&ei->lock); 9152 ei->outstanding_extents = 0; 9153 if (sb->s_magic != BTRFS_TEST_MAGIC) 9154 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, 9155 BTRFS_BLOCK_RSV_DELALLOC); 9156 ei->runtime_flags = 0; 9157 ei->prop_compress = BTRFS_COMPRESS_NONE; 9158 ei->defrag_compress = BTRFS_COMPRESS_NONE; 9159 9160 ei->delayed_node = NULL; 9161 9162 ei->i_otime.tv_sec = 0; 9163 ei->i_otime.tv_nsec = 0; 9164 9165 inode = &ei->vfs_inode; 9166 extent_map_tree_init(&ei->extent_tree); 9167 extent_io_tree_init(&ei->io_tree, inode); 9168 extent_io_tree_init(&ei->io_failure_tree, inode); 9169 ei->io_tree.track_uptodate = 1; 9170 ei->io_failure_tree.track_uptodate = 1; 9171 atomic_set(&ei->sync_writers, 0); 9172 mutex_init(&ei->log_mutex); 9173 mutex_init(&ei->delalloc_mutex); 9174 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 9175 INIT_LIST_HEAD(&ei->delalloc_inodes); 9176 INIT_LIST_HEAD(&ei->delayed_iput); 9177 RB_CLEAR_NODE(&ei->rb_node); 9178 init_rwsem(&ei->dio_sem); 9179 9180 return inode; 9181 } 9182 9183 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 9184 void btrfs_test_destroy_inode(struct inode *inode) 9185 { 9186 btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); 9187 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9188 } 9189 #endif 9190 9191 static void btrfs_i_callback(struct rcu_head *head) 9192 { 9193 struct inode *inode = container_of(head, struct inode, i_rcu); 9194 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9195 } 9196 9197 void btrfs_destroy_inode(struct inode *inode) 9198 { 9199 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 9200 struct btrfs_ordered_extent *ordered; 9201 struct btrfs_root *root = BTRFS_I(inode)->root; 9202 9203 WARN_ON(!hlist_empty(&inode->i_dentry)); 9204 WARN_ON(inode->i_data.nrpages); 9205 WARN_ON(BTRFS_I(inode)->block_rsv.reserved); 9206 WARN_ON(BTRFS_I(inode)->block_rsv.size); 9207 
WARN_ON(BTRFS_I(inode)->outstanding_extents); 9208 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 9209 WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); 9210 WARN_ON(BTRFS_I(inode)->csum_bytes); 9211 WARN_ON(BTRFS_I(inode)->defrag_bytes); 9212 9213 /* 9214 * This can happen where we create an inode, but somebody else also 9215 * created the same inode and we need to destroy the one we already 9216 * created. 9217 */ 9218 if (!root) 9219 goto free; 9220 9221 while (1) { 9222 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 9223 if (!ordered) 9224 break; 9225 else { 9226 btrfs_err(fs_info, 9227 "found ordered extent %llu %llu on inode cleanup", 9228 ordered->file_offset, ordered->len); 9229 btrfs_remove_ordered_extent(inode, ordered); 9230 btrfs_put_ordered_extent(ordered); 9231 btrfs_put_ordered_extent(ordered); 9232 } 9233 } 9234 btrfs_qgroup_check_reserved_leak(inode); 9235 inode_tree_del(inode); 9236 btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); 9237 free: 9238 call_rcu(&inode->i_rcu, btrfs_i_callback); 9239 } 9240 9241 int btrfs_drop_inode(struct inode *inode) 9242 { 9243 struct btrfs_root *root = BTRFS_I(inode)->root; 9244 9245 if (root == NULL) 9246 return 1; 9247 9248 /* the snap/subvol tree is on deleting */ 9249 if (btrfs_root_refs(&root->root_item) == 0) 9250 return 1; 9251 else 9252 return generic_drop_inode(inode); 9253 } 9254 9255 static void init_once(void *foo) 9256 { 9257 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 9258 9259 inode_init_once(&ei->vfs_inode); 9260 } 9261 9262 void __cold btrfs_destroy_cachep(void) 9263 { 9264 /* 9265 * Make sure all delayed rcu free inodes are flushed before we 9266 * destroy cache. 9267 */ 9268 rcu_barrier(); 9269 kmem_cache_destroy(btrfs_inode_cachep); 9270 kmem_cache_destroy(btrfs_trans_handle_cachep); 9271 kmem_cache_destroy(btrfs_path_cachep); 9272 kmem_cache_destroy(btrfs_free_space_cachep); 9273 } 9274 9275 int __init btrfs_init_cachep(void) 9276 { 9277 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 9278 sizeof(struct btrfs_inode), 0, 9279 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, 9280 init_once); 9281 if (!btrfs_inode_cachep) 9282 goto fail; 9283 9284 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 9285 sizeof(struct btrfs_trans_handle), 0, 9286 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 9287 if (!btrfs_trans_handle_cachep) 9288 goto fail; 9289 9290 btrfs_path_cachep = kmem_cache_create("btrfs_path", 9291 sizeof(struct btrfs_path), 0, 9292 SLAB_MEM_SPREAD, NULL); 9293 if (!btrfs_path_cachep) 9294 goto fail; 9295 9296 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 9297 sizeof(struct btrfs_free_space), 0, 9298 SLAB_MEM_SPREAD, NULL); 9299 if (!btrfs_free_space_cachep) 9300 goto fail; 9301 9302 return 0; 9303 fail: 9304 btrfs_destroy_cachep(); 9305 return -ENOMEM; 9306 } 9307 9308 static int btrfs_getattr(const struct path *path, struct kstat *stat, 9309 u32 request_mask, unsigned int flags) 9310 { 9311 u64 delalloc_bytes; 9312 struct inode *inode = d_inode(path->dentry); 9313 u32 blocksize = inode->i_sb->s_blocksize; 9314 u32 bi_flags = BTRFS_I(inode)->flags; 9315 9316 stat->result_mask |= STATX_BTIME; 9317 stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; 9318 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; 9319 if (bi_flags & BTRFS_INODE_APPEND) 9320 stat->attributes |= STATX_ATTR_APPEND; 9321 if (bi_flags & BTRFS_INODE_COMPRESS) 9322 stat->attributes |= STATX_ATTR_COMPRESSED; 9323 if (bi_flags & BTRFS_INODE_IMMUTABLE) 9324 stat->attributes |= 
STATX_ATTR_IMMUTABLE; 9325 if (bi_flags & BTRFS_INODE_NODUMP) 9326 stat->attributes |= STATX_ATTR_NODUMP; 9327 9328 stat->attributes_mask |= (STATX_ATTR_APPEND | 9329 STATX_ATTR_COMPRESSED | 9330 STATX_ATTR_IMMUTABLE | 9331 STATX_ATTR_NODUMP); 9332 9333 generic_fillattr(inode, stat); 9334 stat->dev = BTRFS_I(inode)->root->anon_dev; 9335 9336 spin_lock(&BTRFS_I(inode)->lock); 9337 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; 9338 spin_unlock(&BTRFS_I(inode)->lock); 9339 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 9340 ALIGN(delalloc_bytes, blocksize)) >> 9; 9341 return 0; 9342 } 9343 9344 static int btrfs_rename_exchange(struct inode *old_dir, 9345 struct dentry *old_dentry, 9346 struct inode *new_dir, 9347 struct dentry *new_dentry) 9348 { 9349 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9350 struct btrfs_trans_handle *trans; 9351 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9352 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9353 struct inode *new_inode = new_dentry->d_inode; 9354 struct inode *old_inode = old_dentry->d_inode; 9355 struct timespec64 ctime = current_time(old_inode); 9356 struct dentry *parent; 9357 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9358 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 9359 u64 old_idx = 0; 9360 u64 new_idx = 0; 9361 u64 root_objectid; 9362 int ret; 9363 bool root_log_pinned = false; 9364 bool dest_log_pinned = false; 9365 struct btrfs_log_ctx ctx_root; 9366 struct btrfs_log_ctx ctx_dest; 9367 bool sync_log_root = false; 9368 bool sync_log_dest = false; 9369 bool commit_transaction = false; 9370 9371 /* we only allow rename subvolume link between subvolumes */ 9372 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 9373 return -EXDEV; 9374 9375 btrfs_init_log_ctx(&ctx_root, old_inode); 9376 btrfs_init_log_ctx(&ctx_dest, new_inode); 9377 9378 /* close the race window with snapshot create/destroy ioctl */ 9379 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9380 down_read(&fs_info->subvol_sem); 9381 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 9382 down_read(&fs_info->subvol_sem); 9383 9384 /* 9385 * We want to reserve the absolute worst case amount of items. So if 9386 * both inodes are subvols and we need to unlink them then that would 9387 * require 4 item modifications, but if they are both normal inodes it 9388 * would require 5 item modifications, so we'll assume their normal 9389 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items 9390 * should cover the worst case number of items we'll modify. 9391 */ 9392 trans = btrfs_start_transaction(root, 12); 9393 if (IS_ERR(trans)) { 9394 ret = PTR_ERR(trans); 9395 goto out_notrans; 9396 } 9397 9398 /* 9399 * We need to find a free sequence number both in the source and 9400 * in the destination directory for the exchange. 9401 */ 9402 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); 9403 if (ret) 9404 goto out_fail; 9405 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); 9406 if (ret) 9407 goto out_fail; 9408 9409 BTRFS_I(old_inode)->dir_index = 0ULL; 9410 BTRFS_I(new_inode)->dir_index = 0ULL; 9411 9412 /* Reference for the source. */ 9413 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9414 /* force full log commit if subvolume involved. 
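* (moving a subvolume updates root references in the root tree, which
* the tree-log does not track, so the next fsync must fall back to a
* full transaction commit)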
*/ 9415 btrfs_set_log_full_commit(fs_info, trans); 9416 } else { 9417 btrfs_pin_log_trans(root); 9418 root_log_pinned = true; 9419 ret = btrfs_insert_inode_ref(trans, dest, 9420 new_dentry->d_name.name, 9421 new_dentry->d_name.len, 9422 old_ino, 9423 btrfs_ino(BTRFS_I(new_dir)), 9424 old_idx); 9425 if (ret) 9426 goto out_fail; 9427 } 9428 9429 /* And now for the dest. */ 9430 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9431 /* force full log commit if subvolume involved. */ 9432 btrfs_set_log_full_commit(fs_info, trans); 9433 } else { 9434 btrfs_pin_log_trans(dest); 9435 dest_log_pinned = true; 9436 ret = btrfs_insert_inode_ref(trans, root, 9437 old_dentry->d_name.name, 9438 old_dentry->d_name.len, 9439 new_ino, 9440 btrfs_ino(BTRFS_I(old_dir)), 9441 new_idx); 9442 if (ret) 9443 goto out_fail; 9444 } 9445 9446 /* Update inode version and ctime/mtime. */ 9447 inode_inc_iversion(old_dir); 9448 inode_inc_iversion(new_dir); 9449 inode_inc_iversion(old_inode); 9450 inode_inc_iversion(new_inode); 9451 old_dir->i_ctime = old_dir->i_mtime = ctime; 9452 new_dir->i_ctime = new_dir->i_mtime = ctime; 9453 old_inode->i_ctime = ctime; 9454 new_inode->i_ctime = ctime; 9455 9456 if (old_dentry->d_parent != new_dentry->d_parent) { 9457 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), 9458 BTRFS_I(old_inode), 1); 9459 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), 9460 BTRFS_I(new_inode), 1); 9461 } 9462 9463 /* src is a subvolume */ 9464 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9465 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 9466 ret = btrfs_unlink_subvol(trans, old_dir, root_objectid, 9467 old_dentry->d_name.name, 9468 old_dentry->d_name.len); 9469 } else { /* src is an inode */ 9470 ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), 9471 BTRFS_I(old_dentry->d_inode), 9472 old_dentry->d_name.name, 9473 old_dentry->d_name.len); 9474 if (!ret) 9475 ret = btrfs_update_inode(trans, root, old_inode); 9476 } 9477 if (ret) { 9478 btrfs_abort_transaction(trans, ret); 9479 goto out_fail; 9480 } 9481 9482 /* dest is a subvolume */ 9483 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9484 root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; 9485 ret = btrfs_unlink_subvol(trans, new_dir, root_objectid, 9486 new_dentry->d_name.name, 9487 new_dentry->d_name.len); 9488 } else { /* dest is an inode */ 9489 ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), 9490 BTRFS_I(new_dentry->d_inode), 9491 new_dentry->d_name.name, 9492 new_dentry->d_name.len); 9493 if (!ret) 9494 ret = btrfs_update_inode(trans, dest, new_inode); 9495 } 9496 if (ret) { 9497 btrfs_abort_transaction(trans, ret); 9498 goto out_fail; 9499 } 9500 9501 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 9502 new_dentry->d_name.name, 9503 new_dentry->d_name.len, 0, old_idx); 9504 if (ret) { 9505 btrfs_abort_transaction(trans, ret); 9506 goto out_fail; 9507 } 9508 9509 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 9510 old_dentry->d_name.name, 9511 old_dentry->d_name.len, 0, new_idx); 9512 if (ret) { 9513 btrfs_abort_transaction(trans, ret); 9514 goto out_fail; 9515 } 9516 9517 if (old_inode->i_nlink == 1) 9518 BTRFS_I(old_inode)->dir_index = old_idx; 9519 if (new_inode->i_nlink == 1) 9520 BTRFS_I(new_inode)->dir_index = new_idx; 9521 9522 if (root_log_pinned) { 9523 parent = new_dentry->d_parent; 9524 ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), 9525 BTRFS_I(old_dir), parent, 9526 false, &ctx_root); 9527 if (ret == BTRFS_NEED_LOG_SYNC) 9528 sync_log_root = true; 9529 else if 
(ret == BTRFS_NEED_TRANS_COMMIT) 9530 commit_transaction = true; 9531 ret = 0; 9532 btrfs_end_log_trans(root); 9533 root_log_pinned = false; 9534 } 9535 if (dest_log_pinned) { 9536 if (!commit_transaction) { 9537 parent = old_dentry->d_parent; 9538 ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), 9539 BTRFS_I(new_dir), parent, 9540 false, &ctx_dest); 9541 if (ret == BTRFS_NEED_LOG_SYNC) 9542 sync_log_dest = true; 9543 else if (ret == BTRFS_NEED_TRANS_COMMIT) 9544 commit_transaction = true; 9545 ret = 0; 9546 } 9547 btrfs_end_log_trans(dest); 9548 dest_log_pinned = false; 9549 } 9550 out_fail: 9551 /* 9552 * If we have pinned a log and an error happened, we unpin tasks 9553 * trying to sync the log and force them to fallback to a transaction 9554 * commit if the log currently contains any of the inodes involved in 9555 * this rename operation (to ensure we do not persist a log with an 9556 * inconsistent state for any of these inodes or leading to any 9557 * inconsistencies when replayed). If the transaction was aborted, the 9558 * abortion reason is propagated to userspace when attempting to commit 9559 * the transaction. If the log does not contain any of these inodes, we 9560 * allow the tasks to sync it. 9561 */ 9562 if (ret && (root_log_pinned || dest_log_pinned)) { 9563 if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9564 btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9565 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9566 (new_inode && 9567 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) 9568 btrfs_set_log_full_commit(fs_info, trans); 9569 9570 if (root_log_pinned) { 9571 btrfs_end_log_trans(root); 9572 root_log_pinned = false; 9573 } 9574 if (dest_log_pinned) { 9575 btrfs_end_log_trans(dest); 9576 dest_log_pinned = false; 9577 } 9578 } 9579 if (!ret && sync_log_root && !commit_transaction) { 9580 ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, 9581 &ctx_root); 9582 if (ret) 9583 commit_transaction = true; 9584 } 9585 if (!ret && sync_log_dest && !commit_transaction) { 9586 ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, 9587 &ctx_dest); 9588 if (ret) 9589 commit_transaction = true; 9590 } 9591 if (commit_transaction) { 9592 ret = btrfs_commit_transaction(trans); 9593 } else { 9594 int ret2; 9595 9596 ret2 = btrfs_end_transaction(trans); 9597 ret = ret ? 
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *dir,
				     struct dentry *dentry)
{
	int ret;
	struct inode *inode;
	u64 objectid;
	u64 index;

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		return ret;

	inode = btrfs_new_inode(trans, root, dir,
				dentry->d_name.name,
				dentry->d_name.len,
				btrfs_ino(BTRFS_I(dir)),
				objectid,
				S_IFCHR | WHITEOUT_MODE,
				&index);

	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		return ret;
	}

	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode,
			   WHITEOUT_DEV);

	ret = btrfs_init_inode_security(trans, inode, dir,
					&dentry->d_name);
	if (ret)
		goto out;

	ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
			       BTRFS_I(inode), 0, index);
	if (ret)
		goto out;

	ret = btrfs_update_inode(trans, root, inode);
out:
	unlock_new_inode(inode);
	if (ret)
		inode_dec_link_count(inode);
	iput(inode);

	return ret;
}

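/*
 * Plain (non-exchange) rename.  Besides moving the directory entry, this has
 * to unlink an existing target (orphaning it if its link count drops to
 * zero), keep the tree log consistent so a later fsync of old_inode picks up
 * the new name, and, for RENAME_WHITEOUT, leave a whiteout inode behind at
 * the old location.
 */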
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
	u64 index = 0;
	u64 root_objectid;
	int ret;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	bool log_pinned = false;
	struct btrfs_log_ctx ctx;
	bool sync_log = false;
	bool commit_transaction = false;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow renaming a subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;

	/* check for collisions, even if the name isn't there */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len);

	if (ret) {
		if (ret == -EEXIST) {
			/* we shouldn't get -EEXIST without a new_inode */
			if (WARN_ON(!new_inode)) {
				return ret;
			}
		} else {
			/* maybe -EOVERFLOW */
			return ret;
		}
	}
	ret = 0;

	/*
	 * we're using rename to replace one file with another.  Start IO on it
	 * now so we don't add too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	/* close the racy window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items. So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they are normal
	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 * If our rename has the whiteout flag, we need 5 more units for the
	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
	 * when selinux is enabled).
	 */
	trans_num_items = 11;
	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(fs_info, trans);
	} else {
		btrfs_pin_log_trans(root);
		log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(BTRFS_I(new_dir)), index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	old_inode->i_ctime = current_time(old_dir);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), 1);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
		ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
					  old_dentry->d_name.name,
					  old_dentry->d_name.len);
	} else {
		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
					   BTRFS_I(d_inode(old_dentry)),
					   old_dentry->d_name.name,
					   old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		new_inode->i_ctime = current_time(new_inode);
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			root_objectid = BTRFS_I(new_inode)->location.objectid;
			ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
						  new_dentry->d_name.name,
						  new_dentry->d_name.len);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
						 BTRFS_I(d_inode(new_dentry)),
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans,
					BTRFS_I(d_inode(new_dentry)));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

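	/*
	 * The old name (and any entry for an existing target) is gone, so we
	 * can now insert the new directory entry pointing at old_inode.
	 */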
	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (log_pinned) {
		struct dentry *parent = new_dentry->d_parent;

		btrfs_init_log_ctx(&ctx, old_inode);
		ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
					 BTRFS_I(old_dir), parent,
					 false, &ctx);
		if (ret == BTRFS_NEED_LOG_SYNC)
			sync_log = true;
		else if (ret == BTRFS_NEED_TRANS_COMMIT)
			commit_transaction = true;
		ret = 0;
		btrfs_end_log_trans(root);
		log_pinned = false;
	}

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
						old_dentry);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}
out_fail:
	/*
	 * If we have pinned the log and an error happened, we unpin tasks
	 * trying to sync the log and force them to fall back to a transaction
	 * commit if the log currently contains any of the inodes involved in
	 * this rename operation (to ensure we do not persist a log with an
	 * inconsistent state for any of these inodes or one that leads to any
	 * inconsistencies when replayed). If the transaction was aborted, the
	 * abort reason is propagated to userspace when attempting to commit
	 * the transaction. If the log does not contain any of these inodes, we
	 * allow the tasks to sync it.
	 */
	if (ret && log_pinned) {
		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
		    (new_inode &&
		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
			btrfs_set_log_full_commit(fs_info, trans);

		btrfs_end_log_trans(root);
		log_pinned = false;
	}
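	/*
	 * If logging the new name above asked for a log sync and nothing went
	 * wrong, sync the log now; should the sync fail, fall back to a full
	 * transaction commit below.
	 */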
	if (!ret && sync_log) {
		ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
		if (ret)
			commit_transaction = true;
	}
	if (commit_transaction) {
		ret = btrfs_commit_transaction(trans);
	} else {
		int ret2;

		ret2 = btrfs_end_transaction(trans);
		ret = ret ? ret : ret2;
	}
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	return ret;
}

static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					     new_dentry);

	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}

struct btrfs_delalloc_work {
	struct inode *inode;
	struct completion completion;
	struct list_head list;
	struct btrfs_work work;
};

static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	iput(inode);
	complete(&delalloc_work->completion);
}

static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	work = kmalloc(sizeof(*work), GFP_NOFS);
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
	WARN_ON_ONCE(!inode);
	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
			btrfs_run_delalloc_work, NULL, NULL);

	return work;
}

/*
 * some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
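 *
 * Returns the number of inodes queued for flushing (capped at @nr unless
 * @nr is -1), or a negative errno if allocating a work item failed.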
 */
static int start_delalloc_inodes(struct btrfs_root *root, int nr)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	struct list_head works;
	struct list_head splice;
	int ret = 0;

	INIT_LIST_HEAD(&works);
	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		work = btrfs_alloc_delalloc_work(inode);
		if (!work) {
			iput(inode);
			ret = -ENOMEM;
			goto out;
		}
		list_add_tail(&work->list, &works);
		btrfs_queue_work(root->fs_info->flush_workers,
				 &work->work);
		ret++;
		if (nr != -1 && ret >= nr)
			goto out;
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		wait_for_completion(&work->completion);
		kfree(work);
	}

	if (!list_empty(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}

int btrfs_start_delalloc_inodes(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	ret = start_delalloc_inodes(root, -1);
	if (ret > 0)
		ret = 0;
	return ret;
}

int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_root *root;
	struct list_head splice;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = start_delalloc_inodes(root, nr);
		btrfs_put_fs_root(root);
		if (ret < 0)
			goto out;

		if (nr != -1) {
			nr -= ret;
			WARN_ON(nr < 0);
		}
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
out:
	if (!list_empty(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}

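/*
 * Symlink targets are stored as an inline file extent in the symlink inode,
 * which is why the target length is capped at BTRFS_MAX_INLINE_DATA_SIZE().
 */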
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for updating parent inode item
	 * 1 item for the inline extent item
	 * 1 item for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 7);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
				objectid, S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		inode = NULL;
		goto out_unlock;
	}

	/*
	 * If the active LSM wants to access the inode during
	 * d_instantiate it needs these. Smack checks to see
	 * if the filesystem supports xattrs by looking at the
	 * ops vector.
	 */
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_free_path(path);
		goto out_unlock;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

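	/*
	 * The link target is now stored inline; switch the inode from the
	 * temporary file ops (set above for the LSM's benefit) to the real
	 * symlink ops and record the target length as the inode size.
	 */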
	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_aops;
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	err = btrfs_update_inode(trans, root, inode);
	/*
	 * Last step: add the directory index entries for our symlink inode.
	 * Doing this last avoids extra cleanup of those entries if an error
	 * happens elsewhere above.
	 */
	if (!err)
		err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
				       BTRFS_I(inode), 0, index);
	if (err)
		goto out_unlock;

	d_instantiate_new(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans);
	if (err && inode) {
		inode_dec_link_count(inode);
		discard_new_inode(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks let's make its job easier by only searching for
		 * chunks of that size.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
					   min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		last_alloc = ins.offset;
		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		if (ret) {
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
					cur_offset + ins.offset - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		em->bdev = fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST)
				break;
			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
						cur_offset + ins.offset - 1,
						0);
		}
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode->i_ctime = current_time(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
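		/*
		 * Only push i_size forward when the caller did not ask to
		 * keep the current size (FALLOC_FL_KEEP_SIZE) and this chunk
		 * actually extends past the current EOF; never report more
		 * than the length that was requested.
		 */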
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_ordered_update_i_size(inode, i_size, NULL);
		}

		ret = btrfs_update_inode(trans, root, inode);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		if (own_trans)
			btrfs_end_transaction(trans);
	}
	if (cur_offset < end)
		btrfs_free_reserved_data_space(inode, NULL, cur_offset,
					       end - cur_offset + 1);
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}

static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	u64 objectid;
	u64 index;
	int ret = 0;

	/*
	 * 5 units required for adding orphan entry
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		goto out;

	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
				btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (ret)
		goto out;

	ret = btrfs_update_inode(trans, root, inode);
	if (ret)
		goto out;
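	/*
	 * O_TMPFILE inodes start out with no directory entry, so add an
	 * orphan item now: if the system crashes or the last reference is
	 * dropped before the file is linked in, the inode gets cleaned up.
	 */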
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret)
		goto out;

	/*
	 * We set number of links to 0 in btrfs_new_inode(), and here we set
	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
	 * through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
	d_tmpfile(dentry, inode);
	unlock_new_inode(inode);
	mark_inode_dirty(inode);
out:
	btrfs_end_transaction(trans);
	if (ret && inode)
		discard_new_inode(inode);
	btrfs_btree_balance_dirty(fs_info);
	return ret;
}

__attribute__((const))
static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
{
	return -EAGAIN;
}

static void btrfs_check_extent_io_range(void *private_data, const char *caller,
					u64 start, u64 end)
{
	struct inode *inode = private_data;
	u64 isize;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}

void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */
		set_page_writeback(page);
		put_page(page);
		index++;
	}
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile        = btrfs_tmpfile,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.update_time	= btrfs_update_time,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.open		= btrfs_opendir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release        = btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static const struct extent_io_ops btrfs_extent_io_ops = {
	/* mandatory callbacks */
	.submit_bio_hook = btrfs_submit_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,

	/* optional callbacks */
	.fill_delalloc = run_delalloc_range,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
	.check_extent_io_range = btrfs_check_extent_io_range,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file. They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen. So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr      = btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};