/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 outstanding_extents;
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret) {
			err = ret;
			goto fail;
		}
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset,
				    ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, root->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > root->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
				   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
				   inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans, root);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

static inline int inode_need_compress(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* force compress */
	if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(root->fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->force_compress)
		return 1;
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
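 *
 * (Phase two, the ordered submission of these async extents, is handled
 * by submit_compressed_extents() further down in this file.)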
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = SZ_128K;
	unsigned long max_uncompressed = SZ_128K;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;
	int redirty = 0;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < SZ_16K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/* we want to make sure that the amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			unsigned long page_error_op;

			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, num_bytes,
					 total_compressed, pages, nr_pages_ret,
					 compress_type);

			if (start + num_bytes < end) {
				start += num_bytes;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO?
		 */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserve;
		}
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		if (ret)
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		/*
		 * clear dirty, set
		 * writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
		if (ret) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
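 *
 * This is the plain (uncompressed) COW path; compressed writes are
 * handled by compress_file_range() and submit_compressed_extents().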
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < SZ_64K &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
					    NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			btrfs_free_reserved_data_space_noquota(inode, start,
						end - start + 1);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_reserve;
		}
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
		if (ret)
			goto out_reserve;

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret)
				goto out_drop_extent_cache;
		}

		btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? PAGE_UNLOCK : 0;
		op |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
				     EXTENT_DELALLOC | EXTENT_DEFRAG,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
	goto out;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * atomic_sub_return implies a barrier for waitqueue_active
	 */
	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
	    5 * SZ_1M &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * SZ_1M;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_SIZE) >>
			PAGE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_work(root->fs_info->delalloc_workers,
				 &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * called for nocow writeback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 disk_num_bytes;
	u64 ram_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, end,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	nolock = btrfs_is_free_space_inode(inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode, start, end, end,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino)
			break;
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
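			/*
			 * The checks below decide whether we can nocow into
			 * this extent; any failed check jumps to out_check
			 * with nocow still 0, which forces the COW path.
			 */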
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * if there are pending snapshots for this root,
			 * we fall back to the common COW path.
			 */
			if (!nolock) {
				err = btrfs_start_write_no_snapshoting(root);
				if (!err)
					goto out_check;
			}
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			if (!btrfs_inc_nocow_writers(root->fs_info,
						     disk_bytenr))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			if (!nolock && nocow)
				btrfs_end_write_no_snapshoting(root);
			if (nocow)
				btrfs_dec_nocow_writers(root->fs_info,
							disk_bytenr);
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     end, page_started, nr_written, 1,
					     NULL);
			if (ret) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);
				if (nocow)
					btrfs_dec_nocow_writers(root->fs_info,
								disk_bytenr);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = found_key.offset - extent_offset;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->orig_block_len = disk_num_bytes;
			em->ram_bytes = ram_bytes;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			em->mod_start = em->start;
			em->mod_len = em->len;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			em->generation = -1;
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em, 1);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type =
				BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		if (nocow)
			btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshoting(root);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);

		if (!nolock && nocow)
			btrfs_end_write_no_snapshoting(root);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end, end,
				     page_started, nr_written, 1, NULL);
		if (ret)
			goto error;
	}

error:
	err = btrfs_end_transaction(trans, root);
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}

static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
		return 0;

	/*
	 * @defrag_bytes is a hint value, no spinlock held here,
	 * if it is not zero, it means the file is defragging.
	 * Force cow if given extent needs to be defragged.
	 */
	if (BTRFS_I(inode)->defrag_bytes &&
	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
		return 1;

	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	int force_cow = need_force_cow(inode, start, end);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!inode_need_compress(inode)) {
		ret = cow_file_range(inode, locked_page, start, end, end,
				      page_started, nr_written, 1, NULL);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	}
	return ret;
}

static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	u64 size;

	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
		u64 num_extents;
		u64 new_size;

		/*
		 * See the explanation in btrfs_merge_extent_hook, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					BTRFS_MAX_EXTENT_SIZE);
		new_size = split - orig->start;
		num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
					 BTRFS_MAX_EXTENT_SIZE);
		if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
			      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
			return;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	u64 new_size, old_size;
	u64 num_extents;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent.  If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop.
	 * Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return.  But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
	 */
	old_size = other->end - other->start + 1;
	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				BTRFS_MAX_EXTENT_SIZE);
	old_size = new->end - new->start + 1;
	num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
				 BTRFS_MAX_EXTENT_SIZE);

	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
		      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}

static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &root->fs_info->delalloc_roots);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, unsigned *bits)
{

	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
		WARN_ON(1);
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/* For sanity tests */
		if (btrfs_is_testing(root->fs_info))
			return;

		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state,
				 unsigned *bits)
{
	u64 len = state->end + 1 - state->start;
	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
				    BTRFS_MAX_EXTENT_SIZE);

	spin_lock(&BTRFS_I(inode)->lock);
	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
		BTRFS_I(inode)->defrag_bytes -= len;
	spin_unlock(&BTRFS_I(inode)->lock);

	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents -= num_extents;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		/*
		 * We don't reserve metadata space for space cache inodes so we
		 * don't need to call delalloc_release_metadata if there is an
		 * error.
		 */
		if (*bits & EXTENT_DO_ACCOUNTING &&
		    root != root->fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len);

		/* For sanity tests.
		 */
		if (btrfs_is_testing(root->fs_info))
			return;

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list && !(state->state & EXTENT_NORESERVE)
		    && (*bits & (EXTENT_DO_ACCOUNTING |
		    EXTENT_CLEAR_DATA_RESV)))
			btrfs_free_reserved_data_space_noquota(inode,
					state->start, len);

		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes -= len;
		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			     &BTRFS_I(inode)->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 *
 * return 1 if page cannot be merged to bio
 * return 0 if page can be merged to bio
 * return error otherwise
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_iter.bi_size;
	map_length = length;
	ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
			      &map_length, NULL, 0);
	if (ret < 0)
		return ret;
	if (map_length < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
				    int mirror_num, unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	ret = btrfs_map_bio(root, bio, mirror_num, 1);
	if (ret) {
		bio->bi_error = ret;
		bio_endio(bio);
	}
	return ret;
}

/*
 * extent_io.c submission hook.
This does the right thing for csum calculation 1921 * on write, or reading the csums from the tree before a read 1922 */ 1923 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, 1924 int mirror_num, unsigned long bio_flags, 1925 u64 bio_offset) 1926 { 1927 struct btrfs_root *root = BTRFS_I(inode)->root; 1928 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 1929 int ret = 0; 1930 int skip_sum; 1931 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1932 1933 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1934 1935 if (btrfs_is_free_space_inode(inode)) 1936 metadata = BTRFS_WQ_ENDIO_FREE_SPACE; 1937 1938 if (bio_op(bio) != REQ_OP_WRITE) { 1939 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1940 if (ret) 1941 goto out; 1942 1943 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1944 ret = btrfs_submit_compressed_read(inode, bio, 1945 mirror_num, 1946 bio_flags); 1947 goto out; 1948 } else if (!skip_sum) { 1949 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1950 if (ret) 1951 goto out; 1952 } 1953 goto mapit; 1954 } else if (async && !skip_sum) { 1955 /* csum items have already been cloned */ 1956 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1957 goto mapit; 1958 /* we're doing a write, do the async checksumming */ 1959 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1960 inode, bio, mirror_num, 1961 bio_flags, bio_offset, 1962 __btrfs_submit_bio_start, 1963 __btrfs_submit_bio_done); 1964 goto out; 1965 } else if (!skip_sum) { 1966 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1967 if (ret) 1968 goto out; 1969 } 1970 1971 mapit: 1972 ret = btrfs_map_bio(root, bio, mirror_num, 0); 1973 1974 out: 1975 if (ret < 0) { 1976 bio->bi_error = ret; 1977 bio_endio(bio); 1978 } 1979 return ret; 1980 } 1981 1982 /* 1983 * given a list of ordered sums record them in the inode. This happens 1984 * at IO completion time based on sums calculated at bio submission time. 
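 *
 * Rough sketch of the checksum flow, pieced together from the surrounding
 * code (an informal summary, not an authoritative description):
 *
 *   __btrfs_submit_bio_start()
 *       -> btrfs_csum_one_bio()          csums attached to the ordered extent
 *   ... data IO completes ...
 *   btrfs_finish_ordered_io()
 *       -> add_pending_csums()
 *              -> btrfs_csum_file_blocks()   csums written into the csum tree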
1985 */ 1986 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 1987 struct inode *inode, u64 file_offset, 1988 struct list_head *list) 1989 { 1990 struct btrfs_ordered_sum *sum; 1991 1992 list_for_each_entry(sum, list, list) { 1993 trans->adding_csums = 1; 1994 btrfs_csum_file_blocks(trans, 1995 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1996 trans->adding_csums = 0; 1997 } 1998 return 0; 1999 } 2000 2001 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2002 struct extent_state **cached_state, int dedupe) 2003 { 2004 WARN_ON((end & (PAGE_SIZE - 1)) == 0); 2005 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 2006 cached_state); 2007 } 2008 2009 /* see btrfs_writepage_start_hook for details on why this is required */ 2010 struct btrfs_writepage_fixup { 2011 struct page *page; 2012 struct btrfs_work work; 2013 }; 2014 2015 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 2016 { 2017 struct btrfs_writepage_fixup *fixup; 2018 struct btrfs_ordered_extent *ordered; 2019 struct extent_state *cached_state = NULL; 2020 struct page *page; 2021 struct inode *inode; 2022 u64 page_start; 2023 u64 page_end; 2024 int ret; 2025 2026 fixup = container_of(work, struct btrfs_writepage_fixup, work); 2027 page = fixup->page; 2028 again: 2029 lock_page(page); 2030 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 2031 ClearPageChecked(page); 2032 goto out_page; 2033 } 2034 2035 inode = page->mapping->host; 2036 page_start = page_offset(page); 2037 page_end = page_offset(page) + PAGE_SIZE - 1; 2038 2039 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 2040 &cached_state); 2041 2042 /* already ordered? We're done */ 2043 if (PagePrivate2(page)) 2044 goto out; 2045 2046 ordered = btrfs_lookup_ordered_range(inode, page_start, 2047 PAGE_SIZE); 2048 if (ordered) { 2049 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 2050 page_end, &cached_state, GFP_NOFS); 2051 unlock_page(page); 2052 btrfs_start_ordered_extent(inode, ordered, 1); 2053 btrfs_put_ordered_extent(ordered); 2054 goto again; 2055 } 2056 2057 ret = btrfs_delalloc_reserve_space(inode, page_start, 2058 PAGE_SIZE); 2059 if (ret) { 2060 mapping_set_error(page->mapping, ret); 2061 end_extent_writepage(page, ret, page_start, page_end); 2062 ClearPageChecked(page); 2063 goto out; 2064 } 2065 2066 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, 2067 0); 2068 ClearPageChecked(page); 2069 set_page_dirty(page); 2070 out: 2071 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 2072 &cached_state, GFP_NOFS); 2073 out_page: 2074 unlock_page(page); 2075 put_page(page); 2076 kfree(fixup); 2077 } 2078 2079 /* 2080 * There are a few paths in the higher layers of the kernel that directly 2081 * set the page dirty bit without asking the filesystem if it is a 2082 * good idea. This causes problems because we want to make sure COW 2083 * properly happens and the data=ordered rules are followed. 2084 * 2085 * In our case any range that doesn't have the ORDERED bit set 2086 * hasn't been properly setup for IO. We kick off an async process 2087 * to fix it up. The async helper will wait for ordered extents, set 2088 * the delalloc bit and make it safe to write the page. 
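 *
 * Informal sketch of that fixup path (summary of the code below, not a
 * normative description):
 *
 *   btrfs_writepage_start_hook()      page dirty but no ordered extent
 *       -> SetPageChecked(), queue btrfs_writepage_fixup_worker()
 *          and return -EBUSY so the page is skipped for now
 *   btrfs_writepage_fixup_worker()
 *       -> lock the page and extent range, wait out any ordered extent,
 *          reserve delalloc space and mark the range delalloc so a later
 *          writepage can set up COW/ordered IO properly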
2089 */ 2090 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 2091 { 2092 struct inode *inode = page->mapping->host; 2093 struct btrfs_writepage_fixup *fixup; 2094 struct btrfs_root *root = BTRFS_I(inode)->root; 2095 2096 /* this page is properly in the ordered list */ 2097 if (TestClearPagePrivate2(page)) 2098 return 0; 2099 2100 if (PageChecked(page)) 2101 return -EAGAIN; 2102 2103 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 2104 if (!fixup) 2105 return -EAGAIN; 2106 2107 SetPageChecked(page); 2108 get_page(page); 2109 btrfs_init_work(&fixup->work, btrfs_fixup_helper, 2110 btrfs_writepage_fixup_worker, NULL, NULL); 2111 fixup->page = page; 2112 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); 2113 return -EBUSY; 2114 } 2115 2116 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 2117 struct inode *inode, u64 file_pos, 2118 u64 disk_bytenr, u64 disk_num_bytes, 2119 u64 num_bytes, u64 ram_bytes, 2120 u8 compression, u8 encryption, 2121 u16 other_encoding, int extent_type) 2122 { 2123 struct btrfs_root *root = BTRFS_I(inode)->root; 2124 struct btrfs_file_extent_item *fi; 2125 struct btrfs_path *path; 2126 struct extent_buffer *leaf; 2127 struct btrfs_key ins; 2128 int extent_inserted = 0; 2129 int ret; 2130 2131 path = btrfs_alloc_path(); 2132 if (!path) 2133 return -ENOMEM; 2134 2135 /* 2136 * we may be replacing one extent in the tree with another. 2137 * The new extent is pinned in the extent map, and we don't want 2138 * to drop it from the cache until it is completely in the btree. 2139 * 2140 * So, tell btrfs_drop_extents to leave this extent in the cache. 2141 * the caller is expected to unpin it and allow it to be merged 2142 * with the others. 2143 */ 2144 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, 2145 file_pos + num_bytes, NULL, 0, 2146 1, sizeof(*fi), &extent_inserted); 2147 if (ret) 2148 goto out; 2149 2150 if (!extent_inserted) { 2151 ins.objectid = btrfs_ino(inode); 2152 ins.offset = file_pos; 2153 ins.type = BTRFS_EXTENT_DATA_KEY; 2154 2155 path->leave_spinning = 1; 2156 ret = btrfs_insert_empty_item(trans, root, path, &ins, 2157 sizeof(*fi)); 2158 if (ret) 2159 goto out; 2160 } 2161 leaf = path->nodes[0]; 2162 fi = btrfs_item_ptr(leaf, path->slots[0], 2163 struct btrfs_file_extent_item); 2164 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 2165 btrfs_set_file_extent_type(leaf, fi, extent_type); 2166 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 2167 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 2168 btrfs_set_file_extent_offset(leaf, fi, 0); 2169 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 2170 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 2171 btrfs_set_file_extent_compression(leaf, fi, compression); 2172 btrfs_set_file_extent_encryption(leaf, fi, encryption); 2173 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 2174 2175 btrfs_mark_buffer_dirty(leaf); 2176 btrfs_release_path(path); 2177 2178 inode_add_bytes(inode, num_bytes); 2179 2180 ins.objectid = disk_bytenr; 2181 ins.offset = disk_num_bytes; 2182 ins.type = BTRFS_EXTENT_ITEM_KEY; 2183 ret = btrfs_alloc_reserved_file_extent(trans, root, 2184 root->root_key.objectid, 2185 btrfs_ino(inode), file_pos, 2186 ram_bytes, &ins); 2187 /* 2188 * Release the reserved range from inode dirty range map, as it is 2189 * already moved into delayed_ref_head 2190 */ 2191 btrfs_qgroup_release_data(inode, file_pos, ram_bytes); 2192 out: 2193 btrfs_free_path(path); 2194 2195 return 
ret; 2196 } 2197
2198 /* snapshot-aware defrag */ 2199 struct sa_defrag_extent_backref { 2200 struct rb_node node; 2201 struct old_sa_defrag_extent *old; 2202 u64 root_id; 2203 u64 inum; 2204 u64 file_pos; 2205 u64 extent_offset; 2206 u64 num_bytes; 2207 u64 generation; 2208 }; 2209
2210 struct old_sa_defrag_extent { 2211 struct list_head list; 2212 struct new_sa_defrag_extent *new; 2213 2214 u64 extent_offset; 2215 u64 bytenr; 2216 u64 offset; 2217 u64 len; 2218 int count; 2219 }; 2220
2221 struct new_sa_defrag_extent { 2222 struct rb_root root; 2223 struct list_head head; 2224 struct btrfs_path *path; 2225 struct inode *inode; 2226 u64 file_pos; 2227 u64 len; 2228 u64 bytenr; 2229 u64 disk_len; 2230 u8 compress_type; 2231 }; 2232
2233 static int backref_comp(struct sa_defrag_extent_backref *b1, 2234 struct sa_defrag_extent_backref *b2) 2235 { 2236 if (b1->root_id < b2->root_id) 2237 return -1; 2238 else if (b1->root_id > b2->root_id) 2239 return 1; 2240 2241 if (b1->inum < b2->inum) 2242 return -1; 2243 else if (b1->inum > b2->inum) 2244 return 1; 2245 2246 if (b1->file_pos < b2->file_pos) 2247 return -1; 2248 else if (b1->file_pos > b2->file_pos) 2249 return 1; 2250 2251 /* 2252 * [------------------------------] ===> (a range of space) 2253 * |<--->| |<---->| =============> (fs/file tree A) 2254 * |<---------------------------->| ===> (fs/file tree B) 2255 * 2256 * A range of space can refer to two file extents in one tree while 2257 * referring to only one file extent in another tree. 2258 * 2259 * So we may process a disk offset more than once (two extents in A) 2260 * and land in the same extent (one extent in B), and then insert two 2261 * identical backrefs (both referring to the extent in B). 2262 */ 2263 return 0; 2264 } 2265
2266 static void backref_insert(struct rb_root *root, 2267 struct sa_defrag_extent_backref *backref) 2268 { 2269 struct rb_node **p = &root->rb_node; 2270 struct rb_node *parent = NULL; 2271 struct sa_defrag_extent_backref *entry; 2272 int ret; 2273 2274 while (*p) { 2275 parent = *p; 2276 entry = rb_entry(parent, struct sa_defrag_extent_backref, node); 2277 2278 ret = backref_comp(backref, entry); 2279 if (ret < 0) 2280 p = &(*p)->rb_left; 2281 else 2282 p = &(*p)->rb_right; 2283 } 2284 2285 rb_link_node(&backref->node, parent, p); 2286 rb_insert_color(&backref->node, root); 2287 } 2288
2289 /* 2290 * Note the backref might have changed, and in this case we just return 0.
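 *
 * Loosely: this is the iterate_inodes_from_logical() callback used by
 * record_extent_backrefs() below. It is called once per (root, inode,
 * offset) that still references the old extent; if the file extent item at
 * that offset still points to old->bytenr and overlaps the defragged range,
 * a sa_defrag_extent_backref is allocated and inserted into new->root for
 * the later relink pass. (Informal summary only.)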
2291 */ 2292 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, 2293 void *ctx) 2294 { 2295 struct btrfs_file_extent_item *extent; 2296 struct btrfs_fs_info *fs_info; 2297 struct old_sa_defrag_extent *old = ctx; 2298 struct new_sa_defrag_extent *new = old->new; 2299 struct btrfs_path *path = new->path; 2300 struct btrfs_key key; 2301 struct btrfs_root *root; 2302 struct sa_defrag_extent_backref *backref; 2303 struct extent_buffer *leaf; 2304 struct inode *inode = new->inode; 2305 int slot; 2306 int ret; 2307 u64 extent_offset; 2308 u64 num_bytes; 2309 2310 if (BTRFS_I(inode)->root->root_key.objectid == root_id && 2311 inum == btrfs_ino(inode)) 2312 return 0; 2313 2314 key.objectid = root_id; 2315 key.type = BTRFS_ROOT_ITEM_KEY; 2316 key.offset = (u64)-1; 2317 2318 fs_info = BTRFS_I(inode)->root->fs_info; 2319 root = btrfs_read_fs_root_no_name(fs_info, &key); 2320 if (IS_ERR(root)) { 2321 if (PTR_ERR(root) == -ENOENT) 2322 return 0; 2323 WARN_ON(1); 2324 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", 2325 inum, offset, root_id); 2326 return PTR_ERR(root); 2327 } 2328 2329 key.objectid = inum; 2330 key.type = BTRFS_EXTENT_DATA_KEY; 2331 if (offset > (u64)-1 << 32) 2332 key.offset = 0; 2333 else 2334 key.offset = offset; 2335 2336 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2337 if (WARN_ON(ret < 0)) 2338 return ret; 2339 ret = 0; 2340 2341 while (1) { 2342 cond_resched(); 2343 2344 leaf = path->nodes[0]; 2345 slot = path->slots[0]; 2346 2347 if (slot >= btrfs_header_nritems(leaf)) { 2348 ret = btrfs_next_leaf(root, path); 2349 if (ret < 0) { 2350 goto out; 2351 } else if (ret > 0) { 2352 ret = 0; 2353 goto out; 2354 } 2355 continue; 2356 } 2357 2358 path->slots[0]++; 2359 2360 btrfs_item_key_to_cpu(leaf, &key, slot); 2361 2362 if (key.objectid > inum) 2363 goto out; 2364 2365 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) 2366 continue; 2367 2368 extent = btrfs_item_ptr(leaf, slot, 2369 struct btrfs_file_extent_item); 2370 2371 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) 2372 continue; 2373 2374 /* 2375 * 'offset' refers to the exact key.offset, 2376 * NOT the 'offset' field in btrfs_extent_data_ref, ie. 2377 * (key.offset - extent_offset). 
2378 */ 2379 if (key.offset != offset) 2380 continue; 2381 2382 extent_offset = btrfs_file_extent_offset(leaf, extent); 2383 num_bytes = btrfs_file_extent_num_bytes(leaf, extent); 2384 2385 if (extent_offset >= old->extent_offset + old->offset + 2386 old->len || extent_offset + num_bytes <= 2387 old->extent_offset + old->offset) 2388 continue; 2389 break; 2390 } 2391 2392 backref = kmalloc(sizeof(*backref), GFP_NOFS); 2393 if (!backref) { 2394 ret = -ENOENT; 2395 goto out; 2396 } 2397 2398 backref->root_id = root_id; 2399 backref->inum = inum; 2400 backref->file_pos = offset; 2401 backref->num_bytes = num_bytes; 2402 backref->extent_offset = extent_offset; 2403 backref->generation = btrfs_file_extent_generation(leaf, extent); 2404 backref->old = old; 2405 backref_insert(&new->root, backref); 2406 old->count++; 2407 out: 2408 btrfs_release_path(path); 2409 WARN_ON(ret); 2410 return ret; 2411 } 2412
2413 static noinline bool record_extent_backrefs(struct btrfs_path *path, 2414 struct new_sa_defrag_extent *new) 2415 { 2416 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; 2417 struct old_sa_defrag_extent *old, *tmp; 2418 int ret; 2419 2420 new->path = path; 2421 2422 list_for_each_entry_safe(old, tmp, &new->head, list) { 2423 ret = iterate_inodes_from_logical(old->bytenr + 2424 old->extent_offset, fs_info, 2425 path, record_one_backref, 2426 old); 2427 if (ret < 0 && ret != -ENOENT) 2428 return false; 2429 2430 /* no backref to be processed for this extent */ 2431 if (!old->count) { 2432 list_del(&old->list); 2433 kfree(old); 2434 } 2435 } 2436 2437 if (list_empty(&new->head)) 2438 return false; 2439 2440 return true; 2441 } 2442
2443 static int relink_is_mergable(struct extent_buffer *leaf, 2444 struct btrfs_file_extent_item *fi, 2445 struct new_sa_defrag_extent *new) 2446 { 2447 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) 2448 return 0; 2449 2450 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2451 return 0; 2452 2453 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) 2454 return 0; 2455 2456 if (btrfs_file_extent_encryption(leaf, fi) || 2457 btrfs_file_extent_other_encoding(leaf, fi)) 2458 return 0; 2459 2460 return 1; 2461 } 2462
2463 /* 2464 * Note the backref might have changed, and in this case we just return 0.
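 *
 * Informal outline of the relink below: look up the root and inode named by
 * the backref, lock the file range, drop the old file extent items with
 * btrfs_drop_extents(), then insert (or merge) a file extent item pointing
 * at new->bytenr and add a reference with btrfs_inc_extent_ref(). Returning
 * 1 means the backref was relinked; 0 means it was skipped (e.g. a read-only
 * root or a stale generation).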
2465 */ 2466 static noinline int relink_extent_backref(struct btrfs_path *path, 2467 struct sa_defrag_extent_backref *prev, 2468 struct sa_defrag_extent_backref *backref) 2469 { 2470 struct btrfs_file_extent_item *extent; 2471 struct btrfs_file_extent_item *item; 2472 struct btrfs_ordered_extent *ordered; 2473 struct btrfs_trans_handle *trans; 2474 struct btrfs_fs_info *fs_info; 2475 struct btrfs_root *root; 2476 struct btrfs_key key; 2477 struct extent_buffer *leaf; 2478 struct old_sa_defrag_extent *old = backref->old; 2479 struct new_sa_defrag_extent *new = old->new; 2480 struct inode *src_inode = new->inode; 2481 struct inode *inode; 2482 struct extent_state *cached = NULL; 2483 int ret = 0; 2484 u64 start; 2485 u64 len; 2486 u64 lock_start; 2487 u64 lock_end; 2488 bool merge = false; 2489 int index; 2490 2491 if (prev && prev->root_id == backref->root_id && 2492 prev->inum == backref->inum && 2493 prev->file_pos + prev->num_bytes == backref->file_pos) 2494 merge = true; 2495 2496 /* step 1: get root */ 2497 key.objectid = backref->root_id; 2498 key.type = BTRFS_ROOT_ITEM_KEY; 2499 key.offset = (u64)-1; 2500 2501 fs_info = BTRFS_I(src_inode)->root->fs_info; 2502 index = srcu_read_lock(&fs_info->subvol_srcu); 2503 2504 root = btrfs_read_fs_root_no_name(fs_info, &key); 2505 if (IS_ERR(root)) { 2506 srcu_read_unlock(&fs_info->subvol_srcu, index); 2507 if (PTR_ERR(root) == -ENOENT) 2508 return 0; 2509 return PTR_ERR(root); 2510 } 2511 2512 if (btrfs_root_readonly(root)) { 2513 srcu_read_unlock(&fs_info->subvol_srcu, index); 2514 return 0; 2515 } 2516 2517 /* step 2: get inode */ 2518 key.objectid = backref->inum; 2519 key.type = BTRFS_INODE_ITEM_KEY; 2520 key.offset = 0; 2521 2522 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 2523 if (IS_ERR(inode)) { 2524 srcu_read_unlock(&fs_info->subvol_srcu, index); 2525 return 0; 2526 } 2527 2528 srcu_read_unlock(&fs_info->subvol_srcu, index); 2529 2530 /* step 3: relink backref */ 2531 lock_start = backref->file_pos; 2532 lock_end = backref->file_pos + backref->num_bytes - 1; 2533 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2534 &cached); 2535 2536 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); 2537 if (ordered) { 2538 btrfs_put_ordered_extent(ordered); 2539 goto out_unlock; 2540 } 2541 2542 trans = btrfs_join_transaction(root); 2543 if (IS_ERR(trans)) { 2544 ret = PTR_ERR(trans); 2545 goto out_unlock; 2546 } 2547 2548 key.objectid = backref->inum; 2549 key.type = BTRFS_EXTENT_DATA_KEY; 2550 key.offset = backref->file_pos; 2551 2552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2553 if (ret < 0) { 2554 goto out_free_path; 2555 } else if (ret > 0) { 2556 ret = 0; 2557 goto out_free_path; 2558 } 2559 2560 extent = btrfs_item_ptr(path->nodes[0], path->slots[0], 2561 struct btrfs_file_extent_item); 2562 2563 if (btrfs_file_extent_generation(path->nodes[0], extent) != 2564 backref->generation) 2565 goto out_free_path; 2566 2567 btrfs_release_path(path); 2568 2569 start = backref->file_pos; 2570 if (backref->extent_offset < old->extent_offset + old->offset) 2571 start += old->extent_offset + old->offset - 2572 backref->extent_offset; 2573 2574 len = min(backref->extent_offset + backref->num_bytes, 2575 old->extent_offset + old->offset + old->len); 2576 len -= max(backref->extent_offset, old->extent_offset + old->offset); 2577 2578 ret = btrfs_drop_extents(trans, root, inode, start, 2579 start + len, 1); 2580 if (ret) 2581 goto out_free_path; 2582 again: 2583 key.objectid = btrfs_ino(inode); 2584 key.type = 
BTRFS_EXTENT_DATA_KEY; 2585 key.offset = start; 2586 2587 path->leave_spinning = 1; 2588 if (merge) { 2589 struct btrfs_file_extent_item *fi; 2590 u64 extent_len; 2591 struct btrfs_key found_key; 2592 2593 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2594 if (ret < 0) 2595 goto out_free_path; 2596 2597 path->slots[0]--; 2598 leaf = path->nodes[0]; 2599 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2600 2601 fi = btrfs_item_ptr(leaf, path->slots[0], 2602 struct btrfs_file_extent_item); 2603 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 2604 2605 if (extent_len + found_key.offset == start && 2606 relink_is_mergable(leaf, fi, new)) { 2607 btrfs_set_file_extent_num_bytes(leaf, fi, 2608 extent_len + len); 2609 btrfs_mark_buffer_dirty(leaf); 2610 inode_add_bytes(inode, len); 2611 2612 ret = 1; 2613 goto out_free_path; 2614 } else { 2615 merge = false; 2616 btrfs_release_path(path); 2617 goto again; 2618 } 2619 } 2620 2621 ret = btrfs_insert_empty_item(trans, root, path, &key, 2622 sizeof(*extent)); 2623 if (ret) { 2624 btrfs_abort_transaction(trans, ret); 2625 goto out_free_path; 2626 } 2627 2628 leaf = path->nodes[0]; 2629 item = btrfs_item_ptr(leaf, path->slots[0], 2630 struct btrfs_file_extent_item); 2631 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); 2632 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); 2633 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); 2634 btrfs_set_file_extent_num_bytes(leaf, item, len); 2635 btrfs_set_file_extent_ram_bytes(leaf, item, new->len); 2636 btrfs_set_file_extent_generation(leaf, item, trans->transid); 2637 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 2638 btrfs_set_file_extent_compression(leaf, item, new->compress_type); 2639 btrfs_set_file_extent_encryption(leaf, item, 0); 2640 btrfs_set_file_extent_other_encoding(leaf, item, 0); 2641 2642 btrfs_mark_buffer_dirty(leaf); 2643 inode_add_bytes(inode, len); 2644 btrfs_release_path(path); 2645 2646 ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2647 new->disk_len, 0, 2648 backref->root_id, backref->inum, 2649 new->file_pos); /* start - extent_offset */ 2650 if (ret) { 2651 btrfs_abort_transaction(trans, ret); 2652 goto out_free_path; 2653 } 2654 2655 ret = 1; 2656 out_free_path: 2657 btrfs_release_path(path); 2658 path->leave_spinning = 0; 2659 btrfs_end_transaction(trans, root); 2660 out_unlock: 2661 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2662 &cached, GFP_NOFS); 2663 iput(inode); 2664 return ret; 2665 } 2666 2667 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) 2668 { 2669 struct old_sa_defrag_extent *old, *tmp; 2670 2671 if (!new) 2672 return; 2673 2674 list_for_each_entry_safe(old, tmp, &new->head, list) { 2675 kfree(old); 2676 } 2677 kfree(new); 2678 } 2679 2680 static void relink_file_extents(struct new_sa_defrag_extent *new) 2681 { 2682 struct btrfs_path *path; 2683 struct sa_defrag_extent_backref *backref; 2684 struct sa_defrag_extent_backref *prev = NULL; 2685 struct inode *inode; 2686 struct btrfs_root *root; 2687 struct rb_node *node; 2688 int ret; 2689 2690 inode = new->inode; 2691 root = BTRFS_I(inode)->root; 2692 2693 path = btrfs_alloc_path(); 2694 if (!path) 2695 return; 2696 2697 if (!record_extent_backrefs(path, new)) { 2698 btrfs_free_path(path); 2699 goto out; 2700 } 2701 btrfs_release_path(path); 2702 2703 while (1) { 2704 node = rb_first(&new->root); 2705 if (!node) 2706 break; 2707 rb_erase(node, &new->root); 2708 2709 backref = rb_entry(node, 
struct sa_defrag_extent_backref, node); 2710 2711 ret = relink_extent_backref(path, prev, backref); 2712 WARN_ON(ret < 0); 2713 2714 kfree(prev); 2715 2716 if (ret == 1) 2717 prev = backref; 2718 else 2719 prev = NULL; 2720 cond_resched(); 2721 } 2722 kfree(prev); 2723 2724 btrfs_free_path(path); 2725 out: 2726 free_sa_defrag_extent(new); 2727 2728 atomic_dec(&root->fs_info->defrag_running); 2729 wake_up(&root->fs_info->transaction_wait); 2730 } 2731 2732 static struct new_sa_defrag_extent * 2733 record_old_file_extents(struct inode *inode, 2734 struct btrfs_ordered_extent *ordered) 2735 { 2736 struct btrfs_root *root = BTRFS_I(inode)->root; 2737 struct btrfs_path *path; 2738 struct btrfs_key key; 2739 struct old_sa_defrag_extent *old; 2740 struct new_sa_defrag_extent *new; 2741 int ret; 2742 2743 new = kmalloc(sizeof(*new), GFP_NOFS); 2744 if (!new) 2745 return NULL; 2746 2747 new->inode = inode; 2748 new->file_pos = ordered->file_offset; 2749 new->len = ordered->len; 2750 new->bytenr = ordered->start; 2751 new->disk_len = ordered->disk_len; 2752 new->compress_type = ordered->compress_type; 2753 new->root = RB_ROOT; 2754 INIT_LIST_HEAD(&new->head); 2755 2756 path = btrfs_alloc_path(); 2757 if (!path) 2758 goto out_kfree; 2759 2760 key.objectid = btrfs_ino(inode); 2761 key.type = BTRFS_EXTENT_DATA_KEY; 2762 key.offset = new->file_pos; 2763 2764 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2765 if (ret < 0) 2766 goto out_free_path; 2767 if (ret > 0 && path->slots[0] > 0) 2768 path->slots[0]--; 2769 2770 /* find out all the old extents for the file range */ 2771 while (1) { 2772 struct btrfs_file_extent_item *extent; 2773 struct extent_buffer *l; 2774 int slot; 2775 u64 num_bytes; 2776 u64 offset; 2777 u64 end; 2778 u64 disk_bytenr; 2779 u64 extent_offset; 2780 2781 l = path->nodes[0]; 2782 slot = path->slots[0]; 2783 2784 if (slot >= btrfs_header_nritems(l)) { 2785 ret = btrfs_next_leaf(root, path); 2786 if (ret < 0) 2787 goto out_free_path; 2788 else if (ret > 0) 2789 break; 2790 continue; 2791 } 2792 2793 btrfs_item_key_to_cpu(l, &key, slot); 2794 2795 if (key.objectid != btrfs_ino(inode)) 2796 break; 2797 if (key.type != BTRFS_EXTENT_DATA_KEY) 2798 break; 2799 if (key.offset >= new->file_pos + new->len) 2800 break; 2801 2802 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); 2803 2804 num_bytes = btrfs_file_extent_num_bytes(l, extent); 2805 if (key.offset + num_bytes < new->file_pos) 2806 goto next; 2807 2808 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); 2809 if (!disk_bytenr) 2810 goto next; 2811 2812 extent_offset = btrfs_file_extent_offset(l, extent); 2813 2814 old = kmalloc(sizeof(*old), GFP_NOFS); 2815 if (!old) 2816 goto out_free_path; 2817 2818 offset = max(new->file_pos, key.offset); 2819 end = min(new->file_pos + new->len, key.offset + num_bytes); 2820 2821 old->bytenr = disk_bytenr; 2822 old->extent_offset = extent_offset; 2823 old->offset = offset - key.offset; 2824 old->len = end - offset; 2825 old->new = new; 2826 old->count = 0; 2827 list_add_tail(&old->list, &new->head); 2828 next: 2829 path->slots[0]++; 2830 cond_resched(); 2831 } 2832 2833 btrfs_free_path(path); 2834 atomic_inc(&root->fs_info->defrag_running); 2835 2836 return new; 2837 2838 out_free_path: 2839 btrfs_free_path(path); 2840 out_kfree: 2841 free_sa_defrag_extent(new); 2842 return NULL; 2843 } 2844 2845 static void btrfs_release_delalloc_bytes(struct btrfs_root *root, 2846 u64 start, u64 len) 2847 { 2848 struct btrfs_block_group_cache *cache; 2849 2850 cache = 
btrfs_lookup_block_group(root->fs_info, start); 2851 ASSERT(cache); 2852 2853 spin_lock(&cache->lock); 2854 cache->delalloc_bytes -= len; 2855 spin_unlock(&cache->lock); 2856 2857 btrfs_put_block_group(cache); 2858 } 2859 2860 /* as ordered data IO finishes, this gets called so we can finish 2861 * an ordered extent if the range of bytes in the file it covers are 2862 * fully written. 2863 */ 2864 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2865 { 2866 struct inode *inode = ordered_extent->inode; 2867 struct btrfs_root *root = BTRFS_I(inode)->root; 2868 struct btrfs_trans_handle *trans = NULL; 2869 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2870 struct extent_state *cached_state = NULL; 2871 struct new_sa_defrag_extent *new = NULL; 2872 int compress_type = 0; 2873 int ret = 0; 2874 u64 logical_len = ordered_extent->len; 2875 bool nolock; 2876 bool truncated = false; 2877 2878 nolock = btrfs_is_free_space_inode(inode); 2879 2880 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2881 ret = -EIO; 2882 goto out; 2883 } 2884 2885 btrfs_free_io_failure_record(inode, ordered_extent->file_offset, 2886 ordered_extent->file_offset + 2887 ordered_extent->len - 1); 2888 2889 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2890 truncated = true; 2891 logical_len = ordered_extent->truncated_len; 2892 /* Truncated the entire extent, don't bother adding */ 2893 if (!logical_len) 2894 goto out; 2895 } 2896 2897 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2898 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2899 2900 /* 2901 * For mwrite(mmap + memset to write) case, we still reserve 2902 * space for NOCOW range. 2903 * As NOCOW won't cause a new delayed ref, just free the space 2904 */ 2905 btrfs_qgroup_free_data(inode, ordered_extent->file_offset, 2906 ordered_extent->len); 2907 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2908 if (nolock) 2909 trans = btrfs_join_transaction_nolock(root); 2910 else 2911 trans = btrfs_join_transaction(root); 2912 if (IS_ERR(trans)) { 2913 ret = PTR_ERR(trans); 2914 trans = NULL; 2915 goto out; 2916 } 2917 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2918 ret = btrfs_update_inode_fallback(trans, root, inode); 2919 if (ret) /* -ENOMEM or corruption */ 2920 btrfs_abort_transaction(trans, ret); 2921 goto out; 2922 } 2923 2924 lock_extent_bits(io_tree, ordered_extent->file_offset, 2925 ordered_extent->file_offset + ordered_extent->len - 1, 2926 &cached_state); 2927 2928 ret = test_range_bit(io_tree, ordered_extent->file_offset, 2929 ordered_extent->file_offset + ordered_extent->len - 1, 2930 EXTENT_DEFRAG, 1, cached_state); 2931 if (ret) { 2932 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2933 if (0 && last_snapshot >= BTRFS_I(inode)->generation) 2934 /* the inode is shared */ 2935 new = record_old_file_extents(inode, ordered_extent); 2936 2937 clear_extent_bit(io_tree, ordered_extent->file_offset, 2938 ordered_extent->file_offset + ordered_extent->len - 1, 2939 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); 2940 } 2941 2942 if (nolock) 2943 trans = btrfs_join_transaction_nolock(root); 2944 else 2945 trans = btrfs_join_transaction(root); 2946 if (IS_ERR(trans)) { 2947 ret = PTR_ERR(trans); 2948 trans = NULL; 2949 goto out_unlock; 2950 } 2951 2952 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2953 2954 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2955 compress_type = ordered_extent->compress_type; 2956 
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2957 BUG_ON(compress_type); 2958 ret = btrfs_mark_extent_written(trans, inode, 2959 ordered_extent->file_offset, 2960 ordered_extent->file_offset + 2961 logical_len); 2962 } else { 2963 BUG_ON(root == root->fs_info->tree_root); 2964 ret = insert_reserved_file_extent(trans, inode, 2965 ordered_extent->file_offset, 2966 ordered_extent->start, 2967 ordered_extent->disk_len, 2968 logical_len, logical_len, 2969 compress_type, 0, 0, 2970 BTRFS_FILE_EXTENT_REG); 2971 if (!ret) 2972 btrfs_release_delalloc_bytes(root, 2973 ordered_extent->start, 2974 ordered_extent->disk_len); 2975 } 2976 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 2977 ordered_extent->file_offset, ordered_extent->len, 2978 trans->transid); 2979 if (ret < 0) { 2980 btrfs_abort_transaction(trans, ret); 2981 goto out_unlock; 2982 } 2983 2984 add_pending_csums(trans, inode, ordered_extent->file_offset, 2985 &ordered_extent->list); 2986 2987 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2988 ret = btrfs_update_inode_fallback(trans, root, inode); 2989 if (ret) { /* -ENOMEM or corruption */ 2990 btrfs_abort_transaction(trans, ret); 2991 goto out_unlock; 2992 } 2993 ret = 0; 2994 out_unlock: 2995 unlock_extent_cached(io_tree, ordered_extent->file_offset, 2996 ordered_extent->file_offset + 2997 ordered_extent->len - 1, &cached_state, GFP_NOFS); 2998 out: 2999 if (root != root->fs_info->tree_root) 3000 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 3001 if (trans) 3002 btrfs_end_transaction(trans, root); 3003 3004 if (ret || truncated) { 3005 u64 start, end; 3006 3007 if (truncated) 3008 start = ordered_extent->file_offset + logical_len; 3009 else 3010 start = ordered_extent->file_offset; 3011 end = ordered_extent->file_offset + ordered_extent->len - 1; 3012 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); 3013 3014 /* Drop the cache for the part of the extent we didn't write. */ 3015 btrfs_drop_extent_cache(inode, start, end, 0); 3016 3017 /* 3018 * If the ordered extent had an IOERR or something else went 3019 * wrong we need to return the space for this ordered extent 3020 * back to the allocator. We only free the extent in the 3021 * truncated case if we didn't write out the extent at all. 3022 */ 3023 if ((ret || !logical_len) && 3024 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3025 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 3026 btrfs_free_reserved_extent(root, ordered_extent->start, 3027 ordered_extent->disk_len, 1); 3028 } 3029 3030 3031 /* 3032 * This needs to be done to make sure anybody waiting knows we are done 3033 * updating everything for this ordered extent. 
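 * (Informal note: removing the ordered extent from the inode's ordered tree
 * below is what lets waiters, e.g. callers blocked in
 * btrfs_wait_ordered_range(), make progress.)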
3034 */ 3035 btrfs_remove_ordered_extent(inode, ordered_extent); 3036 3037 /* for snapshot-aware defrag */ 3038 if (new) { 3039 if (ret) { 3040 free_sa_defrag_extent(new); 3041 atomic_dec(&root->fs_info->defrag_running); 3042 } else { 3043 relink_file_extents(new); 3044 } 3045 } 3046 3047 /* once for us */ 3048 btrfs_put_ordered_extent(ordered_extent); 3049 /* once for the tree */ 3050 btrfs_put_ordered_extent(ordered_extent); 3051 3052 return ret; 3053 } 3054 3055 static void finish_ordered_fn(struct btrfs_work *work) 3056 { 3057 struct btrfs_ordered_extent *ordered_extent; 3058 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 3059 btrfs_finish_ordered_io(ordered_extent); 3060 } 3061 3062 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 3063 struct extent_state *state, int uptodate) 3064 { 3065 struct inode *inode = page->mapping->host; 3066 struct btrfs_root *root = BTRFS_I(inode)->root; 3067 struct btrfs_ordered_extent *ordered_extent = NULL; 3068 struct btrfs_workqueue *wq; 3069 btrfs_work_func_t func; 3070 3071 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 3072 3073 ClearPagePrivate2(page); 3074 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 3075 end - start + 1, uptodate)) 3076 return 0; 3077 3078 if (btrfs_is_free_space_inode(inode)) { 3079 wq = root->fs_info->endio_freespace_worker; 3080 func = btrfs_freespace_write_helper; 3081 } else { 3082 wq = root->fs_info->endio_write_workers; 3083 func = btrfs_endio_write_helper; 3084 } 3085 3086 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, 3087 NULL); 3088 btrfs_queue_work(wq, &ordered_extent->work); 3089 3090 return 0; 3091 } 3092 3093 static int __readpage_endio_check(struct inode *inode, 3094 struct btrfs_io_bio *io_bio, 3095 int icsum, struct page *page, 3096 int pgoff, u64 start, size_t len) 3097 { 3098 char *kaddr; 3099 u32 csum_expected; 3100 u32 csum = ~(u32)0; 3101 3102 csum_expected = *(((u32 *)io_bio->csum) + icsum); 3103 3104 kaddr = kmap_atomic(page); 3105 csum = btrfs_csum_data(kaddr + pgoff, csum, len); 3106 btrfs_csum_final(csum, (char *)&csum); 3107 if (csum != csum_expected) 3108 goto zeroit; 3109 3110 kunmap_atomic(kaddr); 3111 return 0; 3112 zeroit: 3113 btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, 3114 "csum failed ino %llu off %llu csum %u expected csum %u", 3115 btrfs_ino(inode), start, csum, csum_expected); 3116 memset(kaddr + pgoff, 1, len); 3117 flush_dcache_page(page); 3118 kunmap_atomic(kaddr); 3119 if (csum_expected == 0) 3120 return 0; 3121 return -EIO; 3122 } 3123 3124 /* 3125 * when reads are done, we need to check csums to verify the data is correct 3126 * if there's a match, we allow the bio to finish. If not, the code in 3127 * extent_io.c will try to find good copies for us. 
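 *
 * The expected csums were stashed in io_bio->csum when the read bio was
 * submitted (see btrfs_lookup_bio_sums() in the submission hook), one u32
 * per block. A worked example (illustrative, assuming a 4K block size):
 * for the block at phy_offset 8192 within the bio, phy_offset >>
 * s_blocksize_bits == 2, so __readpage_endio_check() compares the computed
 * checksum against ((u32 *)io_bio->csum)[2].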
3128 */ 3129 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 3130 u64 phy_offset, struct page *page, 3131 u64 start, u64 end, int mirror) 3132 { 3133 size_t offset = start - page_offset(page); 3134 struct inode *inode = page->mapping->host; 3135 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3136 struct btrfs_root *root = BTRFS_I(inode)->root; 3137 3138 if (PageChecked(page)) { 3139 ClearPageChecked(page); 3140 return 0; 3141 } 3142 3143 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 3144 return 0; 3145 3146 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 3147 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 3148 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); 3149 return 0; 3150 } 3151 3152 phy_offset >>= inode->i_sb->s_blocksize_bits; 3153 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, 3154 start, (size_t)(end - start + 1)); 3155 } 3156 3157 void btrfs_add_delayed_iput(struct inode *inode) 3158 { 3159 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 3160 struct btrfs_inode *binode = BTRFS_I(inode); 3161 3162 if (atomic_add_unless(&inode->i_count, -1, 1)) 3163 return; 3164 3165 spin_lock(&fs_info->delayed_iput_lock); 3166 if (binode->delayed_iput_count == 0) { 3167 ASSERT(list_empty(&binode->delayed_iput)); 3168 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); 3169 } else { 3170 binode->delayed_iput_count++; 3171 } 3172 spin_unlock(&fs_info->delayed_iput_lock); 3173 } 3174 3175 void btrfs_run_delayed_iputs(struct btrfs_root *root) 3176 { 3177 struct btrfs_fs_info *fs_info = root->fs_info; 3178 3179 spin_lock(&fs_info->delayed_iput_lock); 3180 while (!list_empty(&fs_info->delayed_iputs)) { 3181 struct btrfs_inode *inode; 3182 3183 inode = list_first_entry(&fs_info->delayed_iputs, 3184 struct btrfs_inode, delayed_iput); 3185 if (inode->delayed_iput_count) { 3186 inode->delayed_iput_count--; 3187 list_move_tail(&inode->delayed_iput, 3188 &fs_info->delayed_iputs); 3189 } else { 3190 list_del_init(&inode->delayed_iput); 3191 } 3192 spin_unlock(&fs_info->delayed_iput_lock); 3193 iput(&inode->vfs_inode); 3194 spin_lock(&fs_info->delayed_iput_lock); 3195 } 3196 spin_unlock(&fs_info->delayed_iput_lock); 3197 } 3198 3199 /* 3200 * This is called in transaction commit time. If there are no orphan 3201 * files in the subvolume, it removes orphan item and frees block_rsv 3202 * structure. 
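 *
 * (Informal note: the orphan item removed here is the subvolume-level entry
 * in the tree root, keyed by this root's objectid, not the per-inode orphan
 * items; it is only dropped once orphan cleanup has finished and
 * root->orphan_inodes has reached zero.)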
3203 */ 3204 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 3205 struct btrfs_root *root) 3206 { 3207 struct btrfs_block_rsv *block_rsv; 3208 int ret; 3209 3210 if (atomic_read(&root->orphan_inodes) || 3211 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 3212 return; 3213 3214 spin_lock(&root->orphan_lock); 3215 if (atomic_read(&root->orphan_inodes)) { 3216 spin_unlock(&root->orphan_lock); 3217 return; 3218 } 3219 3220 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 3221 spin_unlock(&root->orphan_lock); 3222 return; 3223 } 3224 3225 block_rsv = root->orphan_block_rsv; 3226 root->orphan_block_rsv = NULL; 3227 spin_unlock(&root->orphan_lock); 3228 3229 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && 3230 btrfs_root_refs(&root->root_item) > 0) { 3231 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 3232 root->root_key.objectid); 3233 if (ret) 3234 btrfs_abort_transaction(trans, ret); 3235 else 3236 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, 3237 &root->state); 3238 } 3239 3240 if (block_rsv) { 3241 WARN_ON(block_rsv->size > 0); 3242 btrfs_free_block_rsv(root, block_rsv); 3243 } 3244 } 3245 3246 /* 3247 * This creates an orphan entry for the given inode in case something goes 3248 * wrong in the middle of an unlink/truncate. 3249 * 3250 * NOTE: caller of this function should reserve 5 units of metadata for 3251 * this function. 3252 */ 3253 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 3254 { 3255 struct btrfs_root *root = BTRFS_I(inode)->root; 3256 struct btrfs_block_rsv *block_rsv = NULL; 3257 int reserve = 0; 3258 int insert = 0; 3259 int ret; 3260 3261 if (!root->orphan_block_rsv) { 3262 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 3263 if (!block_rsv) 3264 return -ENOMEM; 3265 } 3266 3267 spin_lock(&root->orphan_lock); 3268 if (!root->orphan_block_rsv) { 3269 root->orphan_block_rsv = block_rsv; 3270 } else if (block_rsv) { 3271 btrfs_free_block_rsv(root, block_rsv); 3272 block_rsv = NULL; 3273 } 3274 3275 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3276 &BTRFS_I(inode)->runtime_flags)) { 3277 #if 0 3278 /* 3279 * For proper ENOSPC handling, we should do orphan 3280 * cleanup when mounting. But this introduces backward 3281 * compatibility issue. 
3282 */ 3283 if (!xchg(&root->orphan_item_inserted, 1)) 3284 insert = 2; 3285 else 3286 insert = 1; 3287 #endif 3288 insert = 1; 3289 atomic_inc(&root->orphan_inodes); 3290 } 3291 3292 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3293 &BTRFS_I(inode)->runtime_flags)) 3294 reserve = 1; 3295 spin_unlock(&root->orphan_lock); 3296 3297 /* grab metadata reservation from transaction handle */ 3298 if (reserve) { 3299 ret = btrfs_orphan_reserve_metadata(trans, inode); 3300 ASSERT(!ret); 3301 if (ret) { 3302 atomic_dec(&root->orphan_inodes); 3303 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3304 &BTRFS_I(inode)->runtime_flags); 3305 if (insert) 3306 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3307 &BTRFS_I(inode)->runtime_flags); 3308 return ret; 3309 } 3310 } 3311 3312 /* insert an orphan item to track this unlinked/truncated file */ 3313 if (insert >= 1) { 3314 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 3315 if (ret) { 3316 atomic_dec(&root->orphan_inodes); 3317 if (reserve) { 3318 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3319 &BTRFS_I(inode)->runtime_flags); 3320 btrfs_orphan_release_metadata(inode); 3321 } 3322 if (ret != -EEXIST) { 3323 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3324 &BTRFS_I(inode)->runtime_flags); 3325 btrfs_abort_transaction(trans, ret); 3326 return ret; 3327 } 3328 } 3329 ret = 0; 3330 } 3331 3332 /* insert an orphan item to track subvolume contains orphan files */ 3333 if (insert >= 2) { 3334 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 3335 root->root_key.objectid); 3336 if (ret && ret != -EEXIST) { 3337 btrfs_abort_transaction(trans, ret); 3338 return ret; 3339 } 3340 } 3341 return 0; 3342 } 3343 3344 /* 3345 * We have done the truncate/delete so we can go ahead and remove the orphan 3346 * item for this particular inode. 3347 */ 3348 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3349 struct inode *inode) 3350 { 3351 struct btrfs_root *root = BTRFS_I(inode)->root; 3352 int delete_item = 0; 3353 int release_rsv = 0; 3354 int ret = 0; 3355 3356 spin_lock(&root->orphan_lock); 3357 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3358 &BTRFS_I(inode)->runtime_flags)) 3359 delete_item = 1; 3360 3361 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3362 &BTRFS_I(inode)->runtime_flags)) 3363 release_rsv = 1; 3364 spin_unlock(&root->orphan_lock); 3365 3366 if (delete_item) { 3367 atomic_dec(&root->orphan_inodes); 3368 if (trans) 3369 ret = btrfs_del_orphan_item(trans, root, 3370 btrfs_ino(inode)); 3371 } 3372 3373 if (release_rsv) 3374 btrfs_orphan_release_metadata(inode); 3375 3376 return ret; 3377 } 3378 3379 /* 3380 * this cleans up any orphans that may be left on the list from the last use 3381 * of this root. 
3382 */ 3383 int btrfs_orphan_cleanup(struct btrfs_root *root) 3384 { 3385 struct btrfs_path *path; 3386 struct extent_buffer *leaf; 3387 struct btrfs_key key, found_key; 3388 struct btrfs_trans_handle *trans; 3389 struct inode *inode; 3390 u64 last_objectid = 0; 3391 int ret = 0, nr_unlink = 0, nr_truncate = 0; 3392 3393 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3394 return 0; 3395 3396 path = btrfs_alloc_path(); 3397 if (!path) { 3398 ret = -ENOMEM; 3399 goto out; 3400 } 3401 path->reada = READA_BACK; 3402 3403 key.objectid = BTRFS_ORPHAN_OBJECTID; 3404 key.type = BTRFS_ORPHAN_ITEM_KEY; 3405 key.offset = (u64)-1; 3406 3407 while (1) { 3408 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3409 if (ret < 0) 3410 goto out; 3411 3412 /* 3413 * if ret == 0 means we found what we were searching for, which 3414 * is weird, but possible, so only screw with path if we didn't 3415 * find the key and see if we have stuff that matches 3416 */ 3417 if (ret > 0) { 3418 ret = 0; 3419 if (path->slots[0] == 0) 3420 break; 3421 path->slots[0]--; 3422 } 3423 3424 /* pull out the item */ 3425 leaf = path->nodes[0]; 3426 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3427 3428 /* make sure the item matches what we want */ 3429 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3430 break; 3431 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3432 break; 3433 3434 /* release the path since we're done with it */ 3435 btrfs_release_path(path); 3436 3437 /* 3438 * this is where we are basically btrfs_lookup, without the 3439 * crossing root thing. we store the inode number in the 3440 * offset of the orphan item. 3441 */ 3442 3443 if (found_key.offset == last_objectid) { 3444 btrfs_err(root->fs_info, 3445 "Error removing orphan entry, stopping orphan cleanup"); 3446 ret = -EINVAL; 3447 goto out; 3448 } 3449 3450 last_objectid = found_key.offset; 3451 3452 found_key.objectid = found_key.offset; 3453 found_key.type = BTRFS_INODE_ITEM_KEY; 3454 found_key.offset = 0; 3455 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 3456 ret = PTR_ERR_OR_ZERO(inode); 3457 if (ret && ret != -ENOENT) 3458 goto out; 3459 3460 if (ret == -ENOENT && root == root->fs_info->tree_root) { 3461 struct btrfs_root *dead_root; 3462 struct btrfs_fs_info *fs_info = root->fs_info; 3463 int is_dead_root = 0; 3464 3465 /* 3466 * this is an orphan in the tree root. Currently these 3467 * could come from 2 sources: 3468 * a) a snapshot deletion in progress 3469 * b) a free space cache inode 3470 * We need to distinguish those two, as the snapshot 3471 * orphan must not get deleted. 3472 * find_dead_roots already ran before us, so if this 3473 * is a snapshot deletion, we should find the root 3474 * in the dead_roots list 3475 */ 3476 spin_lock(&fs_info->trans_lock); 3477 list_for_each_entry(dead_root, &fs_info->dead_roots, 3478 root_list) { 3479 if (dead_root->root_key.objectid == 3480 found_key.objectid) { 3481 is_dead_root = 1; 3482 break; 3483 } 3484 } 3485 spin_unlock(&fs_info->trans_lock); 3486 if (is_dead_root) { 3487 /* prevent this orphan from being found again */ 3488 key.offset = found_key.objectid - 1; 3489 continue; 3490 } 3491 } 3492 /* 3493 * Inode is already gone but the orphan item is still there, 3494 * kill the orphan item. 
3495 */ 3496 if (ret == -ENOENT) { 3497 trans = btrfs_start_transaction(root, 1); 3498 if (IS_ERR(trans)) { 3499 ret = PTR_ERR(trans); 3500 goto out; 3501 } 3502 btrfs_debug(root->fs_info, "auto deleting %Lu", 3503 found_key.objectid); 3504 ret = btrfs_del_orphan_item(trans, root, 3505 found_key.objectid); 3506 btrfs_end_transaction(trans, root); 3507 if (ret) 3508 goto out; 3509 continue; 3510 } 3511 3512 /* 3513 * add this inode to the orphan list so btrfs_orphan_del does 3514 * the proper thing when we hit it 3515 */ 3516 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3517 &BTRFS_I(inode)->runtime_flags); 3518 atomic_inc(&root->orphan_inodes); 3519 3520 /* if we have links, this was a truncate, lets do that */ 3521 if (inode->i_nlink) { 3522 if (WARN_ON(!S_ISREG(inode->i_mode))) { 3523 iput(inode); 3524 continue; 3525 } 3526 nr_truncate++; 3527 3528 /* 1 for the orphan item deletion. */ 3529 trans = btrfs_start_transaction(root, 1); 3530 if (IS_ERR(trans)) { 3531 iput(inode); 3532 ret = PTR_ERR(trans); 3533 goto out; 3534 } 3535 ret = btrfs_orphan_add(trans, inode); 3536 btrfs_end_transaction(trans, root); 3537 if (ret) { 3538 iput(inode); 3539 goto out; 3540 } 3541 3542 ret = btrfs_truncate(inode); 3543 if (ret) 3544 btrfs_orphan_del(NULL, inode); 3545 } else { 3546 nr_unlink++; 3547 } 3548 3549 /* this will do delete_inode and everything for us */ 3550 iput(inode); 3551 if (ret) 3552 goto out; 3553 } 3554 /* release the path since we're done with it */ 3555 btrfs_release_path(path); 3556 3557 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3558 3559 if (root->orphan_block_rsv) 3560 btrfs_block_rsv_release(root, root->orphan_block_rsv, 3561 (u64)-1); 3562 3563 if (root->orphan_block_rsv || 3564 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3565 trans = btrfs_join_transaction(root); 3566 if (!IS_ERR(trans)) 3567 btrfs_end_transaction(trans, root); 3568 } 3569 3570 if (nr_unlink) 3571 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); 3572 if (nr_truncate) 3573 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); 3574 3575 out: 3576 if (ret) 3577 btrfs_err(root->fs_info, 3578 "could not do orphan cleanup %d", ret); 3579 btrfs_free_path(path); 3580 return ret; 3581 } 3582 3583 /* 3584 * very simple check to peek ahead in the leaf looking for xattrs. If we 3585 * don't find any xattrs, we know there can't be any acls. 
3586 * 3587 * slot is the slot the inode is in, objectid is the objectid of the inode 3588 */ 3589 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3590 int slot, u64 objectid, 3591 int *first_xattr_slot) 3592 { 3593 u32 nritems = btrfs_header_nritems(leaf); 3594 struct btrfs_key found_key; 3595 static u64 xattr_access = 0; 3596 static u64 xattr_default = 0; 3597 int scanned = 0; 3598 3599 if (!xattr_access) { 3600 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, 3601 strlen(XATTR_NAME_POSIX_ACL_ACCESS)); 3602 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, 3603 strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); 3604 } 3605 3606 slot++; 3607 *first_xattr_slot = -1; 3608 while (slot < nritems) { 3609 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3610 3611 /* we found a different objectid, there must not be acls */ 3612 if (found_key.objectid != objectid) 3613 return 0; 3614 3615 /* we found an xattr, assume we've got an acl */ 3616 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3617 if (*first_xattr_slot == -1) 3618 *first_xattr_slot = slot; 3619 if (found_key.offset == xattr_access || 3620 found_key.offset == xattr_default) 3621 return 1; 3622 } 3623 3624 /* 3625 * we found a key greater than an xattr key, there can't 3626 * be any acls later on 3627 */ 3628 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3629 return 0; 3630 3631 slot++; 3632 scanned++; 3633 3634 /* 3635 * it goes inode, inode backrefs, xattrs, extents, 3636 * so if there are a ton of hard links to an inode there can 3637 * be a lot of backrefs. Don't waste time searching too hard, 3638 * this is just an optimization 3639 */ 3640 if (scanned >= 8) 3641 break; 3642 } 3643 /* we hit the end of the leaf before we found an xattr or 3644 * something larger than an xattr. 
We have to assume the inode 3645 * has acls 3646 */ 3647 if (*first_xattr_slot == -1) 3648 *first_xattr_slot = slot; 3649 return 1; 3650 } 3651 3652 /* 3653 * read an inode from the btree into the in-memory inode 3654 */ 3655 static int btrfs_read_locked_inode(struct inode *inode) 3656 { 3657 struct btrfs_path *path; 3658 struct extent_buffer *leaf; 3659 struct btrfs_inode_item *inode_item; 3660 struct btrfs_root *root = BTRFS_I(inode)->root; 3661 struct btrfs_key location; 3662 unsigned long ptr; 3663 int maybe_acls; 3664 u32 rdev; 3665 int ret; 3666 bool filled = false; 3667 int first_xattr_slot; 3668 3669 ret = btrfs_fill_inode(inode, &rdev); 3670 if (!ret) 3671 filled = true; 3672 3673 path = btrfs_alloc_path(); 3674 if (!path) { 3675 ret = -ENOMEM; 3676 goto make_bad; 3677 } 3678 3679 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3680 3681 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3682 if (ret) { 3683 if (ret > 0) 3684 ret = -ENOENT; 3685 goto make_bad; 3686 } 3687 3688 leaf = path->nodes[0]; 3689 3690 if (filled) 3691 goto cache_index; 3692 3693 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3694 struct btrfs_inode_item); 3695 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3696 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3697 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3698 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3699 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3700 3701 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); 3702 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); 3703 3704 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); 3705 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); 3706 3707 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); 3708 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); 3709 3710 BTRFS_I(inode)->i_otime.tv_sec = 3711 btrfs_timespec_sec(leaf, &inode_item->otime); 3712 BTRFS_I(inode)->i_otime.tv_nsec = 3713 btrfs_timespec_nsec(leaf, &inode_item->otime); 3714 3715 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3716 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3717 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3718 3719 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 3720 inode->i_generation = BTRFS_I(inode)->generation; 3721 inode->i_rdev = 0; 3722 rdev = btrfs_inode_rdev(leaf, inode_item); 3723 3724 BTRFS_I(inode)->index_cnt = (u64)-1; 3725 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3726 3727 cache_index: 3728 /* 3729 * If we were modified in the current generation and evicted from memory 3730 * and then re-read we need to do a full sync since we don't have any 3731 * idea about which extents were modified before we were evicted from 3732 * cache. 3733 * 3734 * This is required for both inode re-read from disk and delayed inode 3735 * in delayed_nodes_tree. 3736 */ 3737 if (BTRFS_I(inode)->last_trans == root->fs_info->generation) 3738 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3739 &BTRFS_I(inode)->runtime_flags); 3740 3741 /* 3742 * We don't persist the id of the transaction where an unlink operation 3743 * against the inode was last made. 
So here we assume the inode might 3744 * have been evicted, and therefore the exact value of last_unlink_trans 3745 * lost, and set it to last_trans to avoid metadata inconsistencies 3746 * between the inode and its parent if the inode is fsync'ed and the log 3747 * replayed. For example, in the scenario: 3748 * 3749 * touch mydir/foo 3750 * ln mydir/foo mydir/bar 3751 * sync 3752 * unlink mydir/bar 3753 * echo 2 > /proc/sys/vm/drop_caches # evicts inode 3754 * xfs_io -c fsync mydir/foo 3755 * <power failure> 3756 * mount fs, triggers fsync log replay 3757 * 3758 * We must make sure that when we fsync our inode foo we also log its 3759 * parent inode, otherwise after log replay the parent still has the 3760 * dentry with the "bar" name but our inode foo has a link count of 1 3761 * and doesn't have an inode ref with the name "bar" anymore. 3762 * 3763 * Setting last_unlink_trans to last_trans is a pessimistic approach, 3764 * but it guarantees correctness at the expense of occasional full 3765 * transaction commits on fsync if our inode is a directory, or if our 3766 * inode is not a directory, logging its parent unnecessarily. 3767 */ 3768 BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; 3769 3770 path->slots[0]++; 3771 if (inode->i_nlink != 1 || 3772 path->slots[0] >= btrfs_header_nritems(leaf)) 3773 goto cache_acl; 3774 3775 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3776 if (location.objectid != btrfs_ino(inode)) 3777 goto cache_acl; 3778 3779 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3780 if (location.type == BTRFS_INODE_REF_KEY) { 3781 struct btrfs_inode_ref *ref; 3782 3783 ref = (struct btrfs_inode_ref *)ptr; 3784 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3785 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3786 struct btrfs_inode_extref *extref; 3787 3788 extref = (struct btrfs_inode_extref *)ptr; 3789 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3790 extref); 3791 } 3792 cache_acl: 3793 /* 3794 * try to precache a NULL acl entry for files that don't have 3795 * any xattrs or acls 3796 */ 3797 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3798 btrfs_ino(inode), &first_xattr_slot); 3799 if (first_xattr_slot != -1) { 3800 path->slots[0] = first_xattr_slot; 3801 ret = btrfs_load_inode_props(inode, path); 3802 if (ret) 3803 btrfs_err(root->fs_info, 3804 "error loading props for ino %llu (root %llu): %d", 3805 btrfs_ino(inode), 3806 root->root_key.objectid, ret); 3807 } 3808 btrfs_free_path(path); 3809 3810 if (!maybe_acls) 3811 cache_no_acl(inode); 3812 3813 switch (inode->i_mode & S_IFMT) { 3814 case S_IFREG: 3815 inode->i_mapping->a_ops = &btrfs_aops; 3816 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3817 inode->i_fop = &btrfs_file_operations; 3818 inode->i_op = &btrfs_file_inode_operations; 3819 break; 3820 case S_IFDIR: 3821 inode->i_fop = &btrfs_dir_file_operations; 3822 if (root == root->fs_info->tree_root) 3823 inode->i_op = &btrfs_dir_ro_inode_operations; 3824 else 3825 inode->i_op = &btrfs_dir_inode_operations; 3826 break; 3827 case S_IFLNK: 3828 inode->i_op = &btrfs_symlink_inode_operations; 3829 inode_nohighmem(inode); 3830 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3831 break; 3832 default: 3833 inode->i_op = &btrfs_special_inode_operations; 3834 init_special_inode(inode, inode->i_mode, rdev); 3835 break; 3836 } 3837 3838 btrfs_update_iflags(inode); 3839 return 0; 3840 3841 make_bad: 3842 btrfs_free_path(path); 3843 make_bad_inode(inode); 3844 return ret; 3845 } 3846 3847 /* 
3848 * given a leaf and an inode, copy the inode fields into the leaf 3849 */ 3850 static void fill_inode_item(struct btrfs_trans_handle *trans, 3851 struct extent_buffer *leaf, 3852 struct btrfs_inode_item *item, 3853 struct inode *inode) 3854 { 3855 struct btrfs_map_token token; 3856 3857 btrfs_init_map_token(&token); 3858 3859 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3860 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3861 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, 3862 &token); 3863 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3864 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3865 3866 btrfs_set_token_timespec_sec(leaf, &item->atime, 3867 inode->i_atime.tv_sec, &token); 3868 btrfs_set_token_timespec_nsec(leaf, &item->atime, 3869 inode->i_atime.tv_nsec, &token); 3870 3871 btrfs_set_token_timespec_sec(leaf, &item->mtime, 3872 inode->i_mtime.tv_sec, &token); 3873 btrfs_set_token_timespec_nsec(leaf, &item->mtime, 3874 inode->i_mtime.tv_nsec, &token); 3875 3876 btrfs_set_token_timespec_sec(leaf, &item->ctime, 3877 inode->i_ctime.tv_sec, &token); 3878 btrfs_set_token_timespec_nsec(leaf, &item->ctime, 3879 inode->i_ctime.tv_nsec, &token); 3880 3881 btrfs_set_token_timespec_sec(leaf, &item->otime, 3882 BTRFS_I(inode)->i_otime.tv_sec, &token); 3883 btrfs_set_token_timespec_nsec(leaf, &item->otime, 3884 BTRFS_I(inode)->i_otime.tv_nsec, &token); 3885 3886 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3887 &token); 3888 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3889 &token); 3890 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3891 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3892 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3893 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3894 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3895 } 3896 3897 /* 3898 * copy everything in the in-memory inode into the btree. 3899 */ 3900 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3901 struct btrfs_root *root, struct inode *inode) 3902 { 3903 struct btrfs_inode_item *inode_item; 3904 struct btrfs_path *path; 3905 struct extent_buffer *leaf; 3906 int ret; 3907 3908 path = btrfs_alloc_path(); 3909 if (!path) 3910 return -ENOMEM; 3911 3912 path->leave_spinning = 1; 3913 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 3914 1); 3915 if (ret) { 3916 if (ret > 0) 3917 ret = -ENOENT; 3918 goto failed; 3919 } 3920 3921 leaf = path->nodes[0]; 3922 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3923 struct btrfs_inode_item); 3924 3925 fill_inode_item(trans, leaf, inode_item, inode); 3926 btrfs_mark_buffer_dirty(leaf); 3927 btrfs_set_inode_last_trans(trans, inode); 3928 ret = 0; 3929 failed: 3930 btrfs_free_path(path); 3931 return ret; 3932 } 3933 3934 /* 3935 * copy everything in the in-memory inode into the btree. 3936 */ 3937 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3938 struct btrfs_root *root, struct inode *inode) 3939 { 3940 int ret; 3941 3942 /* 3943 * If the inode is a free space inode, we can deadlock during commit 3944 * if we put it into the delayed code. 
3945 * 3946 * The data relocation inode should also be directly updated 3947 * without delay 3948 */ 3949 if (!btrfs_is_free_space_inode(inode) 3950 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 3951 && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { 3952 btrfs_update_root_times(trans, root); 3953 3954 ret = btrfs_delayed_update_inode(trans, root, inode); 3955 if (!ret) 3956 btrfs_set_inode_last_trans(trans, inode); 3957 return ret; 3958 } 3959 3960 return btrfs_update_inode_item(trans, root, inode); 3961 } 3962 3963 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3964 struct btrfs_root *root, 3965 struct inode *inode) 3966 { 3967 int ret; 3968 3969 ret = btrfs_update_inode(trans, root, inode); 3970 if (ret == -ENOSPC) 3971 return btrfs_update_inode_item(trans, root, inode); 3972 return ret; 3973 } 3974 3975 /* 3976 * unlink helper that gets used here in inode.c and in the tree logging 3977 * recovery code. It removes a link in a directory with a given name, and 3978 * also drops the back refs in the inode to the directory 3979 */ 3980 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3981 struct btrfs_root *root, 3982 struct inode *dir, struct inode *inode, 3983 const char *name, int name_len) 3984 { 3985 struct btrfs_path *path; 3986 int ret = 0; 3987 struct extent_buffer *leaf; 3988 struct btrfs_dir_item *di; 3989 struct btrfs_key key; 3990 u64 index; 3991 u64 ino = btrfs_ino(inode); 3992 u64 dir_ino = btrfs_ino(dir); 3993 3994 path = btrfs_alloc_path(); 3995 if (!path) { 3996 ret = -ENOMEM; 3997 goto out; 3998 } 3999 4000 path->leave_spinning = 1; 4001 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4002 name, name_len, -1); 4003 if (IS_ERR(di)) { 4004 ret = PTR_ERR(di); 4005 goto err; 4006 } 4007 if (!di) { 4008 ret = -ENOENT; 4009 goto err; 4010 } 4011 leaf = path->nodes[0]; 4012 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4013 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4014 if (ret) 4015 goto err; 4016 btrfs_release_path(path); 4017 4018 /* 4019 * If we don't have a cached dir index, we have to get it by looking up 4020 * the inode ref; and since we just looked up the inode ref, we remove 4021 * it directly instead of doing a delayed deletion. 4022 * 4023 * But if we do have the dir index cached, there is no need to search 4024 * for the inode ref. Since the inode ref is close to the inode item, 4025 * it is better to delay its deletion and do it when 4026 * we update the inode item.
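 *
 * So: if dir_index is cached we try to queue the inode ref deletion
 * through btrfs_delayed_delete_inode_ref() and skip the tree search
 * below; if that is not possible, or if there is no cached index,
 * btrfs_del_inode_ref() removes the ref right away and also hands us
 * back the dir index we need.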
4027 */ 4028 if (BTRFS_I(inode)->dir_index) { 4029 ret = btrfs_delayed_delete_inode_ref(inode); 4030 if (!ret) { 4031 index = BTRFS_I(inode)->dir_index; 4032 goto skip_backref; 4033 } 4034 } 4035 4036 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 4037 dir_ino, &index); 4038 if (ret) { 4039 btrfs_info(root->fs_info, 4040 "failed to delete reference to %.*s, inode %llu parent %llu", 4041 name_len, name, ino, dir_ino); 4042 btrfs_abort_transaction(trans, ret); 4043 goto err; 4044 } 4045 skip_backref: 4046 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 4047 if (ret) { 4048 btrfs_abort_transaction(trans, ret); 4049 goto err; 4050 } 4051 4052 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 4053 inode, dir_ino); 4054 if (ret != 0 && ret != -ENOENT) { 4055 btrfs_abort_transaction(trans, ret); 4056 goto err; 4057 } 4058 4059 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 4060 dir, index); 4061 if (ret == -ENOENT) 4062 ret = 0; 4063 else if (ret) 4064 btrfs_abort_transaction(trans, ret); 4065 err: 4066 btrfs_free_path(path); 4067 if (ret) 4068 goto out; 4069 4070 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4071 inode_inc_iversion(inode); 4072 inode_inc_iversion(dir); 4073 inode->i_ctime = dir->i_mtime = 4074 dir->i_ctime = current_time(inode); 4075 ret = btrfs_update_inode(trans, root, dir); 4076 out: 4077 return ret; 4078 } 4079 4080 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4081 struct btrfs_root *root, 4082 struct inode *dir, struct inode *inode, 4083 const char *name, int name_len) 4084 { 4085 int ret; 4086 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 4087 if (!ret) { 4088 drop_nlink(inode); 4089 ret = btrfs_update_inode(trans, root, inode); 4090 } 4091 return ret; 4092 } 4093 4094 /* 4095 * helper to start transaction for unlink and rmdir. 4096 * 4097 * unlink and rmdir are special in btrfs, they do not always free space, so 4098 * if we cannot make our reservations the normal way try and see if there is 4099 * plenty of slack room in the global reserve to migrate, otherwise we cannot 4100 * allow the unlink to occur. 
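 *
 * If even the global reserve does not have enough slack,
 * btrfs_start_transaction_fallback_global_rsv() returns an error pointer
 * and btrfs_unlink()/btrfs_rmdir() below simply propagate that error.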
4101 */ 4102 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 4103 { 4104 struct btrfs_root *root = BTRFS_I(dir)->root; 4105 4106 /* 4107 * 1 for the possible orphan item 4108 * 1 for the dir item 4109 * 1 for the dir index 4110 * 1 for the inode ref 4111 * 1 for the inode 4112 */ 4113 return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); 4114 } 4115 4116 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 4117 { 4118 struct btrfs_root *root = BTRFS_I(dir)->root; 4119 struct btrfs_trans_handle *trans; 4120 struct inode *inode = d_inode(dentry); 4121 int ret; 4122 4123 trans = __unlink_start_trans(dir); 4124 if (IS_ERR(trans)) 4125 return PTR_ERR(trans); 4126 4127 btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); 4128 4129 ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), 4130 dentry->d_name.name, dentry->d_name.len); 4131 if (ret) 4132 goto out; 4133 4134 if (inode->i_nlink == 0) { 4135 ret = btrfs_orphan_add(trans, inode); 4136 if (ret) 4137 goto out; 4138 } 4139 4140 out: 4141 btrfs_end_transaction(trans, root); 4142 btrfs_btree_balance_dirty(root); 4143 return ret; 4144 } 4145 4146 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 4147 struct btrfs_root *root, 4148 struct inode *dir, u64 objectid, 4149 const char *name, int name_len) 4150 { 4151 struct btrfs_path *path; 4152 struct extent_buffer *leaf; 4153 struct btrfs_dir_item *di; 4154 struct btrfs_key key; 4155 u64 index; 4156 int ret; 4157 u64 dir_ino = btrfs_ino(dir); 4158 4159 path = btrfs_alloc_path(); 4160 if (!path) 4161 return -ENOMEM; 4162 4163 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 4164 name, name_len, -1); 4165 if (IS_ERR_OR_NULL(di)) { 4166 if (!di) 4167 ret = -ENOENT; 4168 else 4169 ret = PTR_ERR(di); 4170 goto out; 4171 } 4172 4173 leaf = path->nodes[0]; 4174 btrfs_dir_item_key_to_cpu(leaf, di, &key); 4175 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 4176 ret = btrfs_delete_one_dir_name(trans, root, path, di); 4177 if (ret) { 4178 btrfs_abort_transaction(trans, ret); 4179 goto out; 4180 } 4181 btrfs_release_path(path); 4182 4183 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 4184 objectid, root->root_key.objectid, 4185 dir_ino, &index, name, name_len); 4186 if (ret < 0) { 4187 if (ret != -ENOENT) { 4188 btrfs_abort_transaction(trans, ret); 4189 goto out; 4190 } 4191 di = btrfs_search_dir_index_item(root, path, dir_ino, 4192 name, name_len); 4193 if (IS_ERR_OR_NULL(di)) { 4194 if (!di) 4195 ret = -ENOENT; 4196 else 4197 ret = PTR_ERR(di); 4198 btrfs_abort_transaction(trans, ret); 4199 goto out; 4200 } 4201 4202 leaf = path->nodes[0]; 4203 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4204 btrfs_release_path(path); 4205 index = key.offset; 4206 } 4207 btrfs_release_path(path); 4208 4209 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 4210 if (ret) { 4211 btrfs_abort_transaction(trans, ret); 4212 goto out; 4213 } 4214 4215 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4216 inode_inc_iversion(dir); 4217 dir->i_mtime = dir->i_ctime = current_time(dir); 4218 ret = btrfs_update_inode_fallback(trans, root, dir); 4219 if (ret) 4220 btrfs_abort_transaction(trans, ret); 4221 out: 4222 btrfs_free_path(path); 4223 return ret; 4224 } 4225 4226 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4227 { 4228 struct inode *inode = d_inode(dentry); 4229 int err = 0; 4230 struct btrfs_root *root = BTRFS_I(dir)->root; 4231 struct btrfs_trans_handle *trans; 4232 u64 
last_unlink_trans; 4233 4234 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4235 return -ENOTEMPTY; 4236 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 4237 return -EPERM; 4238 4239 trans = __unlink_start_trans(dir); 4240 if (IS_ERR(trans)) 4241 return PTR_ERR(trans); 4242 4243 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4244 err = btrfs_unlink_subvol(trans, root, dir, 4245 BTRFS_I(inode)->location.objectid, 4246 dentry->d_name.name, 4247 dentry->d_name.len); 4248 goto out; 4249 } 4250 4251 err = btrfs_orphan_add(trans, inode); 4252 if (err) 4253 goto out; 4254 4255 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4256 4257 /* now the directory is empty */ 4258 err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), 4259 dentry->d_name.name, dentry->d_name.len); 4260 if (!err) { 4261 btrfs_i_size_write(inode, 0); 4262 /* 4263 * Propagate the last_unlink_trans value of the deleted dir to 4264 * its parent directory. This is to prevent an unrecoverable 4265 * log tree in the case we do something like this: 4266 * 1) create dir foo 4267 * 2) create snapshot under dir foo 4268 * 3) delete the snapshot 4269 * 4) rmdir foo 4270 * 5) mkdir foo 4271 * 6) fsync foo or some file inside foo 4272 */ 4273 if (last_unlink_trans >= trans->transid) 4274 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4275 } 4276 out: 4277 btrfs_end_transaction(trans, root); 4278 btrfs_btree_balance_dirty(root); 4279 4280 return err; 4281 } 4282 4283 static int truncate_space_check(struct btrfs_trans_handle *trans, 4284 struct btrfs_root *root, 4285 u64 bytes_deleted) 4286 { 4287 int ret; 4288 4289 /* 4290 * This is only used to apply pressure to the enospc system, we don't 4291 * intend to use this reservation at all. 4292 */ 4293 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); 4294 bytes_deleted *= root->nodesize; 4295 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, 4296 bytes_deleted, BTRFS_RESERVE_NO_FLUSH); 4297 if (!ret) { 4298 trace_btrfs_space_reservation(root->fs_info, "transaction", 4299 trans->transid, 4300 bytes_deleted, 1); 4301 trans->bytes_reserved += bytes_deleted; 4302 } 4303 return ret; 4304 4305 } 4306 4307 static int truncate_inline_extent(struct inode *inode, 4308 struct btrfs_path *path, 4309 struct btrfs_key *found_key, 4310 const u64 item_end, 4311 const u64 new_size) 4312 { 4313 struct extent_buffer *leaf = path->nodes[0]; 4314 int slot = path->slots[0]; 4315 struct btrfs_file_extent_item *fi; 4316 u32 size = (u32)(new_size - found_key->offset); 4317 struct btrfs_root *root = BTRFS_I(inode)->root; 4318 4319 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4320 4321 if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { 4322 loff_t offset = new_size; 4323 loff_t page_end = ALIGN(offset, PAGE_SIZE); 4324 4325 /* 4326 * Zero out the remaining of the last page of our inline extent, 4327 * instead of directly truncating our inline extent here - that 4328 * would be much more complex (decompressing all the data, then 4329 * compressing the truncated data, which might be bigger than 4330 * the size of the inline extent, resize the extent, etc). 4331 * We release the path because to get the page we might need to 4332 * read the extent item from disk (data not in the page cache). 
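 *
 * btrfs_truncate_block() below then zeroes from the new size to the end
 * of that page, so the data beyond i_size reads back as zeroes while the
 * compressed extent item on disk is left untouched.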
4333 */ 4334 btrfs_release_path(path); 4335 return btrfs_truncate_block(inode, offset, page_end - offset, 4336 0); 4337 } 4338 4339 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4340 size = btrfs_file_extent_calc_inline_size(size); 4341 btrfs_truncate_item(root, path, size, 1); 4342 4343 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4344 inode_sub_bytes(inode, item_end + 1 - new_size); 4345 4346 return 0; 4347 } 4348 4349 /* 4350 * this can truncate away extent items, csum items and directory items. 4351 * It starts at a high offset and removes keys until it can't find 4352 * any higher than new_size 4353 * 4354 * csum items that cross the new i_size are truncated to the new size 4355 * as well. 4356 * 4357 * min_type is the minimum key type to truncate down to. If set to 0, this 4358 * will kill all the items on this inode, including the INODE_ITEM_KEY. 4359 */ 4360 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4361 struct btrfs_root *root, 4362 struct inode *inode, 4363 u64 new_size, u32 min_type) 4364 { 4365 struct btrfs_path *path; 4366 struct extent_buffer *leaf; 4367 struct btrfs_file_extent_item *fi; 4368 struct btrfs_key key; 4369 struct btrfs_key found_key; 4370 u64 extent_start = 0; 4371 u64 extent_num_bytes = 0; 4372 u64 extent_offset = 0; 4373 u64 item_end = 0; 4374 u64 last_size = new_size; 4375 u32 found_type = (u8)-1; 4376 int found_extent; 4377 int del_item; 4378 int pending_del_nr = 0; 4379 int pending_del_slot = 0; 4380 int extent_type = -1; 4381 int ret; 4382 int err = 0; 4383 u64 ino = btrfs_ino(inode); 4384 u64 bytes_deleted = 0; 4385 bool be_nice = 0; 4386 bool should_throttle = 0; 4387 bool should_end = 0; 4388 4389 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4390 4391 /* 4392 * for non-free space inodes and ref cows, we want to back off from 4393 * time to time 4394 */ 4395 if (!btrfs_is_free_space_inode(inode) && 4396 test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4397 be_nice = 1; 4398 4399 path = btrfs_alloc_path(); 4400 if (!path) 4401 return -ENOMEM; 4402 path->reada = READA_BACK; 4403 4404 /* 4405 * We want to drop from the next block forward in case this new size is 4406 * not block aligned since we will be keeping the last block of the 4407 * extent just the way it is. 4408 */ 4409 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4410 root == root->fs_info->tree_root) 4411 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4412 root->sectorsize), (u64)-1, 0); 4413 4414 /* 4415 * This function is also used to drop the items in the log tree before 4416 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4417 * it is used to drop the loged items. So we shouldn't kill the delayed 4418 * items. 4419 */ 4420 if (min_type == 0 && root == BTRFS_I(inode)->root) 4421 btrfs_kill_delayed_inode_items(inode); 4422 4423 key.objectid = ino; 4424 key.offset = (u64)-1; 4425 key.type = (u8)-1; 4426 4427 search_again: 4428 /* 4429 * with a 16K leaf size and 128MB extents, you can actually queue 4430 * up a huge file in a single leaf. 
Most of the time that 4431 * bytes_deleted is > 0, it will be huge by the time we get here 4432 */ 4433 if (be_nice && bytes_deleted > SZ_32M) { 4434 if (btrfs_should_end_transaction(trans, root)) { 4435 err = -EAGAIN; 4436 goto error; 4437 } 4438 } 4439 4440 4441 path->leave_spinning = 1; 4442 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4443 if (ret < 0) { 4444 err = ret; 4445 goto out; 4446 } 4447 4448 if (ret > 0) { 4449 /* there are no items in the tree for us to truncate, we're 4450 * done 4451 */ 4452 if (path->slots[0] == 0) 4453 goto out; 4454 path->slots[0]--; 4455 } 4456 4457 while (1) { 4458 fi = NULL; 4459 leaf = path->nodes[0]; 4460 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4461 found_type = found_key.type; 4462 4463 if (found_key.objectid != ino) 4464 break; 4465 4466 if (found_type < min_type) 4467 break; 4468 4469 item_end = found_key.offset; 4470 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4471 fi = btrfs_item_ptr(leaf, path->slots[0], 4472 struct btrfs_file_extent_item); 4473 extent_type = btrfs_file_extent_type(leaf, fi); 4474 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4475 item_end += 4476 btrfs_file_extent_num_bytes(leaf, fi); 4477 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4478 item_end += btrfs_file_extent_inline_len(leaf, 4479 path->slots[0], fi); 4480 } 4481 item_end--; 4482 } 4483 if (found_type > min_type) { 4484 del_item = 1; 4485 } else { 4486 if (item_end < new_size) 4487 break; 4488 if (found_key.offset >= new_size) 4489 del_item = 1; 4490 else 4491 del_item = 0; 4492 } 4493 found_extent = 0; 4494 /* FIXME, shrink the extent if the ref count is only 1 */ 4495 if (found_type != BTRFS_EXTENT_DATA_KEY) 4496 goto delete; 4497 4498 if (del_item) 4499 last_size = found_key.offset; 4500 else 4501 last_size = new_size; 4502 4503 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4504 u64 num_dec; 4505 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4506 if (!del_item) { 4507 u64 orig_num_bytes = 4508 btrfs_file_extent_num_bytes(leaf, fi); 4509 extent_num_bytes = ALIGN(new_size - 4510 found_key.offset, 4511 root->sectorsize); 4512 btrfs_set_file_extent_num_bytes(leaf, fi, 4513 extent_num_bytes); 4514 num_dec = (orig_num_bytes - 4515 extent_num_bytes); 4516 if (test_bit(BTRFS_ROOT_REF_COWS, 4517 &root->state) && 4518 extent_start != 0) 4519 inode_sub_bytes(inode, num_dec); 4520 btrfs_mark_buffer_dirty(leaf); 4521 } else { 4522 extent_num_bytes = 4523 btrfs_file_extent_disk_num_bytes(leaf, 4524 fi); 4525 extent_offset = found_key.offset - 4526 btrfs_file_extent_offset(leaf, fi); 4527 4528 /* FIXME blocksize != 4096 */ 4529 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4530 if (extent_start != 0) { 4531 found_extent = 1; 4532 if (test_bit(BTRFS_ROOT_REF_COWS, 4533 &root->state)) 4534 inode_sub_bytes(inode, num_dec); 4535 } 4536 } 4537 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4538 /* 4539 * we can't truncate inline items that have had 4540 * special encodings 4541 */ 4542 if (!del_item && 4543 btrfs_file_extent_encryption(leaf, fi) == 0 && 4544 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4545 4546 /* 4547 * Need to release path in order to truncate a 4548 * compressed extent. So delete any accumulated 4549 * extent items so far. 
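 *
 * The slots recorded in pending_del_slot are only valid for the leaf the
 * path currently points to, so those items have to be deleted before the
 * path is released and searched again.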
4550 */ 4551 if (btrfs_file_extent_compression(leaf, fi) != 4552 BTRFS_COMPRESS_NONE && pending_del_nr) { 4553 err = btrfs_del_items(trans, root, path, 4554 pending_del_slot, 4555 pending_del_nr); 4556 if (err) { 4557 btrfs_abort_transaction(trans, 4558 err); 4559 goto error; 4560 } 4561 pending_del_nr = 0; 4562 } 4563 4564 err = truncate_inline_extent(inode, path, 4565 &found_key, 4566 item_end, 4567 new_size); 4568 if (err) { 4569 btrfs_abort_transaction(trans, err); 4570 goto error; 4571 } 4572 } else if (test_bit(BTRFS_ROOT_REF_COWS, 4573 &root->state)) { 4574 inode_sub_bytes(inode, item_end + 1 - new_size); 4575 } 4576 } 4577 delete: 4578 if (del_item) { 4579 if (!pending_del_nr) { 4580 /* no pending yet, add ourselves */ 4581 pending_del_slot = path->slots[0]; 4582 pending_del_nr = 1; 4583 } else if (pending_del_nr && 4584 path->slots[0] + 1 == pending_del_slot) { 4585 /* hop on the pending chunk */ 4586 pending_del_nr++; 4587 pending_del_slot = path->slots[0]; 4588 } else { 4589 BUG(); 4590 } 4591 } else { 4592 break; 4593 } 4594 should_throttle = 0; 4595 4596 if (found_extent && 4597 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4598 root == root->fs_info->tree_root)) { 4599 btrfs_set_path_blocking(path); 4600 bytes_deleted += extent_num_bytes; 4601 ret = btrfs_free_extent(trans, root, extent_start, 4602 extent_num_bytes, 0, 4603 btrfs_header_owner(leaf), 4604 ino, extent_offset); 4605 BUG_ON(ret); 4606 if (btrfs_should_throttle_delayed_refs(trans, root)) 4607 btrfs_async_run_delayed_refs(root, 4608 trans->transid, 4609 trans->delayed_ref_updates * 2, 0); 4610 if (be_nice) { 4611 if (truncate_space_check(trans, root, 4612 extent_num_bytes)) { 4613 should_end = 1; 4614 } 4615 if (btrfs_should_throttle_delayed_refs(trans, 4616 root)) { 4617 should_throttle = 1; 4618 } 4619 } 4620 } 4621 4622 if (found_type == BTRFS_INODE_ITEM_KEY) 4623 break; 4624 4625 if (path->slots[0] == 0 || 4626 path->slots[0] != pending_del_slot || 4627 should_throttle || should_end) { 4628 if (pending_del_nr) { 4629 ret = btrfs_del_items(trans, root, path, 4630 pending_del_slot, 4631 pending_del_nr); 4632 if (ret) { 4633 btrfs_abort_transaction(trans, ret); 4634 goto error; 4635 } 4636 pending_del_nr = 0; 4637 } 4638 btrfs_release_path(path); 4639 if (should_throttle) { 4640 unsigned long updates = trans->delayed_ref_updates; 4641 if (updates) { 4642 trans->delayed_ref_updates = 0; 4643 ret = btrfs_run_delayed_refs(trans, root, updates * 2); 4644 if (ret && !err) 4645 err = ret; 4646 } 4647 } 4648 /* 4649 * if we failed to refill our space rsv, bail out 4650 * and let the transaction restart 4651 */ 4652 if (should_end) { 4653 err = -EAGAIN; 4654 goto error; 4655 } 4656 goto search_again; 4657 } else { 4658 path->slots[0]--; 4659 } 4660 } 4661 out: 4662 if (pending_del_nr) { 4663 ret = btrfs_del_items(trans, root, path, pending_del_slot, 4664 pending_del_nr); 4665 if (ret) 4666 btrfs_abort_transaction(trans, ret); 4667 } 4668 error: 4669 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4670 btrfs_ordered_update_i_size(inode, last_size, NULL); 4671 4672 btrfs_free_path(path); 4673 4674 if (be_nice && bytes_deleted > SZ_32M) { 4675 unsigned long updates = trans->delayed_ref_updates; 4676 if (updates) { 4677 trans->delayed_ref_updates = 0; 4678 ret = btrfs_run_delayed_refs(trans, root, updates * 2); 4679 if (ret && !err) 4680 err = ret; 4681 } 4682 } 4683 return err; 4684 } 4685 4686 /* 4687 * btrfs_truncate_block - read, zero a chunk and write a block 4688 * @inode - inode that we're zeroing 4689 * 
@from - the offset to start zeroing 4690 * @len - the length to zero, 0 to zero the entire range respective to the 4691 * offset 4692 * @front - zero up to the offset instead of from the offset on 4693 * 4694 * This will find the block for the "from" offset and cow the block and zero the 4695 * part we want to zero. This is used with truncate and hole punching. 4696 */ 4697 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, 4698 int front) 4699 { 4700 struct address_space *mapping = inode->i_mapping; 4701 struct btrfs_root *root = BTRFS_I(inode)->root; 4702 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4703 struct btrfs_ordered_extent *ordered; 4704 struct extent_state *cached_state = NULL; 4705 char *kaddr; 4706 u32 blocksize = root->sectorsize; 4707 pgoff_t index = from >> PAGE_SHIFT; 4708 unsigned offset = from & (blocksize - 1); 4709 struct page *page; 4710 gfp_t mask = btrfs_alloc_write_mask(mapping); 4711 int ret = 0; 4712 u64 block_start; 4713 u64 block_end; 4714 4715 if ((offset & (blocksize - 1)) == 0 && 4716 (!len || ((len & (blocksize - 1)) == 0))) 4717 goto out; 4718 4719 ret = btrfs_delalloc_reserve_space(inode, 4720 round_down(from, blocksize), blocksize); 4721 if (ret) 4722 goto out; 4723 4724 again: 4725 page = find_or_create_page(mapping, index, mask); 4726 if (!page) { 4727 btrfs_delalloc_release_space(inode, 4728 round_down(from, blocksize), 4729 blocksize); 4730 ret = -ENOMEM; 4731 goto out; 4732 } 4733 4734 block_start = round_down(from, blocksize); 4735 block_end = block_start + blocksize - 1; 4736 4737 if (!PageUptodate(page)) { 4738 ret = btrfs_readpage(NULL, page); 4739 lock_page(page); 4740 if (page->mapping != mapping) { 4741 unlock_page(page); 4742 put_page(page); 4743 goto again; 4744 } 4745 if (!PageUptodate(page)) { 4746 ret = -EIO; 4747 goto out_unlock; 4748 } 4749 } 4750 wait_on_page_writeback(page); 4751 4752 lock_extent_bits(io_tree, block_start, block_end, &cached_state); 4753 set_page_extent_mapped(page); 4754 4755 ordered = btrfs_lookup_ordered_extent(inode, block_start); 4756 if (ordered) { 4757 unlock_extent_cached(io_tree, block_start, block_end, 4758 &cached_state, GFP_NOFS); 4759 unlock_page(page); 4760 put_page(page); 4761 btrfs_start_ordered_extent(inode, ordered, 1); 4762 btrfs_put_ordered_extent(ordered); 4763 goto again; 4764 } 4765 4766 clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, 4767 EXTENT_DIRTY | EXTENT_DELALLOC | 4768 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4769 0, 0, &cached_state, GFP_NOFS); 4770 4771 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 4772 &cached_state, 0); 4773 if (ret) { 4774 unlock_extent_cached(io_tree, block_start, block_end, 4775 &cached_state, GFP_NOFS); 4776 goto out_unlock; 4777 } 4778 4779 if (offset != blocksize) { 4780 if (!len) 4781 len = blocksize - offset; 4782 kaddr = kmap(page); 4783 if (front) 4784 memset(kaddr + (block_start - page_offset(page)), 4785 0, offset); 4786 else 4787 memset(kaddr + (block_start - page_offset(page)) + offset, 4788 0, len); 4789 flush_dcache_page(page); 4790 kunmap(page); 4791 } 4792 ClearPageChecked(page); 4793 set_page_dirty(page); 4794 unlock_extent_cached(io_tree, block_start, block_end, &cached_state, 4795 GFP_NOFS); 4796 4797 out_unlock: 4798 if (ret) 4799 btrfs_delalloc_release_space(inode, block_start, 4800 blocksize); 4801 unlock_page(page); 4802 put_page(page); 4803 out: 4804 return ret; 4805 } 4806 4807 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, 4808 u64 offset, u64 
len) 4809 { 4810 struct btrfs_trans_handle *trans; 4811 int ret; 4812 4813 /* 4814 * Still need to make sure the inode looks like it's been updated so 4815 * that any holes get logged if we fsync. 4816 */ 4817 if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) { 4818 BTRFS_I(inode)->last_trans = root->fs_info->generation; 4819 BTRFS_I(inode)->last_sub_trans = root->log_transid; 4820 BTRFS_I(inode)->last_log_commit = root->last_log_commit; 4821 return 0; 4822 } 4823 4824 /* 4825 * 1 - for the one we're dropping 4826 * 1 - for the one we're adding 4827 * 1 - for updating the inode. 4828 */ 4829 trans = btrfs_start_transaction(root, 3); 4830 if (IS_ERR(trans)) 4831 return PTR_ERR(trans); 4832 4833 ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); 4834 if (ret) { 4835 btrfs_abort_transaction(trans, ret); 4836 btrfs_end_transaction(trans, root); 4837 return ret; 4838 } 4839 4840 ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 4841 0, 0, len, 0, len, 0, 0, 0); 4842 if (ret) 4843 btrfs_abort_transaction(trans, ret); 4844 else 4845 btrfs_update_inode(trans, root, inode); 4846 btrfs_end_transaction(trans, root); 4847 return ret; 4848 } 4849 4850 /* 4851 * This function puts in dummy file extents for the area we're creating a hole 4852 * for. So if we are truncating this file to a larger size we need to insert 4853 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for 4854 * the range between oldsize and size 4855 */ 4856 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 4857 { 4858 struct btrfs_root *root = BTRFS_I(inode)->root; 4859 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4860 struct extent_map *em = NULL; 4861 struct extent_state *cached_state = NULL; 4862 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4863 u64 hole_start = ALIGN(oldsize, root->sectorsize); 4864 u64 block_end = ALIGN(size, root->sectorsize); 4865 u64 last_byte; 4866 u64 cur_offset; 4867 u64 hole_size; 4868 int err = 0; 4869 4870 /* 4871 * If our size started in the middle of a block we need to zero out the 4872 * rest of the block before we expand the i_size, otherwise we could 4873 * expose stale data. 
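 *
 * For example, with a 4K block size, expanding a 6K file to 1M first
 * zeroes bytes 6144..8191 of the last block and only then covers the
 * 8K..1M range with hole extents (or, with the NO_HOLES feature, just
 * marks the inode as needing logging, see maybe_insert_hole()).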
4874 */ 4875 err = btrfs_truncate_block(inode, oldsize, 0, 0); 4876 if (err) 4877 return err; 4878 4879 if (size <= hole_start) 4880 return 0; 4881 4882 while (1) { 4883 struct btrfs_ordered_extent *ordered; 4884 4885 lock_extent_bits(io_tree, hole_start, block_end - 1, 4886 &cached_state); 4887 ordered = btrfs_lookup_ordered_range(inode, hole_start, 4888 block_end - hole_start); 4889 if (!ordered) 4890 break; 4891 unlock_extent_cached(io_tree, hole_start, block_end - 1, 4892 &cached_state, GFP_NOFS); 4893 btrfs_start_ordered_extent(inode, ordered, 1); 4894 btrfs_put_ordered_extent(ordered); 4895 } 4896 4897 cur_offset = hole_start; 4898 while (1) { 4899 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 4900 block_end - cur_offset, 0); 4901 if (IS_ERR(em)) { 4902 err = PTR_ERR(em); 4903 em = NULL; 4904 break; 4905 } 4906 last_byte = min(extent_map_end(em), block_end); 4907 last_byte = ALIGN(last_byte , root->sectorsize); 4908 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4909 struct extent_map *hole_em; 4910 hole_size = last_byte - cur_offset; 4911 4912 err = maybe_insert_hole(root, inode, cur_offset, 4913 hole_size); 4914 if (err) 4915 break; 4916 btrfs_drop_extent_cache(inode, cur_offset, 4917 cur_offset + hole_size - 1, 0); 4918 hole_em = alloc_extent_map(); 4919 if (!hole_em) { 4920 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4921 &BTRFS_I(inode)->runtime_flags); 4922 goto next; 4923 } 4924 hole_em->start = cur_offset; 4925 hole_em->len = hole_size; 4926 hole_em->orig_start = cur_offset; 4927 4928 hole_em->block_start = EXTENT_MAP_HOLE; 4929 hole_em->block_len = 0; 4930 hole_em->orig_block_len = 0; 4931 hole_em->ram_bytes = hole_size; 4932 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 4933 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4934 hole_em->generation = root->fs_info->generation; 4935 4936 while (1) { 4937 write_lock(&em_tree->lock); 4938 err = add_extent_mapping(em_tree, hole_em, 1); 4939 write_unlock(&em_tree->lock); 4940 if (err != -EEXIST) 4941 break; 4942 btrfs_drop_extent_cache(inode, cur_offset, 4943 cur_offset + 4944 hole_size - 1, 0); 4945 } 4946 free_extent_map(hole_em); 4947 } 4948 next: 4949 free_extent_map(em); 4950 em = NULL; 4951 cur_offset = last_byte; 4952 if (cur_offset >= block_end) 4953 break; 4954 } 4955 free_extent_map(em); 4956 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 4957 GFP_NOFS); 4958 return err; 4959 } 4960 4961 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4962 { 4963 struct btrfs_root *root = BTRFS_I(inode)->root; 4964 struct btrfs_trans_handle *trans; 4965 loff_t oldsize = i_size_read(inode); 4966 loff_t newsize = attr->ia_size; 4967 int mask = attr->ia_valid; 4968 int ret; 4969 4970 /* 4971 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 4972 * special case where we need to update the times despite not having 4973 * these flags set. For all other operations the VFS set these flags 4974 * explicitly if it wants a timestamp update. 4975 */ 4976 if (newsize != oldsize) { 4977 inode_inc_iversion(inode); 4978 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) 4979 inode->i_ctime = inode->i_mtime = 4980 current_time(inode); 4981 } 4982 4983 if (newsize > oldsize) { 4984 /* 4985 * Don't do an expanding truncate while snapshoting is ongoing. 4986 * This is to ensure the snapshot captures a fully consistent 4987 * state of this file - if the snapshot captures this expanding 4988 * truncation, it must capture all writes that happened before 4989 * this truncation. 
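 *
 * The wait below is paired with btrfs_end_write_no_snapshoting() on
 * every exit path of this expanding truncate branch.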
4990 */ 4991 btrfs_wait_for_snapshot_creation(root); 4992 ret = btrfs_cont_expand(inode, oldsize, newsize); 4993 if (ret) { 4994 btrfs_end_write_no_snapshoting(root); 4995 return ret; 4996 } 4997 4998 trans = btrfs_start_transaction(root, 1); 4999 if (IS_ERR(trans)) { 5000 btrfs_end_write_no_snapshoting(root); 5001 return PTR_ERR(trans); 5002 } 5003 5004 i_size_write(inode, newsize); 5005 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 5006 pagecache_isize_extended(inode, oldsize, newsize); 5007 ret = btrfs_update_inode(trans, root, inode); 5008 btrfs_end_write_no_snapshoting(root); 5009 btrfs_end_transaction(trans, root); 5010 } else { 5011 5012 /* 5013 * We're truncating a file that used to have good data down to 5014 * zero. Make sure it gets into the ordered flush list so that 5015 * any new writes get down to disk quickly. 5016 */ 5017 if (newsize == 0) 5018 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 5019 &BTRFS_I(inode)->runtime_flags); 5020 5021 /* 5022 * 1 for the orphan item we're going to add 5023 * 1 for the orphan item deletion. 5024 */ 5025 trans = btrfs_start_transaction(root, 2); 5026 if (IS_ERR(trans)) 5027 return PTR_ERR(trans); 5028 5029 /* 5030 * We need to do this in case we fail at _any_ point during the 5031 * actual truncate. Once we do the truncate_setsize we could 5032 * invalidate pages which forces any outstanding ordered io to 5033 * be instantly completed which will give us extents that need 5034 * to be truncated. If we fail to add the orphan item we 5035 * could have left over extents that were never meant to live, 5036 * so we need to guarantee from this point on that everything 5037 * will be consistent. 5038 */ 5039 ret = btrfs_orphan_add(trans, inode); 5040 btrfs_end_transaction(trans, root); 5041 if (ret) 5042 return ret; 5043 5044 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 5045 truncate_setsize(inode, newsize); 5046 5047 /* Disable nonlocked read DIO to avoid an endless truncate */ 5048 btrfs_inode_block_unlocked_dio(inode); 5049 inode_dio_wait(inode); 5050 btrfs_inode_resume_unlocked_dio(inode); 5051 5052 ret = btrfs_truncate(inode); 5053 if (ret && inode->i_nlink) { 5054 int err; 5055 5056 /* 5057 * The truncate failed. disk_i_size is only adjusted down 5058 * as we remove extents, so it should represent the true 5059 * size of the inode. Reset the in-memory size to it and 5060 * delete our orphan entry.
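 *
 * Note that even if joining the transaction fails we still call
 * btrfs_orphan_del(NULL, inode) and return the original truncate error
 * instead of the join error.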
5061 */ 5062 trans = btrfs_join_transaction(root); 5063 if (IS_ERR(trans)) { 5064 btrfs_orphan_del(NULL, inode); 5065 return ret; 5066 } 5067 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 5068 err = btrfs_orphan_del(trans, inode); 5069 if (err) 5070 btrfs_abort_transaction(trans, err); 5071 btrfs_end_transaction(trans, root); 5072 } 5073 } 5074 5075 return ret; 5076 } 5077 5078 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 5079 { 5080 struct inode *inode = d_inode(dentry); 5081 struct btrfs_root *root = BTRFS_I(inode)->root; 5082 int err; 5083 5084 if (btrfs_root_readonly(root)) 5085 return -EROFS; 5086 5087 err = setattr_prepare(dentry, attr); 5088 if (err) 5089 return err; 5090 5091 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 5092 err = btrfs_setsize(inode, attr); 5093 if (err) 5094 return err; 5095 } 5096 5097 if (attr->ia_valid) { 5098 setattr_copy(inode, attr); 5099 inode_inc_iversion(inode); 5100 err = btrfs_dirty_inode(inode); 5101 5102 if (!err && attr->ia_valid & ATTR_MODE) 5103 err = posix_acl_chmod(inode, inode->i_mode); 5104 } 5105 5106 return err; 5107 } 5108 5109 /* 5110 * While truncating the inode pages during eviction, we get the VFS calling 5111 * btrfs_invalidatepage() against each page of the inode. This is slow because 5112 * the calls to btrfs_invalidatepage() result in a huge amount of calls to 5113 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting 5114 * extent_state structures over and over, wasting lots of time. 5115 * 5116 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all 5117 * those expensive operations on a per page basis and do only the ordered io 5118 * finishing, while we release here the extent_map and extent_state structures, 5119 * without the excessive merging and splitting. 5120 */ 5121 static void evict_inode_truncate_pages(struct inode *inode) 5122 { 5123 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5124 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; 5125 struct rb_node *node; 5126 5127 ASSERT(inode->i_state & I_FREEING); 5128 truncate_inode_pages_final(&inode->i_data); 5129 5130 write_lock(&map_tree->lock); 5131 while (!RB_EMPTY_ROOT(&map_tree->map)) { 5132 struct extent_map *em; 5133 5134 node = rb_first(&map_tree->map); 5135 em = rb_entry(node, struct extent_map, rb_node); 5136 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 5137 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 5138 remove_extent_mapping(map_tree, em); 5139 free_extent_map(em); 5140 if (need_resched()) { 5141 write_unlock(&map_tree->lock); 5142 cond_resched(); 5143 write_lock(&map_tree->lock); 5144 } 5145 } 5146 write_unlock(&map_tree->lock); 5147 5148 /* 5149 * Keep looping until we have no more ranges in the io tree. 5150 * We can have ongoing bios started by readpages (called from readahead) 5151 * that have their endio callback (extent_io.c:end_bio_extent_readpage) 5152 * still in progress (unlocked the pages in the bio but did not yet 5153 * unlocked the ranges in the io tree). Therefore this means some 5154 * ranges can still be locked and eviction started because before 5155 * submitting those bios, which are executed by a separate task (work 5156 * queue kthread), inode references (inode->i_count) were not taken 5157 * (which would be dropped in the end io callback of each bio). 
5158 * Therefore here we effectively end up waiting for those bios and 5159 * anyone else holding locked ranges without having bumped the inode's 5160 * reference count - if we don't do it, when they access the inode's 5161 * io_tree to unlock a range it may be too late, leading to an 5162 * use-after-free issue. 5163 */ 5164 spin_lock(&io_tree->lock); 5165 while (!RB_EMPTY_ROOT(&io_tree->state)) { 5166 struct extent_state *state; 5167 struct extent_state *cached_state = NULL; 5168 u64 start; 5169 u64 end; 5170 5171 node = rb_first(&io_tree->state); 5172 state = rb_entry(node, struct extent_state, rb_node); 5173 start = state->start; 5174 end = state->end; 5175 spin_unlock(&io_tree->lock); 5176 5177 lock_extent_bits(io_tree, start, end, &cached_state); 5178 5179 /* 5180 * If still has DELALLOC flag, the extent didn't reach disk, 5181 * and its reserved space won't be freed by delayed_ref. 5182 * So we need to free its reserved space here. 5183 * (Refer to comment in btrfs_invalidatepage, case 2) 5184 * 5185 * Note, end is the bytenr of last byte, so we need + 1 here. 5186 */ 5187 if (state->state & EXTENT_DELALLOC) 5188 btrfs_qgroup_free_data(inode, start, end - start + 1); 5189 5190 clear_extent_bit(io_tree, start, end, 5191 EXTENT_LOCKED | EXTENT_DIRTY | 5192 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5193 EXTENT_DEFRAG, 1, 1, 5194 &cached_state, GFP_NOFS); 5195 5196 cond_resched(); 5197 spin_lock(&io_tree->lock); 5198 } 5199 spin_unlock(&io_tree->lock); 5200 } 5201 5202 void btrfs_evict_inode(struct inode *inode) 5203 { 5204 struct btrfs_trans_handle *trans; 5205 struct btrfs_root *root = BTRFS_I(inode)->root; 5206 struct btrfs_block_rsv *rsv, *global_rsv; 5207 int steal_from_global = 0; 5208 u64 min_size; 5209 int ret; 5210 5211 trace_btrfs_inode_evict(inode); 5212 5213 if (!root) { 5214 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5215 return; 5216 } 5217 5218 min_size = btrfs_calc_trunc_metadata_size(root, 1); 5219 5220 evict_inode_truncate_pages(inode); 5221 5222 if (inode->i_nlink && 5223 ((btrfs_root_refs(&root->root_item) != 0 && 5224 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 5225 btrfs_is_free_space_inode(inode))) 5226 goto no_delete; 5227 5228 if (is_bad_inode(inode)) { 5229 btrfs_orphan_del(NULL, inode); 5230 goto no_delete; 5231 } 5232 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ 5233 if (!special_file(inode->i_mode)) 5234 btrfs_wait_ordered_range(inode, 0, (u64)-1); 5235 5236 btrfs_free_io_failure_record(inode, 0, (u64)-1); 5237 5238 if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { 5239 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 5240 &BTRFS_I(inode)->runtime_flags)); 5241 goto no_delete; 5242 } 5243 5244 if (inode->i_nlink > 0) { 5245 BUG_ON(btrfs_root_refs(&root->root_item) != 0 && 5246 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); 5247 goto no_delete; 5248 } 5249 5250 ret = btrfs_commit_inode_delayed_inode(inode); 5251 if (ret) { 5252 btrfs_orphan_del(NULL, inode); 5253 goto no_delete; 5254 } 5255 5256 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 5257 if (!rsv) { 5258 btrfs_orphan_del(NULL, inode); 5259 goto no_delete; 5260 } 5261 rsv->size = min_size; 5262 rsv->failfast = 1; 5263 global_rsv = &root->fs_info->global_block_rsv; 5264 5265 btrfs_i_size_write(inode, 0); 5266 5267 /* 5268 * This is a bit simpler than btrfs_truncate since we've already 5269 * reserved our space for our orphan item in the unlink, so we just 5270 * need to reserve some slack space in case we add bytes and update 5271 * inode item when doing the truncate. 5272 */ 5273 while (1) { 5274 ret = btrfs_block_rsv_refill(root, rsv, min_size, 5275 BTRFS_RESERVE_FLUSH_LIMIT); 5276 5277 /* 5278 * Try and steal from the global reserve since we will 5279 * likely not use this space anyway, we want to try as 5280 * hard as possible to get this to work. 5281 */ 5282 if (ret) 5283 steal_from_global++; 5284 else 5285 steal_from_global = 0; 5286 ret = 0; 5287 5288 /* 5289 * steal_from_global == 0: we reserved stuff, hooray! 5290 * steal_from_global == 1: we didn't reserve stuff, boo! 5291 * steal_from_global == 2: we've committed, still not a lot of 5292 * room but maybe we'll have room in the global reserve this 5293 * time. 5294 * steal_from_global == 3: abandon all hope! 5295 */ 5296 if (steal_from_global > 2) { 5297 btrfs_warn(root->fs_info, 5298 "Could not get space for a delete, will truncate on mount %d", 5299 ret); 5300 btrfs_orphan_del(NULL, inode); 5301 btrfs_free_block_rsv(root, rsv); 5302 goto no_delete; 5303 } 5304 5305 trans = btrfs_join_transaction(root); 5306 if (IS_ERR(trans)) { 5307 btrfs_orphan_del(NULL, inode); 5308 btrfs_free_block_rsv(root, rsv); 5309 goto no_delete; 5310 } 5311 5312 /* 5313 * We can't just steal from the global reserve, we need to make 5314 * sure there is room to do it, if not we need to commit and try 5315 * again. 5316 */ 5317 if (steal_from_global) { 5318 if (!btrfs_check_space_for_delayed_refs(trans, root)) 5319 ret = btrfs_block_rsv_migrate(global_rsv, rsv, 5320 min_size, 0); 5321 else 5322 ret = -ENOSPC; 5323 } 5324 5325 /* 5326 * Couldn't steal from the global reserve, we have too much 5327 * pending stuff built up, commit the transaction and try it 5328 * again. 
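 *
 * After a successful commit we go back to the top of the loop and retry
 * the reservation; steal_from_global counts the consecutive failures and
 * after the third one we give up above and leave the truncation to the
 * orphan cleanup on the next mount.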
5329 */ 5330 if (ret) { 5331 ret = btrfs_commit_transaction(trans, root); 5332 if (ret) { 5333 btrfs_orphan_del(NULL, inode); 5334 btrfs_free_block_rsv(root, rsv); 5335 goto no_delete; 5336 } 5337 continue; 5338 } else { 5339 steal_from_global = 0; 5340 } 5341 5342 trans->block_rsv = rsv; 5343 5344 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 5345 if (ret != -ENOSPC && ret != -EAGAIN) 5346 break; 5347 5348 trans->block_rsv = &root->fs_info->trans_block_rsv; 5349 btrfs_end_transaction(trans, root); 5350 trans = NULL; 5351 btrfs_btree_balance_dirty(root); 5352 } 5353 5354 btrfs_free_block_rsv(root, rsv); 5355 5356 /* 5357 * Errors here aren't a big deal, it just means we leave orphan items 5358 * in the tree. They will be cleaned up on the next mount. 5359 */ 5360 if (ret == 0) { 5361 trans->block_rsv = root->orphan_block_rsv; 5362 btrfs_orphan_del(trans, inode); 5363 } else { 5364 btrfs_orphan_del(NULL, inode); 5365 } 5366 5367 trans->block_rsv = &root->fs_info->trans_block_rsv; 5368 if (!(root == root->fs_info->tree_root || 5369 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 5370 btrfs_return_ino(root, btrfs_ino(inode)); 5371 5372 btrfs_end_transaction(trans, root); 5373 btrfs_btree_balance_dirty(root); 5374 no_delete: 5375 btrfs_remove_delayed_node(inode); 5376 clear_inode(inode); 5377 } 5378 5379 /* 5380 * this returns the key found in the dir entry in the location pointer. 5381 * If no dir entries were found, location->objectid is 0. 5382 */ 5383 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 5384 struct btrfs_key *location) 5385 { 5386 const char *name = dentry->d_name.name; 5387 int namelen = dentry->d_name.len; 5388 struct btrfs_dir_item *di; 5389 struct btrfs_path *path; 5390 struct btrfs_root *root = BTRFS_I(dir)->root; 5391 int ret = 0; 5392 5393 path = btrfs_alloc_path(); 5394 if (!path) 5395 return -ENOMEM; 5396 5397 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 5398 namelen, 0); 5399 if (IS_ERR(di)) 5400 ret = PTR_ERR(di); 5401 5402 if (IS_ERR_OR_NULL(di)) 5403 goto out_err; 5404 5405 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 5406 out: 5407 btrfs_free_path(path); 5408 return ret; 5409 out_err: 5410 location->objectid = 0; 5411 goto out; 5412 } 5413 5414 /* 5415 * when we hit a tree root in a directory, the btrfs part of the inode 5416 * needs to be changed to reflect the root directory of the tree root. This 5417 * is kind of like crossing a mount point. 
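 *
 * We verify that the tree of tree roots has a BTRFS_ROOT_REF_KEY item
 * connecting our parent root to location->objectid for this directory
 * and name, then read the target root and rewrite *location to point at
 * the root directory item of that subvolume.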
5418 */ 5419 static int fixup_tree_root_location(struct btrfs_root *root, 5420 struct inode *dir, 5421 struct dentry *dentry, 5422 struct btrfs_key *location, 5423 struct btrfs_root **sub_root) 5424 { 5425 struct btrfs_path *path; 5426 struct btrfs_root *new_root; 5427 struct btrfs_root_ref *ref; 5428 struct extent_buffer *leaf; 5429 struct btrfs_key key; 5430 int ret; 5431 int err = 0; 5432 5433 path = btrfs_alloc_path(); 5434 if (!path) { 5435 err = -ENOMEM; 5436 goto out; 5437 } 5438 5439 err = -ENOENT; 5440 key.objectid = BTRFS_I(dir)->root->root_key.objectid; 5441 key.type = BTRFS_ROOT_REF_KEY; 5442 key.offset = location->objectid; 5443 5444 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, 5445 0, 0); 5446 if (ret) { 5447 if (ret < 0) 5448 err = ret; 5449 goto out; 5450 } 5451 5452 leaf = path->nodes[0]; 5453 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 5454 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 5455 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 5456 goto out; 5457 5458 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 5459 (unsigned long)(ref + 1), 5460 dentry->d_name.len); 5461 if (ret) 5462 goto out; 5463 5464 btrfs_release_path(path); 5465 5466 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 5467 if (IS_ERR(new_root)) { 5468 err = PTR_ERR(new_root); 5469 goto out; 5470 } 5471 5472 *sub_root = new_root; 5473 location->objectid = btrfs_root_dirid(&new_root->root_item); 5474 location->type = BTRFS_INODE_ITEM_KEY; 5475 location->offset = 0; 5476 err = 0; 5477 out: 5478 btrfs_free_path(path); 5479 return err; 5480 } 5481 5482 static void inode_tree_add(struct inode *inode) 5483 { 5484 struct btrfs_root *root = BTRFS_I(inode)->root; 5485 struct btrfs_inode *entry; 5486 struct rb_node **p; 5487 struct rb_node *parent; 5488 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5489 u64 ino = btrfs_ino(inode); 5490 5491 if (inode_unhashed(inode)) 5492 return; 5493 parent = NULL; 5494 spin_lock(&root->inode_lock); 5495 p = &root->inode_tree.rb_node; 5496 while (*p) { 5497 parent = *p; 5498 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5499 5500 if (ino < btrfs_ino(&entry->vfs_inode)) 5501 p = &parent->rb_left; 5502 else if (ino > btrfs_ino(&entry->vfs_inode)) 5503 p = &parent->rb_right; 5504 else { 5505 WARN_ON(!(entry->vfs_inode.i_state & 5506 (I_WILL_FREE | I_FREEING))); 5507 rb_replace_node(parent, new, &root->inode_tree); 5508 RB_CLEAR_NODE(parent); 5509 spin_unlock(&root->inode_lock); 5510 return; 5511 } 5512 } 5513 rb_link_node(new, parent, p); 5514 rb_insert_color(new, &root->inode_tree); 5515 spin_unlock(&root->inode_lock); 5516 } 5517 5518 static void inode_tree_del(struct inode *inode) 5519 { 5520 struct btrfs_root *root = BTRFS_I(inode)->root; 5521 int empty = 0; 5522 5523 spin_lock(&root->inode_lock); 5524 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 5525 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 5526 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 5527 empty = RB_EMPTY_ROOT(&root->inode_tree); 5528 } 5529 spin_unlock(&root->inode_lock); 5530 5531 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5532 synchronize_srcu(&root->fs_info->subvol_srcu); 5533 spin_lock(&root->inode_lock); 5534 empty = RB_EMPTY_ROOT(&root->inode_tree); 5535 spin_unlock(&root->inode_lock); 5536 if (empty) 5537 btrfs_add_dead_root(root); 5538 } 5539 } 5540 5541 void btrfs_invalidate_inodes(struct btrfs_root *root) 5542 { 5543 struct rb_node *node; 5544 struct rb_node *prev; 5545 struct btrfs_inode 
*entry; 5546 struct inode *inode; 5547 u64 objectid = 0; 5548 5549 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 5550 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 5551 5552 spin_lock(&root->inode_lock); 5553 again: 5554 node = root->inode_tree.rb_node; 5555 prev = NULL; 5556 while (node) { 5557 prev = node; 5558 entry = rb_entry(node, struct btrfs_inode, rb_node); 5559 5560 if (objectid < btrfs_ino(&entry->vfs_inode)) 5561 node = node->rb_left; 5562 else if (objectid > btrfs_ino(&entry->vfs_inode)) 5563 node = node->rb_right; 5564 else 5565 break; 5566 } 5567 if (!node) { 5568 while (prev) { 5569 entry = rb_entry(prev, struct btrfs_inode, rb_node); 5570 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 5571 node = prev; 5572 break; 5573 } 5574 prev = rb_next(prev); 5575 } 5576 } 5577 while (node) { 5578 entry = rb_entry(node, struct btrfs_inode, rb_node); 5579 objectid = btrfs_ino(&entry->vfs_inode) + 1; 5580 inode = igrab(&entry->vfs_inode); 5581 if (inode) { 5582 spin_unlock(&root->inode_lock); 5583 if (atomic_read(&inode->i_count) > 1) 5584 d_prune_aliases(inode); 5585 /* 5586 * btrfs_drop_inode will have it removed from 5587 * the inode cache when its usage count 5588 * hits zero. 5589 */ 5590 iput(inode); 5591 cond_resched(); 5592 spin_lock(&root->inode_lock); 5593 goto again; 5594 } 5595 5596 if (cond_resched_lock(&root->inode_lock)) 5597 goto again; 5598 5599 node = rb_next(node); 5600 } 5601 spin_unlock(&root->inode_lock); 5602 } 5603 5604 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5605 { 5606 struct btrfs_iget_args *args = p; 5607 inode->i_ino = args->location->objectid; 5608 memcpy(&BTRFS_I(inode)->location, args->location, 5609 sizeof(*args->location)); 5610 BTRFS_I(inode)->root = args->root; 5611 return 0; 5612 } 5613 5614 static int btrfs_find_actor(struct inode *inode, void *opaque) 5615 { 5616 struct btrfs_iget_args *args = opaque; 5617 return args->location->objectid == BTRFS_I(inode)->location.objectid && 5618 args->root == BTRFS_I(inode)->root; 5619 } 5620 5621 static struct inode *btrfs_iget_locked(struct super_block *s, 5622 struct btrfs_key *location, 5623 struct btrfs_root *root) 5624 { 5625 struct inode *inode; 5626 struct btrfs_iget_args args; 5627 unsigned long hashval = btrfs_inode_hash(location->objectid, root); 5628 5629 args.location = location; 5630 args.root = root; 5631 5632 inode = iget5_locked(s, hashval, btrfs_find_actor, 5633 btrfs_init_locked_inode, 5634 (void *)&args); 5635 return inode; 5636 } 5637 5638 /* Get an inode object given its location and corresponding root. 5639 * Returns in *is_new if the inode was read from disk 5640 */ 5641 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5642 struct btrfs_root *root, int *new) 5643 { 5644 struct inode *inode; 5645 5646 inode = btrfs_iget_locked(s, location, root); 5647 if (!inode) 5648 return ERR_PTR(-ENOMEM); 5649 5650 if (inode->i_state & I_NEW) { 5651 int ret; 5652 5653 ret = btrfs_read_locked_inode(inode); 5654 if (!is_bad_inode(inode)) { 5655 inode_tree_add(inode); 5656 unlock_new_inode(inode); 5657 if (new) 5658 *new = 1; 5659 } else { 5660 unlock_new_inode(inode); 5661 iput(inode); 5662 ASSERT(ret < 0); 5663 inode = ERR_PTR(ret < 0 ? 
ret : -ESTALE); 5664 } 5665 } 5666 5667 return inode; 5668 } 5669 5670 static struct inode *new_simple_dir(struct super_block *s, 5671 struct btrfs_key *key, 5672 struct btrfs_root *root) 5673 { 5674 struct inode *inode = new_inode(s); 5675 5676 if (!inode) 5677 return ERR_PTR(-ENOMEM); 5678 5679 BTRFS_I(inode)->root = root; 5680 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 5681 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 5682 5683 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 5684 inode->i_op = &btrfs_dir_ro_inode_operations; 5685 inode->i_fop = &simple_dir_operations; 5686 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5687 inode->i_mtime = current_time(inode); 5688 inode->i_atime = inode->i_mtime; 5689 inode->i_ctime = inode->i_mtime; 5690 BTRFS_I(inode)->i_otime = inode->i_mtime; 5691 5692 return inode; 5693 } 5694 5695 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 5696 { 5697 struct inode *inode; 5698 struct btrfs_root *root = BTRFS_I(dir)->root; 5699 struct btrfs_root *sub_root = root; 5700 struct btrfs_key location; 5701 int index; 5702 int ret = 0; 5703 5704 if (dentry->d_name.len > BTRFS_NAME_LEN) 5705 return ERR_PTR(-ENAMETOOLONG); 5706 5707 ret = btrfs_inode_by_name(dir, dentry, &location); 5708 if (ret < 0) 5709 return ERR_PTR(ret); 5710 5711 if (location.objectid == 0) 5712 return ERR_PTR(-ENOENT); 5713 5714 if (location.type == BTRFS_INODE_ITEM_KEY) { 5715 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 5716 return inode; 5717 } 5718 5719 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 5720 5721 index = srcu_read_lock(&root->fs_info->subvol_srcu); 5722 ret = fixup_tree_root_location(root, dir, dentry, 5723 &location, &sub_root); 5724 if (ret < 0) { 5725 if (ret != -ENOENT) 5726 inode = ERR_PTR(ret); 5727 else 5728 inode = new_simple_dir(dir->i_sb, &location, sub_root); 5729 } else { 5730 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 5731 } 5732 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 5733 5734 if (!IS_ERR(inode) && root != sub_root) { 5735 down_read(&root->fs_info->cleanup_work_sem); 5736 if (!(inode->i_sb->s_flags & MS_RDONLY)) 5737 ret = btrfs_orphan_cleanup(sub_root); 5738 up_read(&root->fs_info->cleanup_work_sem); 5739 if (ret) { 5740 iput(inode); 5741 inode = ERR_PTR(ret); 5742 } 5743 } 5744 5745 return inode; 5746 } 5747 5748 static int btrfs_dentry_delete(const struct dentry *dentry) 5749 { 5750 struct btrfs_root *root; 5751 struct inode *inode = d_inode(dentry); 5752 5753 if (!inode && !IS_ROOT(dentry)) 5754 inode = d_inode(dentry->d_parent); 5755 5756 if (inode) { 5757 root = BTRFS_I(inode)->root; 5758 if (btrfs_root_refs(&root->root_item) == 0) 5759 return 1; 5760 5761 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 5762 return 1; 5763 } 5764 return 0; 5765 } 5766 5767 static void btrfs_dentry_release(struct dentry *dentry) 5768 { 5769 kfree(dentry->d_fsdata); 5770 } 5771 5772 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5773 unsigned int flags) 5774 { 5775 struct inode *inode; 5776 5777 inode = btrfs_lookup_dentry(dir, dentry); 5778 if (IS_ERR(inode)) { 5779 if (PTR_ERR(inode) == -ENOENT) 5780 inode = NULL; 5781 else 5782 return ERR_CAST(inode); 5783 } 5784 5785 return d_splice_alias(inode, dentry); 5786 } 5787 5788 unsigned char btrfs_filetype_table[] = { 5789 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 5790 }; 5791 5792 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) 5793 { 5794 
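	/*
	 * Emit "." and ".." first, then walk the directory entries starting
	 * at ctx->pos.  For a normal subvolume we iterate the DIR_INDEX items
	 * and merge in the delayed index items that have not been flushed to
	 * the btree yet; for the tree root we use DIR_ITEM keys instead.
	 */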
struct inode *inode = file_inode(file); 5795 struct btrfs_root *root = BTRFS_I(inode)->root; 5796 struct btrfs_item *item; 5797 struct btrfs_dir_item *di; 5798 struct btrfs_key key; 5799 struct btrfs_key found_key; 5800 struct btrfs_path *path; 5801 struct list_head ins_list; 5802 struct list_head del_list; 5803 int ret; 5804 struct extent_buffer *leaf; 5805 int slot; 5806 unsigned char d_type; 5807 int over = 0; 5808 u32 di_cur; 5809 u32 di_total; 5810 u32 di_len; 5811 int key_type = BTRFS_DIR_INDEX_KEY; 5812 char tmp_name[32]; 5813 char *name_ptr; 5814 int name_len; 5815 int is_curr = 0; /* ctx->pos points to the current index? */ 5816 bool emitted; 5817 bool put = false; 5818 5819 /* FIXME, use a real flag for deciding about the key type */ 5820 if (root->fs_info->tree_root == root) 5821 key_type = BTRFS_DIR_ITEM_KEY; 5822 5823 if (!dir_emit_dots(file, ctx)) 5824 return 0; 5825 5826 path = btrfs_alloc_path(); 5827 if (!path) 5828 return -ENOMEM; 5829 5830 path->reada = READA_FORWARD; 5831 5832 if (key_type == BTRFS_DIR_INDEX_KEY) { 5833 INIT_LIST_HEAD(&ins_list); 5834 INIT_LIST_HEAD(&del_list); 5835 put = btrfs_readdir_get_delayed_items(inode, &ins_list, 5836 &del_list); 5837 } 5838 5839 key.type = key_type; 5840 key.offset = ctx->pos; 5841 key.objectid = btrfs_ino(inode); 5842 5843 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5844 if (ret < 0) 5845 goto err; 5846 5847 emitted = false; 5848 while (1) { 5849 leaf = path->nodes[0]; 5850 slot = path->slots[0]; 5851 if (slot >= btrfs_header_nritems(leaf)) { 5852 ret = btrfs_next_leaf(root, path); 5853 if (ret < 0) 5854 goto err; 5855 else if (ret > 0) 5856 break; 5857 continue; 5858 } 5859 5860 item = btrfs_item_nr(slot); 5861 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5862 5863 if (found_key.objectid != key.objectid) 5864 break; 5865 if (found_key.type != key_type) 5866 break; 5867 if (found_key.offset < ctx->pos) 5868 goto next; 5869 if (key_type == BTRFS_DIR_INDEX_KEY && 5870 btrfs_should_delete_dir_index(&del_list, 5871 found_key.offset)) 5872 goto next; 5873 5874 ctx->pos = found_key.offset; 5875 is_curr = 1; 5876 5877 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5878 di_cur = 0; 5879 di_total = btrfs_item_size(leaf, item); 5880 5881 while (di_cur < di_total) { 5882 struct btrfs_key location; 5883 5884 if (verify_dir_item(root, leaf, di)) 5885 break; 5886 5887 name_len = btrfs_dir_name_len(leaf, di); 5888 if (name_len <= sizeof(tmp_name)) { 5889 name_ptr = tmp_name; 5890 } else { 5891 name_ptr = kmalloc(name_len, GFP_KERNEL); 5892 if (!name_ptr) { 5893 ret = -ENOMEM; 5894 goto err; 5895 } 5896 } 5897 read_extent_buffer(leaf, name_ptr, 5898 (unsigned long)(di + 1), name_len); 5899 5900 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 5901 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5902 5903 5904 /* is this a reference to our own snapshot? If so 5905 * skip it. 5906 * 5907 * In contrast to old kernels, we insert the snapshot's 5908 * dir item and dir index after it has been created, so 5909 * we won't find a reference to our own snapshot. We 5910 * still keep the following code for backward 5911 * compatibility. 
5912 */ 5913 if (location.type == BTRFS_ROOT_ITEM_KEY && 5914 location.objectid == root->root_key.objectid) { 5915 over = 0; 5916 goto skip; 5917 } 5918 over = !dir_emit(ctx, name_ptr, name_len, 5919 location.objectid, d_type); 5920 5921 skip: 5922 if (name_ptr != tmp_name) 5923 kfree(name_ptr); 5924 5925 if (over) 5926 goto nopos; 5927 emitted = true; 5928 di_len = btrfs_dir_name_len(leaf, di) + 5929 btrfs_dir_data_len(leaf, di) + sizeof(*di); 5930 di_cur += di_len; 5931 di = (struct btrfs_dir_item *)((char *)di + di_len); 5932 } 5933 next: 5934 path->slots[0]++; 5935 } 5936 5937 if (key_type == BTRFS_DIR_INDEX_KEY) { 5938 if (is_curr) 5939 ctx->pos++; 5940 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); 5941 if (ret) 5942 goto nopos; 5943 } 5944 5945 /* 5946 * If we haven't emitted any dir entry, we must not touch ctx->pos as 5947 * it was set to the termination value in a previous call. We assume 5948 * that "." and ".." were emitted if we reach this point and set the 5949 * termination value as well for an empty directory. 5950 */ 5951 if (ctx->pos > 2 && !emitted) 5952 goto nopos; 5953 5954 /* Reached end of directory/root. Bump pos past the last item. */ 5955 ctx->pos++; 5956 5957 /* 5958 * Stop new entries from being returned after we return the last 5959 * entry. 5960 * 5961 * New directory entries are assigned a strictly increasing 5962 * offset. This means that new entries created during readdir 5963 * are *guaranteed* to be seen in the future by that readdir. 5964 * This has broken buggy programs which operate on names as 5965 * they're returned by readdir. Until we re-use freed offsets 5966 * we have this hack to stop new entries from being returned 5967 * under the assumption that they'll never reach this huge 5968 * offset. 5969 * 5970 * This is being careful not to overflow 32bit loff_t unless the 5971 * last entry requires it because doing so has broken 32bit apps 5972 * in the past. 5973 */ 5974 if (key_type == BTRFS_DIR_INDEX_KEY) { 5975 if (ctx->pos >= INT_MAX) 5976 ctx->pos = LLONG_MAX; 5977 else 5978 ctx->pos = INT_MAX; 5979 } 5980 nopos: 5981 ret = 0; 5982 err: 5983 if (put) 5984 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); 5985 btrfs_free_path(path); 5986 return ret; 5987 } 5988 5989 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 5990 { 5991 struct btrfs_root *root = BTRFS_I(inode)->root; 5992 struct btrfs_trans_handle *trans; 5993 int ret = 0; 5994 bool nolock = false; 5995 5996 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5997 return 0; 5998 5999 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) 6000 nolock = true; 6001 6002 if (wbc->sync_mode == WB_SYNC_ALL) { 6003 if (nolock) 6004 trans = btrfs_join_transaction_nolock(root); 6005 else 6006 trans = btrfs_join_transaction(root); 6007 if (IS_ERR(trans)) 6008 return PTR_ERR(trans); 6009 ret = btrfs_commit_transaction(trans, root); 6010 } 6011 return ret; 6012 } 6013 6014 /* 6015 * This is somewhat expensive, updating the tree every time the 6016 * inode changes. But, it is most likely to find the inode in cache. 6017 * FIXME, needs more benchmarking...there are no reasons other than performance 6018 * to keep or drop this code.
6019 */ 6020 static int btrfs_dirty_inode(struct inode *inode) 6021 { 6022 struct btrfs_root *root = BTRFS_I(inode)->root; 6023 struct btrfs_trans_handle *trans; 6024 int ret; 6025 6026 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 6027 return 0; 6028 6029 trans = btrfs_join_transaction(root); 6030 if (IS_ERR(trans)) 6031 return PTR_ERR(trans); 6032 6033 ret = btrfs_update_inode(trans, root, inode); 6034 if (ret && ret == -ENOSPC) { 6035 /* whoops, lets try again with the full transaction */ 6036 btrfs_end_transaction(trans, root); 6037 trans = btrfs_start_transaction(root, 1); 6038 if (IS_ERR(trans)) 6039 return PTR_ERR(trans); 6040 6041 ret = btrfs_update_inode(trans, root, inode); 6042 } 6043 btrfs_end_transaction(trans, root); 6044 if (BTRFS_I(inode)->delayed_node) 6045 btrfs_balance_delayed_items(root); 6046 6047 return ret; 6048 } 6049 6050 /* 6051 * This is a copy of file_update_time. We need this so we can return error on 6052 * ENOSPC for updating the inode in the case of file write and mmap writes. 6053 */ 6054 static int btrfs_update_time(struct inode *inode, struct timespec *now, 6055 int flags) 6056 { 6057 struct btrfs_root *root = BTRFS_I(inode)->root; 6058 6059 if (btrfs_root_readonly(root)) 6060 return -EROFS; 6061 6062 if (flags & S_VERSION) 6063 inode_inc_iversion(inode); 6064 if (flags & S_CTIME) 6065 inode->i_ctime = *now; 6066 if (flags & S_MTIME) 6067 inode->i_mtime = *now; 6068 if (flags & S_ATIME) 6069 inode->i_atime = *now; 6070 return btrfs_dirty_inode(inode); 6071 } 6072 6073 /* 6074 * find the highest existing sequence number in a directory 6075 * and then set the in-memory index_cnt variable to reflect 6076 * free sequence numbers 6077 */ 6078 static int btrfs_set_inode_index_count(struct inode *inode) 6079 { 6080 struct btrfs_root *root = BTRFS_I(inode)->root; 6081 struct btrfs_key key, found_key; 6082 struct btrfs_path *path; 6083 struct extent_buffer *leaf; 6084 int ret; 6085 6086 key.objectid = btrfs_ino(inode); 6087 key.type = BTRFS_DIR_INDEX_KEY; 6088 key.offset = (u64)-1; 6089 6090 path = btrfs_alloc_path(); 6091 if (!path) 6092 return -ENOMEM; 6093 6094 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6095 if (ret < 0) 6096 goto out; 6097 /* FIXME: we should be able to handle this */ 6098 if (ret == 0) 6099 goto out; 6100 ret = 0; 6101 6102 /* 6103 * MAGIC NUMBER EXPLANATION: 6104 * since we search a directory based on f_pos we have to start at 2 6105 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody 6106 * else has to start at 2 6107 */ 6108 if (path->slots[0] == 0) { 6109 BTRFS_I(inode)->index_cnt = 2; 6110 goto out; 6111 } 6112 6113 path->slots[0]--; 6114 6115 leaf = path->nodes[0]; 6116 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6117 6118 if (found_key.objectid != btrfs_ino(inode) || 6119 found_key.type != BTRFS_DIR_INDEX_KEY) { 6120 BTRFS_I(inode)->index_cnt = 2; 6121 goto out; 6122 } 6123 6124 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 6125 out: 6126 btrfs_free_path(path); 6127 return ret; 6128 } 6129 6130 /* 6131 * helper to find a free sequence number in a given directory. 
This current 6132 * code is very simple, later versions will do smarter things in the btree 6133 */ 6134 int btrfs_set_inode_index(struct inode *dir, u64 *index) 6135 { 6136 int ret = 0; 6137 6138 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 6139 ret = btrfs_inode_delayed_dir_index_count(dir); 6140 if (ret) { 6141 ret = btrfs_set_inode_index_count(dir); 6142 if (ret) 6143 return ret; 6144 } 6145 } 6146 6147 *index = BTRFS_I(dir)->index_cnt; 6148 BTRFS_I(dir)->index_cnt++; 6149 6150 return ret; 6151 } 6152 6153 static int btrfs_insert_inode_locked(struct inode *inode) 6154 { 6155 struct btrfs_iget_args args; 6156 args.location = &BTRFS_I(inode)->location; 6157 args.root = BTRFS_I(inode)->root; 6158 6159 return insert_inode_locked4(inode, 6160 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 6161 btrfs_find_actor, &args); 6162 } 6163 6164 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6165 struct btrfs_root *root, 6166 struct inode *dir, 6167 const char *name, int name_len, 6168 u64 ref_objectid, u64 objectid, 6169 umode_t mode, u64 *index) 6170 { 6171 struct inode *inode; 6172 struct btrfs_inode_item *inode_item; 6173 struct btrfs_key *location; 6174 struct btrfs_path *path; 6175 struct btrfs_inode_ref *ref; 6176 struct btrfs_key key[2]; 6177 u32 sizes[2]; 6178 int nitems = name ? 2 : 1; 6179 unsigned long ptr; 6180 int ret; 6181 6182 path = btrfs_alloc_path(); 6183 if (!path) 6184 return ERR_PTR(-ENOMEM); 6185 6186 inode = new_inode(root->fs_info->sb); 6187 if (!inode) { 6188 btrfs_free_path(path); 6189 return ERR_PTR(-ENOMEM); 6190 } 6191 6192 /* 6193 * O_TMPFILE, set link count to 0, so that after this point, 6194 * we fill in an inode item with the correct link count. 6195 */ 6196 if (!name) 6197 set_nlink(inode, 0); 6198 6199 /* 6200 * we have to initialize this early, so we can reclaim the inode 6201 * number if we fail afterwards in this function. 6202 */ 6203 inode->i_ino = objectid; 6204 6205 if (dir && name) { 6206 trace_btrfs_inode_request(dir); 6207 6208 ret = btrfs_set_inode_index(dir, index); 6209 if (ret) { 6210 btrfs_free_path(path); 6211 iput(inode); 6212 return ERR_PTR(ret); 6213 } 6214 } else if (dir) { 6215 *index = 0; 6216 } 6217 /* 6218 * index_cnt is ignored for everything but a dir, 6219 * btrfs_get_inode_index_count has an explanation for the magic 6220 * number 6221 */ 6222 BTRFS_I(inode)->index_cnt = 2; 6223 BTRFS_I(inode)->dir_index = *index; 6224 BTRFS_I(inode)->root = root; 6225 BTRFS_I(inode)->generation = trans->transid; 6226 inode->i_generation = BTRFS_I(inode)->generation; 6227 6228 /* 6229 * We could have gotten an inode number from somebody who was fsynced 6230 * and then removed in this same transaction, so let's just set full 6231 * sync since it will be a full sync anyway and this will blow away the 6232 * old info in the log. 6233 */ 6234 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6235 6236 key[0].objectid = objectid; 6237 key[0].type = BTRFS_INODE_ITEM_KEY; 6238 key[0].offset = 0; 6239 6240 sizes[0] = sizeof(struct btrfs_inode_item); 6241 6242 if (name) { 6243 /* 6244 * Start new inodes with an inode_ref. This is slightly more 6245 * efficient for small numbers of hard links since they will 6246 * be packed into one item. Extended refs will kick in if we 6247 * add more hard links than can fit in the ref item. 
6248 */ 6249 key[1].objectid = objectid; 6250 key[1].type = BTRFS_INODE_REF_KEY; 6251 key[1].offset = ref_objectid; 6252 6253 sizes[1] = name_len + sizeof(*ref); 6254 } 6255 6256 location = &BTRFS_I(inode)->location; 6257 location->objectid = objectid; 6258 location->offset = 0; 6259 location->type = BTRFS_INODE_ITEM_KEY; 6260 6261 ret = btrfs_insert_inode_locked(inode); 6262 if (ret < 0) 6263 goto fail; 6264 6265 path->leave_spinning = 1; 6266 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); 6267 if (ret != 0) 6268 goto fail_unlock; 6269 6270 inode_init_owner(inode, dir, mode); 6271 inode_set_bytes(inode, 0); 6272 6273 inode->i_mtime = current_time(inode); 6274 inode->i_atime = inode->i_mtime; 6275 inode->i_ctime = inode->i_mtime; 6276 BTRFS_I(inode)->i_otime = inode->i_mtime; 6277 6278 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6279 struct btrfs_inode_item); 6280 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 6281 sizeof(*inode_item)); 6282 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6283 6284 if (name) { 6285 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6286 struct btrfs_inode_ref); 6287 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6288 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 6289 ptr = (unsigned long)(ref + 1); 6290 write_extent_buffer(path->nodes[0], name, ptr, name_len); 6291 } 6292 6293 btrfs_mark_buffer_dirty(path->nodes[0]); 6294 btrfs_free_path(path); 6295 6296 btrfs_inherit_iflags(inode, dir); 6297 6298 if (S_ISREG(mode)) { 6299 if (btrfs_test_opt(root->fs_info, NODATASUM)) 6300 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6301 if (btrfs_test_opt(root->fs_info, NODATACOW)) 6302 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6303 BTRFS_INODE_NODATASUM; 6304 } 6305 6306 inode_tree_add(inode); 6307 6308 trace_btrfs_inode_new(inode); 6309 btrfs_set_inode_last_trans(trans, inode); 6310 6311 btrfs_update_root_times(trans, root); 6312 6313 ret = btrfs_inode_inherit_props(trans, inode, dir); 6314 if (ret) 6315 btrfs_err(root->fs_info, 6316 "error inheriting props for ino %llu (root %llu): %d", 6317 btrfs_ino(inode), root->root_key.objectid, ret); 6318 6319 return inode; 6320 6321 fail_unlock: 6322 unlock_new_inode(inode); 6323 fail: 6324 if (dir && name) 6325 BTRFS_I(dir)->index_cnt--; 6326 btrfs_free_path(path); 6327 iput(inode); 6328 return ERR_PTR(ret); 6329 } 6330 6331 static inline u8 btrfs_inode_type(struct inode *inode) 6332 { 6333 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 6334 } 6335 6336 /* 6337 * utility function to add 'inode' into 'parent_inode' with 6338 * a given name and a given sequence number. 6339 * if 'add_backref' is true, also insert a backref from the 6340 * inode to the parent directory.
6341 */ 6342 int btrfs_add_link(struct btrfs_trans_handle *trans, 6343 struct inode *parent_inode, struct inode *inode, 6344 const char *name, int name_len, int add_backref, u64 index) 6345 { 6346 int ret = 0; 6347 struct btrfs_key key; 6348 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 6349 u64 ino = btrfs_ino(inode); 6350 u64 parent_ino = btrfs_ino(parent_inode); 6351 6352 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6353 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 6354 } else { 6355 key.objectid = ino; 6356 key.type = BTRFS_INODE_ITEM_KEY; 6357 key.offset = 0; 6358 } 6359 6360 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6361 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 6362 key.objectid, root->root_key.objectid, 6363 parent_ino, index, name, name_len); 6364 } else if (add_backref) { 6365 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 6366 parent_ino, index); 6367 } 6368 6369 /* Nothing to clean up yet */ 6370 if (ret) 6371 return ret; 6372 6373 ret = btrfs_insert_dir_item(trans, root, name, name_len, 6374 parent_inode, &key, 6375 btrfs_inode_type(inode), index); 6376 if (ret == -EEXIST || ret == -EOVERFLOW) 6377 goto fail_dir_item; 6378 else if (ret) { 6379 btrfs_abort_transaction(trans, ret); 6380 return ret; 6381 } 6382 6383 btrfs_i_size_write(parent_inode, parent_inode->i_size + 6384 name_len * 2); 6385 inode_inc_iversion(parent_inode); 6386 parent_inode->i_mtime = parent_inode->i_ctime = 6387 current_time(parent_inode); 6388 ret = btrfs_update_inode(trans, root, parent_inode); 6389 if (ret) 6390 btrfs_abort_transaction(trans, ret); 6391 return ret; 6392 6393 fail_dir_item: 6394 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 6395 u64 local_index; 6396 int err; 6397 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 6398 key.objectid, root->root_key.objectid, 6399 parent_ino, &local_index, name, name_len); 6400 6401 } else if (add_backref) { 6402 u64 local_index; 6403 int err; 6404 6405 err = btrfs_del_inode_ref(trans, root, name, name_len, 6406 ino, parent_ino, &local_index); 6407 } 6408 return ret; 6409 } 6410 6411 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6412 struct inode *dir, struct dentry *dentry, 6413 struct inode *inode, int backref, u64 index) 6414 { 6415 int err = btrfs_add_link(trans, dir, inode, 6416 dentry->d_name.name, dentry->d_name.len, 6417 backref, index); 6418 if (err > 0) 6419 err = -EEXIST; 6420 return err; 6421 } 6422 6423 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 6424 umode_t mode, dev_t rdev) 6425 { 6426 struct btrfs_trans_handle *trans; 6427 struct btrfs_root *root = BTRFS_I(dir)->root; 6428 struct inode *inode = NULL; 6429 int err; 6430 int drop_inode = 0; 6431 u64 objectid; 6432 u64 index = 0; 6433 6434 /* 6435 * 2 for inode item and ref 6436 * 2 for dir items 6437 * 1 for xattr if selinux is on 6438 */ 6439 trans = btrfs_start_transaction(root, 5); 6440 if (IS_ERR(trans)) 6441 return PTR_ERR(trans); 6442 6443 err = btrfs_find_free_ino(root, &objectid); 6444 if (err) 6445 goto out_unlock; 6446 6447 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6448 dentry->d_name.len, btrfs_ino(dir), objectid, 6449 mode, &index); 6450 if (IS_ERR(inode)) { 6451 err = PTR_ERR(inode); 6452 goto out_unlock; 6453 } 6454 6455 /* 6456 * If the active LSM wants to access the inode during 6457 * d_instantiate it needs these. Smack checks to see 6458 * if the filesystem supports xattrs by looking at the 6459 * ops vector. 
6460 */ 6461 inode->i_op = &btrfs_special_inode_operations; 6462 init_special_inode(inode, inode->i_mode, rdev); 6463 6464 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6465 if (err) 6466 goto out_unlock_inode; 6467 6468 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 6469 if (err) { 6470 goto out_unlock_inode; 6471 } else { 6472 btrfs_update_inode(trans, root, inode); 6473 unlock_new_inode(inode); 6474 d_instantiate(dentry, inode); 6475 } 6476 6477 out_unlock: 6478 btrfs_end_transaction(trans, root); 6479 btrfs_balance_delayed_items(root); 6480 btrfs_btree_balance_dirty(root); 6481 if (drop_inode) { 6482 inode_dec_link_count(inode); 6483 iput(inode); 6484 } 6485 return err; 6486 6487 out_unlock_inode: 6488 drop_inode = 1; 6489 unlock_new_inode(inode); 6490 goto out_unlock; 6491 6492 } 6493 6494 static int btrfs_create(struct inode *dir, struct dentry *dentry, 6495 umode_t mode, bool excl) 6496 { 6497 struct btrfs_trans_handle *trans; 6498 struct btrfs_root *root = BTRFS_I(dir)->root; 6499 struct inode *inode = NULL; 6500 int drop_inode_on_err = 0; 6501 int err; 6502 u64 objectid; 6503 u64 index = 0; 6504 6505 /* 6506 * 2 for inode item and ref 6507 * 2 for dir items 6508 * 1 for xattr if selinux is on 6509 */ 6510 trans = btrfs_start_transaction(root, 5); 6511 if (IS_ERR(trans)) 6512 return PTR_ERR(trans); 6513 6514 err = btrfs_find_free_ino(root, &objectid); 6515 if (err) 6516 goto out_unlock; 6517 6518 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6519 dentry->d_name.len, btrfs_ino(dir), objectid, 6520 mode, &index); 6521 if (IS_ERR(inode)) { 6522 err = PTR_ERR(inode); 6523 goto out_unlock; 6524 } 6525 drop_inode_on_err = 1; 6526 /* 6527 * If the active LSM wants to access the inode during 6528 * d_instantiate it needs these. Smack checks to see 6529 * if the filesystem supports xattrs by looking at the 6530 * ops vector. 
6531 */ 6532 inode->i_fop = &btrfs_file_operations; 6533 inode->i_op = &btrfs_file_inode_operations; 6534 inode->i_mapping->a_ops = &btrfs_aops; 6535 6536 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6537 if (err) 6538 goto out_unlock_inode; 6539 6540 err = btrfs_update_inode(trans, root, inode); 6541 if (err) 6542 goto out_unlock_inode; 6543 6544 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 6545 if (err) 6546 goto out_unlock_inode; 6547 6548 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 6549 unlock_new_inode(inode); 6550 d_instantiate(dentry, inode); 6551 6552 out_unlock: 6553 btrfs_end_transaction(trans, root); 6554 if (err && drop_inode_on_err) { 6555 inode_dec_link_count(inode); 6556 iput(inode); 6557 } 6558 btrfs_balance_delayed_items(root); 6559 btrfs_btree_balance_dirty(root); 6560 return err; 6561 6562 out_unlock_inode: 6563 unlock_new_inode(inode); 6564 goto out_unlock; 6565 6566 } 6567 6568 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6569 struct dentry *dentry) 6570 { 6571 struct btrfs_trans_handle *trans = NULL; 6572 struct btrfs_root *root = BTRFS_I(dir)->root; 6573 struct inode *inode = d_inode(old_dentry); 6574 u64 index; 6575 int err; 6576 int drop_inode = 0; 6577 6578 /* do not allow sys_link's with other subvols of the same device */ 6579 if (root->objectid != BTRFS_I(inode)->root->objectid) 6580 return -EXDEV; 6581 6582 if (inode->i_nlink >= BTRFS_LINK_MAX) 6583 return -EMLINK; 6584 6585 err = btrfs_set_inode_index(dir, &index); 6586 if (err) 6587 goto fail; 6588 6589 /* 6590 * 2 items for inode and inode ref 6591 * 2 items for dir items 6592 * 1 item for parent inode 6593 */ 6594 trans = btrfs_start_transaction(root, 5); 6595 if (IS_ERR(trans)) { 6596 err = PTR_ERR(trans); 6597 trans = NULL; 6598 goto fail; 6599 } 6600 6601 /* There are several dir indexes for this inode, clear the cache. */ 6602 BTRFS_I(inode)->dir_index = 0ULL; 6603 inc_nlink(inode); 6604 inode_inc_iversion(inode); 6605 inode->i_ctime = current_time(inode); 6606 ihold(inode); 6607 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6608 6609 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 6610 6611 if (err) { 6612 drop_inode = 1; 6613 } else { 6614 struct dentry *parent = dentry->d_parent; 6615 err = btrfs_update_inode(trans, root, inode); 6616 if (err) 6617 goto fail; 6618 if (inode->i_nlink == 1) { 6619 /* 6620 * If new hard link count is 1, it's a file created 6621 * with open(2) O_TMPFILE flag. 
6622 */ 6623 err = btrfs_orphan_del(trans, inode); 6624 if (err) 6625 goto fail; 6626 } 6627 d_instantiate(dentry, inode); 6628 btrfs_log_new_name(trans, inode, NULL, parent); 6629 } 6630 6631 btrfs_balance_delayed_items(root); 6632 fail: 6633 if (trans) 6634 btrfs_end_transaction(trans, root); 6635 if (drop_inode) { 6636 inode_dec_link_count(inode); 6637 iput(inode); 6638 } 6639 btrfs_btree_balance_dirty(root); 6640 return err; 6641 } 6642 6643 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 6644 { 6645 struct inode *inode = NULL; 6646 struct btrfs_trans_handle *trans; 6647 struct btrfs_root *root = BTRFS_I(dir)->root; 6648 int err = 0; 6649 int drop_on_err = 0; 6650 u64 objectid = 0; 6651 u64 index = 0; 6652 6653 /* 6654 * 2 items for inode and ref 6655 * 2 items for dir items 6656 * 1 for xattr if selinux is on 6657 */ 6658 trans = btrfs_start_transaction(root, 5); 6659 if (IS_ERR(trans)) 6660 return PTR_ERR(trans); 6661 6662 err = btrfs_find_free_ino(root, &objectid); 6663 if (err) 6664 goto out_fail; 6665 6666 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6667 dentry->d_name.len, btrfs_ino(dir), objectid, 6668 S_IFDIR | mode, &index); 6669 if (IS_ERR(inode)) { 6670 err = PTR_ERR(inode); 6671 goto out_fail; 6672 } 6673 6674 drop_on_err = 1; 6675 /* these must be set before we unlock the inode */ 6676 inode->i_op = &btrfs_dir_inode_operations; 6677 inode->i_fop = &btrfs_dir_file_operations; 6678 6679 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6680 if (err) 6681 goto out_fail_inode; 6682 6683 btrfs_i_size_write(inode, 0); 6684 err = btrfs_update_inode(trans, root, inode); 6685 if (err) 6686 goto out_fail_inode; 6687 6688 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 6689 dentry->d_name.len, 0, index); 6690 if (err) 6691 goto out_fail_inode; 6692 6693 d_instantiate(dentry, inode); 6694 /* 6695 * mkdir is special. We're unlocking after we call d_instantiate 6696 * to avoid a race with nfsd calling d_instantiate. 6697 */ 6698 unlock_new_inode(inode); 6699 drop_on_err = 0; 6700 6701 out_fail: 6702 btrfs_end_transaction(trans, root); 6703 if (drop_on_err) { 6704 inode_dec_link_count(inode); 6705 iput(inode); 6706 } 6707 btrfs_balance_delayed_items(root); 6708 btrfs_btree_balance_dirty(root); 6709 return err; 6710 6711 out_fail_inode: 6712 unlock_new_inode(inode); 6713 goto out_fail; 6714 } 6715 6716 /* Find next extent map of a given extent map, caller needs to ensure locks */ 6717 static struct extent_map *next_extent_map(struct extent_map *em) 6718 { 6719 struct rb_node *next; 6720 6721 next = rb_next(&em->rb_node); 6722 if (!next) 6723 return NULL; 6724 return container_of(next, struct extent_map, rb_node); 6725 } 6726 6727 static struct extent_map *prev_extent_map(struct extent_map *em) 6728 { 6729 struct rb_node *prev; 6730 6731 prev = rb_prev(&em->rb_node); 6732 if (!prev) 6733 return NULL; 6734 return container_of(prev, struct extent_map, rb_node); 6735 } 6736 6737 /* helper for btrfs_get_extent. Given an existing extent in the tree 6738 * (the existing extent is the nearest extent to map_start) 6739 * and an extent that you want to insert, deal with overlap and insert 6740 * the best fitting new extent into the tree.
6741 */ 6742 static int merge_extent_mapping(struct extent_map_tree *em_tree, 6743 struct extent_map *existing, 6744 struct extent_map *em, 6745 u64 map_start) 6746 { 6747 struct extent_map *prev; 6748 struct extent_map *next; 6749 u64 start; 6750 u64 end; 6751 u64 start_diff; 6752 6753 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 6754 6755 if (existing->start > map_start) { 6756 next = existing; 6757 prev = prev_extent_map(next); 6758 } else { 6759 prev = existing; 6760 next = next_extent_map(prev); 6761 } 6762 6763 start = prev ? extent_map_end(prev) : em->start; 6764 start = max_t(u64, start, em->start); 6765 end = next ? next->start : extent_map_end(em); 6766 end = min_t(u64, end, extent_map_end(em)); 6767 start_diff = start - em->start; 6768 em->start = start; 6769 em->len = end - start; 6770 if (em->block_start < EXTENT_MAP_LAST_BYTE && 6771 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 6772 em->block_start += start_diff; 6773 em->block_len -= start_diff; 6774 } 6775 return add_extent_mapping(em_tree, em, 0); 6776 } 6777 6778 static noinline int uncompress_inline(struct btrfs_path *path, 6779 struct page *page, 6780 size_t pg_offset, u64 extent_offset, 6781 struct btrfs_file_extent_item *item) 6782 { 6783 int ret; 6784 struct extent_buffer *leaf = path->nodes[0]; 6785 char *tmp; 6786 size_t max_size; 6787 unsigned long inline_size; 6788 unsigned long ptr; 6789 int compress_type; 6790 6791 WARN_ON(pg_offset != 0); 6792 compress_type = btrfs_file_extent_compression(leaf, item); 6793 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6794 inline_size = btrfs_file_extent_inline_item_len(leaf, 6795 btrfs_item_nr(path->slots[0])); 6796 tmp = kmalloc(inline_size, GFP_NOFS); 6797 if (!tmp) 6798 return -ENOMEM; 6799 ptr = btrfs_file_extent_inline_start(item); 6800 6801 read_extent_buffer(leaf, tmp, ptr, inline_size); 6802 6803 max_size = min_t(unsigned long, PAGE_SIZE, max_size); 6804 ret = btrfs_decompress(compress_type, tmp, page, 6805 extent_offset, inline_size, max_size); 6806 kfree(tmp); 6807 return ret; 6808 } 6809 6810 /* 6811 * a bit scary, this does extent mapping from logical file offset to the disk. 6812 * the ugly parts come from merging extents from the disk with the in-ram 6813 * representation. This gets more complex because of the data=ordered code, 6814 * where the in-ram extents might be locked pending data=ordered completion. 6815 * 6816 * This also copies inline extents directly into the page. 
6817 */ 6818 6819 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 6820 size_t pg_offset, u64 start, u64 len, 6821 int create) 6822 { 6823 int ret; 6824 int err = 0; 6825 u64 extent_start = 0; 6826 u64 extent_end = 0; 6827 u64 objectid = btrfs_ino(inode); 6828 u32 found_type; 6829 struct btrfs_path *path = NULL; 6830 struct btrfs_root *root = BTRFS_I(inode)->root; 6831 struct btrfs_file_extent_item *item; 6832 struct extent_buffer *leaf; 6833 struct btrfs_key found_key; 6834 struct extent_map *em = NULL; 6835 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 6836 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6837 struct btrfs_trans_handle *trans = NULL; 6838 const bool new_inline = !page || create; 6839 6840 again: 6841 read_lock(&em_tree->lock); 6842 em = lookup_extent_mapping(em_tree, start, len); 6843 if (em) 6844 em->bdev = root->fs_info->fs_devices->latest_bdev; 6845 read_unlock(&em_tree->lock); 6846 6847 if (em) { 6848 if (em->start > start || em->start + em->len <= start) 6849 free_extent_map(em); 6850 else if (em->block_start == EXTENT_MAP_INLINE && page) 6851 free_extent_map(em); 6852 else 6853 goto out; 6854 } 6855 em = alloc_extent_map(); 6856 if (!em) { 6857 err = -ENOMEM; 6858 goto out; 6859 } 6860 em->bdev = root->fs_info->fs_devices->latest_bdev; 6861 em->start = EXTENT_MAP_HOLE; 6862 em->orig_start = EXTENT_MAP_HOLE; 6863 em->len = (u64)-1; 6864 em->block_len = (u64)-1; 6865 6866 if (!path) { 6867 path = btrfs_alloc_path(); 6868 if (!path) { 6869 err = -ENOMEM; 6870 goto out; 6871 } 6872 /* 6873 * Chances are we'll be called again, so go ahead and do 6874 * readahead 6875 */ 6876 path->reada = READA_FORWARD; 6877 } 6878 6879 ret = btrfs_lookup_file_extent(trans, root, path, 6880 objectid, start, trans != NULL); 6881 if (ret < 0) { 6882 err = ret; 6883 goto out; 6884 } 6885 6886 if (ret != 0) { 6887 if (path->slots[0] == 0) 6888 goto not_found; 6889 path->slots[0]--; 6890 } 6891 6892 leaf = path->nodes[0]; 6893 item = btrfs_item_ptr(leaf, path->slots[0], 6894 struct btrfs_file_extent_item); 6895 /* are we inside the extent that was found? */ 6896 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6897 found_type = found_key.type; 6898 if (found_key.objectid != objectid || 6899 found_type != BTRFS_EXTENT_DATA_KEY) { 6900 /* 6901 * If we backup past the first extent we want to move forward 6902 * and see if there is an extent in front of us, otherwise we'll 6903 * say there is a hole for our whole search range which can 6904 * cause problems. 
6905 */ 6906 extent_end = start; 6907 goto next; 6908 } 6909 6910 found_type = btrfs_file_extent_type(leaf, item); 6911 extent_start = found_key.offset; 6912 if (found_type == BTRFS_FILE_EXTENT_REG || 6913 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6914 extent_end = extent_start + 6915 btrfs_file_extent_num_bytes(leaf, item); 6916 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6917 size_t size; 6918 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6919 extent_end = ALIGN(extent_start + size, root->sectorsize); 6920 } 6921 next: 6922 if (start >= extent_end) { 6923 path->slots[0]++; 6924 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6925 ret = btrfs_next_leaf(root, path); 6926 if (ret < 0) { 6927 err = ret; 6928 goto out; 6929 } 6930 if (ret > 0) 6931 goto not_found; 6932 leaf = path->nodes[0]; 6933 } 6934 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6935 if (found_key.objectid != objectid || 6936 found_key.type != BTRFS_EXTENT_DATA_KEY) 6937 goto not_found; 6938 if (start + len <= found_key.offset) 6939 goto not_found; 6940 if (start > found_key.offset) 6941 goto next; 6942 em->start = start; 6943 em->orig_start = start; 6944 em->len = found_key.offset - start; 6945 goto not_found_em; 6946 } 6947 6948 btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); 6949 6950 if (found_type == BTRFS_FILE_EXTENT_REG || 6951 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6952 goto insert; 6953 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6954 unsigned long ptr; 6955 char *map; 6956 size_t size; 6957 size_t extent_offset; 6958 size_t copy_size; 6959 6960 if (new_inline) 6961 goto out; 6962 6963 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6964 extent_offset = page_offset(page) + pg_offset - extent_start; 6965 copy_size = min_t(u64, PAGE_SIZE - pg_offset, 6966 size - extent_offset); 6967 em->start = extent_start + extent_offset; 6968 em->len = ALIGN(copy_size, root->sectorsize); 6969 em->orig_block_len = em->len; 6970 em->orig_start = em->start; 6971 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6972 if (create == 0 && !PageUptodate(page)) { 6973 if (btrfs_file_extent_compression(leaf, item) != 6974 BTRFS_COMPRESS_NONE) { 6975 ret = uncompress_inline(path, page, pg_offset, 6976 extent_offset, item); 6977 if (ret) { 6978 err = ret; 6979 goto out; 6980 } 6981 } else { 6982 map = kmap(page); 6983 read_extent_buffer(leaf, map + pg_offset, ptr, 6984 copy_size); 6985 if (pg_offset + copy_size < PAGE_SIZE) { 6986 memset(map + pg_offset + copy_size, 0, 6987 PAGE_SIZE - pg_offset - 6988 copy_size); 6989 } 6990 kunmap(page); 6991 } 6992 flush_dcache_page(page); 6993 } else if (create && PageUptodate(page)) { 6994 BUG(); 6995 if (!trans) { 6996 kunmap(page); 6997 free_extent_map(em); 6998 em = NULL; 6999 7000 btrfs_release_path(path); 7001 trans = btrfs_join_transaction(root); 7002 7003 if (IS_ERR(trans)) 7004 return ERR_CAST(trans); 7005 goto again; 7006 } 7007 map = kmap(page); 7008 write_extent_buffer(leaf, map + pg_offset, ptr, 7009 copy_size); 7010 kunmap(page); 7011 btrfs_mark_buffer_dirty(leaf); 7012 } 7013 set_extent_uptodate(io_tree, em->start, 7014 extent_map_end(em) - 1, NULL, GFP_NOFS); 7015 goto insert; 7016 } 7017 not_found: 7018 em->start = start; 7019 em->orig_start = start; 7020 em->len = len; 7021 not_found_em: 7022 em->block_start = EXTENT_MAP_HOLE; 7023 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 7024 insert: 7025 btrfs_release_path(path); 7026 if (em->start > start || extent_map_end(em) <= start) { 7027 
btrfs_err(root->fs_info, 7028 "bad extent! em: [%llu %llu] passed [%llu %llu]", 7029 em->start, em->len, start, len); 7030 err = -EIO; 7031 goto out; 7032 } 7033 7034 err = 0; 7035 write_lock(&em_tree->lock); 7036 ret = add_extent_mapping(em_tree, em, 0); 7037 /* it is possible that someone inserted the extent into the tree 7038 * while we had the lock dropped. It is also possible that 7039 * an overlapping map exists in the tree 7040 */ 7041 if (ret == -EEXIST) { 7042 struct extent_map *existing; 7043 7044 ret = 0; 7045 7046 existing = search_extent_mapping(em_tree, start, len); 7047 /* 7048 * existing will always be non-NULL, since there must be 7049 * extent causing the -EEXIST. 7050 */ 7051 if (existing->start == em->start && 7052 extent_map_end(existing) == extent_map_end(em) && 7053 em->block_start == existing->block_start) { 7054 /* 7055 * these two extents are the same, it happens 7056 * with inlines especially 7057 */ 7058 free_extent_map(em); 7059 em = existing; 7060 err = 0; 7061 7062 } else if (start >= extent_map_end(existing) || 7063 start <= existing->start) { 7064 /* 7065 * The existing extent map is the one nearest to 7066 * the [start, start + len) range which overlaps 7067 */ 7068 err = merge_extent_mapping(em_tree, existing, 7069 em, start); 7070 free_extent_map(existing); 7071 if (err) { 7072 free_extent_map(em); 7073 em = NULL; 7074 } 7075 } else { 7076 free_extent_map(em); 7077 em = existing; 7078 err = 0; 7079 } 7080 } 7081 write_unlock(&em_tree->lock); 7082 out: 7083 7084 trace_btrfs_get_extent(root, em); 7085 7086 btrfs_free_path(path); 7087 if (trans) { 7088 ret = btrfs_end_transaction(trans, root); 7089 if (!err) 7090 err = ret; 7091 } 7092 if (err) { 7093 free_extent_map(em); 7094 return ERR_PTR(err); 7095 } 7096 BUG_ON(!em); /* Error is always set */ 7097 return em; 7098 } 7099 7100 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 7101 size_t pg_offset, u64 start, u64 len, 7102 int create) 7103 { 7104 struct extent_map *em; 7105 struct extent_map *hole_em = NULL; 7106 u64 range_start = start; 7107 u64 end; 7108 u64 found; 7109 u64 found_end; 7110 int err = 0; 7111 7112 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 7113 if (IS_ERR(em)) 7114 return em; 7115 if (em) { 7116 /* 7117 * if our em maps to 7118 * - a hole or 7119 * - a pre-alloc extent, 7120 * there might actually be delalloc bytes behind it. 
7121 */ 7122 if (em->block_start != EXTENT_MAP_HOLE && 7123 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7124 return em; 7125 else 7126 hole_em = em; 7127 } 7128 7129 /* check to see if we've wrapped (len == -1 or similar) */ 7130 end = start + len; 7131 if (end < start) 7132 end = (u64)-1; 7133 else 7134 end -= 1; 7135 7136 em = NULL; 7137 7138 /* ok, we didn't find anything, lets look for delalloc */ 7139 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 7140 end, len, EXTENT_DELALLOC, 1); 7141 found_end = range_start + found; 7142 if (found_end < range_start) 7143 found_end = (u64)-1; 7144 7145 /* 7146 * we didn't find anything useful, return 7147 * the original results from get_extent() 7148 */ 7149 if (range_start > end || found_end <= start) { 7150 em = hole_em; 7151 hole_em = NULL; 7152 goto out; 7153 } 7154 7155 /* adjust the range_start to make sure it doesn't 7156 * go backwards from the start they passed in 7157 */ 7158 range_start = max(start, range_start); 7159 found = found_end - range_start; 7160 7161 if (found > 0) { 7162 u64 hole_start = start; 7163 u64 hole_len = len; 7164 7165 em = alloc_extent_map(); 7166 if (!em) { 7167 err = -ENOMEM; 7168 goto out; 7169 } 7170 /* 7171 * when btrfs_get_extent can't find anything it 7172 * returns one huge hole 7173 * 7174 * make sure what it found really fits our range, and 7175 * adjust to make sure it is based on the start from 7176 * the caller 7177 */ 7178 if (hole_em) { 7179 u64 calc_end = extent_map_end(hole_em); 7180 7181 if (calc_end <= start || (hole_em->start > end)) { 7182 free_extent_map(hole_em); 7183 hole_em = NULL; 7184 } else { 7185 hole_start = max(hole_em->start, start); 7186 hole_len = calc_end - hole_start; 7187 } 7188 } 7189 em->bdev = NULL; 7190 if (hole_em && range_start > hole_start) { 7191 /* our hole starts before our delalloc, so we 7192 * have to return just the parts of the hole 7193 * that go until the delalloc starts 7194 */ 7195 em->len = min(hole_len, 7196 range_start - hole_start); 7197 em->start = hole_start; 7198 em->orig_start = hole_start; 7199 /* 7200 * don't adjust block start at all, 7201 * it is fixed at EXTENT_MAP_HOLE 7202 */ 7203 em->block_start = hole_em->block_start; 7204 em->block_len = hole_len; 7205 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 7206 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7207 } else { 7208 em->start = range_start; 7209 em->len = found; 7210 em->orig_start = range_start; 7211 em->block_start = EXTENT_MAP_DELALLOC; 7212 em->block_len = found; 7213 } 7214 } else if (hole_em) { 7215 return hole_em; 7216 } 7217 out: 7218 7219 free_extent_map(hole_em); 7220 if (err) { 7221 free_extent_map(em); 7222 return ERR_PTR(err); 7223 } 7224 return em; 7225 } 7226 7227 static struct extent_map *btrfs_create_dio_extent(struct inode *inode, 7228 const u64 start, 7229 const u64 len, 7230 const u64 orig_start, 7231 const u64 block_start, 7232 const u64 block_len, 7233 const u64 orig_block_len, 7234 const u64 ram_bytes, 7235 const int type) 7236 { 7237 struct extent_map *em = NULL; 7238 int ret; 7239 7240 down_read(&BTRFS_I(inode)->dio_sem); 7241 if (type != BTRFS_ORDERED_NOCOW) { 7242 em = create_pinned_em(inode, start, len, orig_start, 7243 block_start, block_len, orig_block_len, 7244 ram_bytes, type); 7245 if (IS_ERR(em)) 7246 goto out; 7247 } 7248 ret = btrfs_add_ordered_extent_dio(inode, start, block_start, 7249 len, block_len, type); 7250 if (ret) { 7251 if (em) { 7252 free_extent_map(em); 7253 btrfs_drop_extent_cache(inode, start, 7254 start + len - 1, 0); 
7255 } 7256 em = ERR_PTR(ret); 7257 } 7258 out: 7259 up_read(&BTRFS_I(inode)->dio_sem); 7260 7261 return em; 7262 } 7263 7264 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 7265 u64 start, u64 len) 7266 { 7267 struct btrfs_root *root = BTRFS_I(inode)->root; 7268 struct extent_map *em; 7269 struct btrfs_key ins; 7270 u64 alloc_hint; 7271 int ret; 7272 7273 alloc_hint = get_extent_allocation_hint(inode, start, len); 7274 ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, 7275 alloc_hint, &ins, 1, 1); 7276 if (ret) 7277 return ERR_PTR(ret); 7278 7279 em = btrfs_create_dio_extent(inode, start, ins.offset, start, 7280 ins.objectid, ins.offset, ins.offset, 7281 ins.offset, 0); 7282 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); 7283 if (IS_ERR(em)) 7284 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 7285 7286 return em; 7287 } 7288 7289 /* 7290 * returns 1 when the nocow is safe, < 1 on error, 0 if the 7291 * block must be cow'd 7292 */ 7293 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 7294 u64 *orig_start, u64 *orig_block_len, 7295 u64 *ram_bytes) 7296 { 7297 struct btrfs_trans_handle *trans; 7298 struct btrfs_path *path; 7299 int ret; 7300 struct extent_buffer *leaf; 7301 struct btrfs_root *root = BTRFS_I(inode)->root; 7302 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7303 struct btrfs_file_extent_item *fi; 7304 struct btrfs_key key; 7305 u64 disk_bytenr; 7306 u64 backref_offset; 7307 u64 extent_end; 7308 u64 num_bytes; 7309 int slot; 7310 int found_type; 7311 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7312 7313 path = btrfs_alloc_path(); 7314 if (!path) 7315 return -ENOMEM; 7316 7317 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 7318 offset, 0); 7319 if (ret < 0) 7320 goto out; 7321 7322 slot = path->slots[0]; 7323 if (ret == 1) { 7324 if (slot == 0) { 7325 /* can't find the item, must cow */ 7326 ret = 0; 7327 goto out; 7328 } 7329 slot--; 7330 } 7331 ret = 0; 7332 leaf = path->nodes[0]; 7333 btrfs_item_key_to_cpu(leaf, &key, slot); 7334 if (key.objectid != btrfs_ino(inode) || 7335 key.type != BTRFS_EXTENT_DATA_KEY) { 7336 /* not our file or wrong item type, must cow */ 7337 goto out; 7338 } 7339 7340 if (key.offset > offset) { 7341 /* Wrong offset, must cow */ 7342 goto out; 7343 } 7344 7345 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7346 found_type = btrfs_file_extent_type(leaf, fi); 7347 if (found_type != BTRFS_FILE_EXTENT_REG && 7348 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7349 /* not a regular extent, must cow */ 7350 goto out; 7351 } 7352 7353 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7354 goto out; 7355 7356 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7357 if (extent_end <= offset) 7358 goto out; 7359 7360 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7361 if (disk_bytenr == 0) 7362 goto out; 7363 7364 if (btrfs_file_extent_compression(leaf, fi) || 7365 btrfs_file_extent_encryption(leaf, fi) || 7366 btrfs_file_extent_other_encoding(leaf, fi)) 7367 goto out; 7368 7369 backref_offset = btrfs_file_extent_offset(leaf, fi); 7370 7371 if (orig_start) { 7372 *orig_start = key.offset - backref_offset; 7373 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7374 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7375 } 7376 7377 if (btrfs_extent_readonly(root, disk_bytenr)) 7378 goto out; 7379 7380 num_bytes = min(offset + *len, extent_end) - offset; 7381 if (!nocow && 
found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7382 u64 range_end; 7383 7384 range_end = round_up(offset + num_bytes, root->sectorsize) - 1; 7385 ret = test_range_bit(io_tree, offset, range_end, 7386 EXTENT_DELALLOC, 0, NULL); 7387 if (ret) { 7388 ret = -EAGAIN; 7389 goto out; 7390 } 7391 } 7392 7393 btrfs_release_path(path); 7394 7395 /* 7396 * look for other files referencing this extent, if we 7397 * find any we must cow 7398 */ 7399 trans = btrfs_join_transaction(root); 7400 if (IS_ERR(trans)) { 7401 ret = 0; 7402 goto out; 7403 } 7404 7405 ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 7406 key.offset - backref_offset, disk_bytenr); 7407 btrfs_end_transaction(trans, root); 7408 if (ret) { 7409 ret = 0; 7410 goto out; 7411 } 7412 7413 /* 7414 * adjust disk_bytenr and num_bytes to cover just the bytes 7415 * in this extent we are about to write. If there 7416 * are any csums in that range we have to cow in order 7417 * to keep the csums correct 7418 */ 7419 disk_bytenr += backref_offset; 7420 disk_bytenr += offset - key.offset; 7421 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 7422 goto out; 7423 /* 7424 * all of the above have passed, it is safe to overwrite this extent 7425 * without cow 7426 */ 7427 *len = num_bytes; 7428 ret = 1; 7429 out: 7430 btrfs_free_path(path); 7431 return ret; 7432 } 7433 7434 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) 7435 { 7436 struct radix_tree_root *root = &inode->i_mapping->page_tree; 7437 int found = false; 7438 void **pagep = NULL; 7439 struct page *page = NULL; 7440 int start_idx; 7441 int end_idx; 7442 7443 start_idx = start >> PAGE_SHIFT; 7444 7445 /* 7446 * end is the last byte in the last page. end == start is legal 7447 */ 7448 end_idx = end >> PAGE_SHIFT; 7449 7450 rcu_read_lock(); 7451 7452 /* Most of the code in this while loop is lifted from 7453 * find_get_page. It's been modified to begin searching from a 7454 * page and return just the first page found in that range. If the 7455 * found idx is less than or equal to the end idx then we know that 7456 * a page exists. If no pages are found or if those pages are 7457 * outside of the range then we're fine (yay!) */ 7458 while (page == NULL && 7459 radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { 7460 page = radix_tree_deref_slot(pagep); 7461 if (unlikely(!page)) 7462 break; 7463 7464 if (radix_tree_exception(page)) { 7465 if (radix_tree_deref_retry(page)) { 7466 page = NULL; 7467 continue; 7468 } 7469 /* 7470 * Otherwise, shmem/tmpfs must be storing a swap entry 7471 * here as an exceptional entry: so return it without 7472 * attempting to raise page count. 7473 */ 7474 page = NULL; 7475 break; /* TODO: Is this relevant for this use case? */ 7476 } 7477 7478 if (!page_cache_get_speculative(page)) { 7479 page = NULL; 7480 continue; 7481 } 7482 7483 /* 7484 * Has the page moved? 7485 * This is part of the lockless pagecache protocol. See 7486 * include/linux/pagemap.h for details. 
7487 */ 7488 if (unlikely(page != *pagep)) { 7489 put_page(page); 7490 page = NULL; 7491 } 7492 } 7493 7494 if (page) { 7495 if (page->index <= end_idx) 7496 found = true; 7497 put_page(page); 7498 } 7499 7500 rcu_read_unlock(); 7501 return found; 7502 } 7503 7504 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7505 struct extent_state **cached_state, int writing) 7506 { 7507 struct btrfs_ordered_extent *ordered; 7508 int ret = 0; 7509 7510 while (1) { 7511 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7512 cached_state); 7513 /* 7514 * We're concerned with the entire range that we're going to be 7515 * doing DIO to, so we need to make sure there's no ordered 7516 * extents in this range. 7517 */ 7518 ordered = btrfs_lookup_ordered_range(inode, lockstart, 7519 lockend - lockstart + 1); 7520 7521 /* 7522 * We need to make sure there are no buffered pages in this 7523 * range either, we could have raced between the invalidate in 7524 * generic_file_direct_write and locking the extent. The 7525 * invalidate needs to happen so that reads after a write do not 7526 * get stale data. 7527 */ 7528 if (!ordered && 7529 (!writing || 7530 !btrfs_page_exists_in_range(inode, lockstart, lockend))) 7531 break; 7532 7533 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7534 cached_state, GFP_NOFS); 7535 7536 if (ordered) { 7537 /* 7538 * If we are doing a DIO read and the ordered extent we 7539 * found is for a buffered write, we can not wait for it 7540 * to complete and retry, because if we do so we can 7541 * deadlock with concurrent buffered writes on page 7542 * locks. This happens only if our DIO read covers more 7543 * than one extent map, if at this point has already 7544 * created an ordered extent for a previous extent map 7545 * and locked its range in the inode's io tree, and a 7546 * concurrent write against that previous extent map's 7547 * range and this range started (we unlock the ranges 7548 * in the io tree only when the bios complete and 7549 * buffered writes always lock pages before attempting 7550 * to lock range in the io tree). 7551 */ 7552 if (writing || 7553 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7554 btrfs_start_ordered_extent(inode, ordered, 1); 7555 else 7556 ret = -ENOTBLK; 7557 btrfs_put_ordered_extent(ordered); 7558 } else { 7559 /* 7560 * We could trigger writeback for this range (and wait 7561 * for it to complete) and then invalidate the pages for 7562 * this range (through invalidate_inode_pages2_range()), 7563 * but that can lead us to a deadlock with a concurrent 7564 * call to readpages() (a buffered read or a defrag call 7565 * triggered a readahead) on a page lock due to an 7566 * ordered dio extent we created before but did not have 7567 * yet a corresponding bio submitted (whence it can not 7568 * complete), which makes readpages() wait for that 7569 * ordered extent to complete while holding a lock on 7570 * that page. 
7571 */ 7572 ret = -ENOTBLK; 7573 } 7574 7575 if (ret) 7576 break; 7577 7578 cond_resched(); 7579 } 7580 7581 return ret; 7582 } 7583 7584 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 7585 u64 len, u64 orig_start, 7586 u64 block_start, u64 block_len, 7587 u64 orig_block_len, u64 ram_bytes, 7588 int type) 7589 { 7590 struct extent_map_tree *em_tree; 7591 struct extent_map *em; 7592 struct btrfs_root *root = BTRFS_I(inode)->root; 7593 int ret; 7594 7595 em_tree = &BTRFS_I(inode)->extent_tree; 7596 em = alloc_extent_map(); 7597 if (!em) 7598 return ERR_PTR(-ENOMEM); 7599 7600 em->start = start; 7601 em->orig_start = orig_start; 7602 em->mod_start = start; 7603 em->mod_len = len; 7604 em->len = len; 7605 em->block_len = block_len; 7606 em->block_start = block_start; 7607 em->bdev = root->fs_info->fs_devices->latest_bdev; 7608 em->orig_block_len = orig_block_len; 7609 em->ram_bytes = ram_bytes; 7610 em->generation = -1; 7611 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7612 if (type == BTRFS_ORDERED_PREALLOC) 7613 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7614 7615 do { 7616 btrfs_drop_extent_cache(inode, em->start, 7617 em->start + em->len - 1, 0); 7618 write_lock(&em_tree->lock); 7619 ret = add_extent_mapping(em_tree, em, 1); 7620 write_unlock(&em_tree->lock); 7621 } while (ret == -EEXIST); 7622 7623 if (ret) { 7624 free_extent_map(em); 7625 return ERR_PTR(ret); 7626 } 7627 7628 return em; 7629 } 7630 7631 static void adjust_dio_outstanding_extents(struct inode *inode, 7632 struct btrfs_dio_data *dio_data, 7633 const u64 len) 7634 { 7635 unsigned num_extents; 7636 7637 num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, 7638 BTRFS_MAX_EXTENT_SIZE); 7639 /* 7640 * If we have an outstanding_extents count still set then we're 7641 * within our reservation, otherwise we need to adjust our inode 7642 * counter appropriately. 7643 */ 7644 if (dio_data->outstanding_extents) { 7645 dio_data->outstanding_extents -= num_extents; 7646 } else { 7647 spin_lock(&BTRFS_I(inode)->lock); 7648 BTRFS_I(inode)->outstanding_extents += num_extents; 7649 spin_unlock(&BTRFS_I(inode)->lock); 7650 } 7651 } 7652 7653 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7654 struct buffer_head *bh_result, int create) 7655 { 7656 struct extent_map *em; 7657 struct btrfs_root *root = BTRFS_I(inode)->root; 7658 struct extent_state *cached_state = NULL; 7659 struct btrfs_dio_data *dio_data = NULL; 7660 u64 start = iblock << inode->i_blkbits; 7661 u64 lockstart, lockend; 7662 u64 len = bh_result->b_size; 7663 int unlock_bits = EXTENT_LOCKED; 7664 int ret = 0; 7665 7666 if (create) 7667 unlock_bits |= EXTENT_DIRTY; 7668 else 7669 len = min_t(u64, len, root->sectorsize); 7670 7671 lockstart = start; 7672 lockend = start + len - 1; 7673 7674 if (current->journal_info) { 7675 /* 7676 * Need to pull our outstanding extents and set journal_info to NULL so 7677 * that anything that needs to check if there's a transaction doesn't get 7678 * confused. 7679 */ 7680 dio_data = current->journal_info; 7681 current->journal_info = NULL; 7682 } 7683 7684 /* 7685 * If this errors out it's because we couldn't invalidate pagecache for 7686 * this range and we need to fallback to buffered. 
7687 */ 7688 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, 7689 create)) { 7690 ret = -ENOTBLK; 7691 goto err; 7692 } 7693 7694 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 7695 if (IS_ERR(em)) { 7696 ret = PTR_ERR(em); 7697 goto unlock_err; 7698 } 7699 7700 /* 7701 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7702 * io. INLINE is special, and we could probably kludge it in here, but 7703 * it's still buffered so for safety let's just fall back to the generic 7704 * buffered path. 7705 * 7706 * For COMPRESSED we _have_ to read the entire extent in so we can 7707 * decompress it, so there will be buffering required no matter what we 7708 * do, so go ahead and fallback to buffered. 7709 * 7710 * We return -ENOTBLK because that's what makes DIO go ahead and go back 7711 * to buffered IO. Don't blame me, this is the price we pay for using 7712 * the generic code. 7713 */ 7714 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7715 em->block_start == EXTENT_MAP_INLINE) { 7716 free_extent_map(em); 7717 ret = -ENOTBLK; 7718 goto unlock_err; 7719 } 7720 7721 /* Just a good old fashioned hole, return */ 7722 if (!create && (em->block_start == EXTENT_MAP_HOLE || 7723 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 7724 free_extent_map(em); 7725 goto unlock_err; 7726 } 7727 7728 /* 7729 * We don't allocate a new extent in the following cases 7730 * 7731 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7732 * existing extent. 7733 * 2) The extent is marked as PREALLOC. We're good to go here and can 7734 * just use the extent. 7735 * 7736 */ 7737 if (!create) { 7738 len = min(len, em->len - (start - em->start)); 7739 lockstart = start + len; 7740 goto unlock; 7741 } 7742 7743 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7744 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7745 em->block_start != EXTENT_MAP_HOLE)) { 7746 int type; 7747 u64 block_start, orig_start, orig_block_len, ram_bytes; 7748 7749 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7750 type = BTRFS_ORDERED_PREALLOC; 7751 else 7752 type = BTRFS_ORDERED_NOCOW; 7753 len = min(len, em->len - (start - em->start)); 7754 block_start = em->block_start + (start - em->start); 7755 7756 if (can_nocow_extent(inode, start, &len, &orig_start, 7757 &orig_block_len, &ram_bytes) == 1 && 7758 btrfs_inc_nocow_writers(root->fs_info, block_start)) { 7759 struct extent_map *em2; 7760 7761 em2 = btrfs_create_dio_extent(inode, start, len, 7762 orig_start, block_start, 7763 len, orig_block_len, 7764 ram_bytes, type); 7765 btrfs_dec_nocow_writers(root->fs_info, block_start); 7766 if (type == BTRFS_ORDERED_PREALLOC) { 7767 free_extent_map(em); 7768 em = em2; 7769 } 7770 if (em2 && IS_ERR(em2)) { 7771 ret = PTR_ERR(em2); 7772 goto unlock_err; 7773 } 7774 /* 7775 * For inode marked NODATACOW or extent marked PREALLOC, 7776 * use the existing or preallocated extent, so we do not 7777 * need to adjust btrfs_space_info's bytes_may_use.
7778 */ 7779 btrfs_free_reserved_data_space_noquota(inode, 7780 start, len); 7781 goto unlock; 7782 } 7783 } 7784 7785 /* 7786 * this will cow the extent, reset the len in case we changed 7787 * it above 7788 */ 7789 len = bh_result->b_size; 7790 free_extent_map(em); 7791 em = btrfs_new_extent_direct(inode, start, len); 7792 if (IS_ERR(em)) { 7793 ret = PTR_ERR(em); 7794 goto unlock_err; 7795 } 7796 len = min(len, em->len - (start - em->start)); 7797 unlock: 7798 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 7799 inode->i_blkbits; 7800 bh_result->b_size = len; 7801 bh_result->b_bdev = em->bdev; 7802 set_buffer_mapped(bh_result); 7803 if (create) { 7804 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7805 set_buffer_new(bh_result); 7806 7807 /* 7808 * Need to update the i_size under the extent lock so buffered 7809 * readers will get the updated i_size when we unlock. 7810 */ 7811 if (start + len > i_size_read(inode)) 7812 i_size_write(inode, start + len); 7813 7814 adjust_dio_outstanding_extents(inode, dio_data, len); 7815 WARN_ON(dio_data->reserve < len); 7816 dio_data->reserve -= len; 7817 dio_data->unsubmitted_oe_range_end = start + len; 7818 current->journal_info = dio_data; 7819 } 7820 7821 /* 7822 * In the case of write we need to clear and unlock the entire range, 7823 * in the case of read we need to unlock only the end area that we 7824 * aren't using if there is any left over space. 7825 */ 7826 if (lockstart < lockend) { 7827 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7828 lockend, unlock_bits, 1, 0, 7829 &cached_state, GFP_NOFS); 7830 } else { 7831 free_extent_state(cached_state); 7832 } 7833 7834 free_extent_map(em); 7835 7836 return 0; 7837 7838 unlock_err: 7839 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7840 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 7841 err: 7842 if (dio_data) 7843 current->journal_info = dio_data; 7844 /* 7845 * Compensate the delalloc release we do in btrfs_direct_IO() when we 7846 * write less data then expected, so that we don't underflow our inode's 7847 * outstanding extents counter. 7848 */ 7849 if (create && dio_data) 7850 adjust_dio_outstanding_extents(inode, dio_data, len); 7851 7852 return ret; 7853 } 7854 7855 static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7856 int mirror_num) 7857 { 7858 struct btrfs_root *root = BTRFS_I(inode)->root; 7859 int ret; 7860 7861 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 7862 7863 bio_get(bio); 7864 7865 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 7866 BTRFS_WQ_ENDIO_DIO_REPAIR); 7867 if (ret) 7868 goto err; 7869 7870 ret = btrfs_map_bio(root, bio, mirror_num, 0); 7871 err: 7872 bio_put(bio); 7873 return ret; 7874 } 7875 7876 static int btrfs_check_dio_repairable(struct inode *inode, 7877 struct bio *failed_bio, 7878 struct io_failure_record *failrec, 7879 int failed_mirror) 7880 { 7881 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7882 int num_copies; 7883 7884 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 7885 if (num_copies == 1) { 7886 /* 7887 * we only have a single copy of the data, so don't bother with 7888 * all the retry and error correction code that follows. no 7889 * matter what the error is, it is very likely to persist. 
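 *
 * Illustrative sketch (not kernel API) of the mirror rotation the rest of
 * this function performs: advance to the next copy, skip the mirror that
 * just failed, and report "unrepairable" once every copy has been tried.
 *
 *	static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
 *	{
 *		this_mirror++;
 *		if (this_mirror == failed_mirror)
 *			this_mirror++;
 *		// e.g. num_copies = 2, failed_mirror = 1 -> read from mirror 2
 *		return this_mirror > num_copies ? 0 : this_mirror;
 *	}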
7890 */ 7891 btrfs_debug(fs_info, 7892 "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 7893 num_copies, failrec->this_mirror, failed_mirror); 7894 return 0; 7895 } 7896 7897 failrec->failed_mirror = failed_mirror; 7898 failrec->this_mirror++; 7899 if (failrec->this_mirror == failed_mirror) 7900 failrec->this_mirror++; 7901 7902 if (failrec->this_mirror > num_copies) { 7903 btrfs_debug(fs_info, 7904 "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 7905 num_copies, failrec->this_mirror, failed_mirror); 7906 return 0; 7907 } 7908 7909 return 1; 7910 } 7911 7912 static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7913 struct page *page, unsigned int pgoff, 7914 u64 start, u64 end, int failed_mirror, 7915 bio_end_io_t *repair_endio, void *repair_arg) 7916 { 7917 struct io_failure_record *failrec; 7918 struct bio *bio; 7919 int isector; 7920 int read_mode; 7921 int ret; 7922 7923 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 7924 7925 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 7926 if (ret) 7927 return ret; 7928 7929 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, 7930 failed_mirror); 7931 if (!ret) { 7932 free_io_failure(inode, failrec); 7933 return -EIO; 7934 } 7935 7936 if ((failed_bio->bi_vcnt > 1) 7937 || (failed_bio->bi_io_vec->bv_len 7938 > BTRFS_I(inode)->root->sectorsize)) 7939 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7940 else 7941 read_mode = READ_SYNC; 7942 7943 isector = start - btrfs_io_bio(failed_bio)->logical; 7944 isector >>= inode->i_sb->s_blocksize_bits; 7945 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7946 pgoff, isector, repair_endio, repair_arg); 7947 if (!bio) { 7948 free_io_failure(inode, failrec); 7949 return -EIO; 7950 } 7951 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 7952 7953 btrfs_debug(BTRFS_I(inode)->root->fs_info, 7954 "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", 7955 read_mode, failrec->this_mirror, failrec->in_validation); 7956 7957 ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); 7958 if (ret) { 7959 free_io_failure(inode, failrec); 7960 bio_put(bio); 7961 } 7962 7963 return ret; 7964 } 7965 7966 struct btrfs_retry_complete { 7967 struct completion done; 7968 struct inode *inode; 7969 u64 start; 7970 int uptodate; 7971 }; 7972 7973 static void btrfs_retry_endio_nocsum(struct bio *bio) 7974 { 7975 struct btrfs_retry_complete *done = bio->bi_private; 7976 struct inode *inode; 7977 struct bio_vec *bvec; 7978 int i; 7979 7980 if (bio->bi_error) 7981 goto end; 7982 7983 ASSERT(bio->bi_vcnt == 1); 7984 inode = bio->bi_io_vec->bv_page->mapping->host; 7985 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); 7986 7987 done->uptodate = 1; 7988 bio_for_each_segment_all(bvec, bio, i) 7989 clean_io_failure(done->inode, done->start, bvec->bv_page, 0); 7990 end: 7991 complete(&done->done); 7992 bio_put(bio); 7993 } 7994 7995 static int __btrfs_correct_data_nocsum(struct inode *inode, 7996 struct btrfs_io_bio *io_bio) 7997 { 7998 struct btrfs_fs_info *fs_info; 7999 struct bio_vec *bvec; 8000 struct btrfs_retry_complete done; 8001 u64 start; 8002 unsigned int pgoff; 8003 u32 sectorsize; 8004 int nr_sectors; 8005 int i; 8006 int ret; 8007 8008 fs_info = BTRFS_I(inode)->root->fs_info; 8009 sectorsize = BTRFS_I(inode)->root->sectorsize; 8010 8011 start = io_bio->logical; 8012 done.inode = inode; 8013 8014 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8015 
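		/*
		 * Added annotation: a single bvec can cover several filesystem
		 * blocks, so the retry below walks it one sectorsize block at a
		 * time.  With a 4K sectorsize, for example, a 16K bv_len gives
		 * BTRFS_BYTES_TO_BLKS() == 4 and four separate repair reads.
		 * The checksummed variant of this walk lives in
		 * __btrfs_subio_endio_read() further down and steps its counter
		 * with --nr_sectors.
		 */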
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 8016 pgoff = bvec->bv_offset; 8017 8018 next_block_or_try_again: 8019 done.uptodate = 0; 8020 done.start = start; 8021 init_completion(&done.done); 8022 8023 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 8024 pgoff, start, start + sectorsize - 1, 8025 io_bio->mirror_num, 8026 btrfs_retry_endio_nocsum, &done); 8027 if (ret) 8028 return ret; 8029 8030 wait_for_completion(&done.done); 8031 8032 if (!done.uptodate) { 8033 /* We might have another mirror, so try again */ 8034 goto next_block_or_try_again; 8035 } 8036 8037 start += sectorsize; 8038 8039 if (nr_sectors--) { 8040 pgoff += sectorsize; 8041 goto next_block_or_try_again; 8042 } 8043 } 8044 8045 return 0; 8046 } 8047 8048 static void btrfs_retry_endio(struct bio *bio) 8049 { 8050 struct btrfs_retry_complete *done = bio->bi_private; 8051 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8052 struct inode *inode; 8053 struct bio_vec *bvec; 8054 u64 start; 8055 int uptodate; 8056 int ret; 8057 int i; 8058 8059 if (bio->bi_error) 8060 goto end; 8061 8062 uptodate = 1; 8063 8064 start = done->start; 8065 8066 ASSERT(bio->bi_vcnt == 1); 8067 inode = bio->bi_io_vec->bv_page->mapping->host; 8068 ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); 8069 8070 bio_for_each_segment_all(bvec, bio, i) { 8071 ret = __readpage_endio_check(done->inode, io_bio, i, 8072 bvec->bv_page, bvec->bv_offset, 8073 done->start, bvec->bv_len); 8074 if (!ret) 8075 clean_io_failure(done->inode, done->start, 8076 bvec->bv_page, bvec->bv_offset); 8077 else 8078 uptodate = 0; 8079 } 8080 8081 done->uptodate = uptodate; 8082 end: 8083 complete(&done->done); 8084 bio_put(bio); 8085 } 8086 8087 static int __btrfs_subio_endio_read(struct inode *inode, 8088 struct btrfs_io_bio *io_bio, int err) 8089 { 8090 struct btrfs_fs_info *fs_info; 8091 struct bio_vec *bvec; 8092 struct btrfs_retry_complete done; 8093 u64 start; 8094 u64 offset = 0; 8095 u32 sectorsize; 8096 int nr_sectors; 8097 unsigned int pgoff; 8098 int csum_pos; 8099 int i; 8100 int ret; 8101 8102 fs_info = BTRFS_I(inode)->root->fs_info; 8103 sectorsize = BTRFS_I(inode)->root->sectorsize; 8104 8105 err = 0; 8106 start = io_bio->logical; 8107 done.inode = inode; 8108 8109 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8110 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 8111 8112 pgoff = bvec->bv_offset; 8113 next_block: 8114 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); 8115 ret = __readpage_endio_check(inode, io_bio, csum_pos, 8116 bvec->bv_page, pgoff, start, 8117 sectorsize); 8118 if (likely(!ret)) 8119 goto next; 8120 try_again: 8121 done.uptodate = 0; 8122 done.start = start; 8123 init_completion(&done.done); 8124 8125 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 8126 pgoff, start, start + sectorsize - 1, 8127 io_bio->mirror_num, 8128 btrfs_retry_endio, &done); 8129 if (ret) { 8130 err = ret; 8131 goto next; 8132 } 8133 8134 wait_for_completion(&done.done); 8135 8136 if (!done.uptodate) { 8137 /* We might have another mirror, so try again */ 8138 goto try_again; 8139 } 8140 next: 8141 offset += sectorsize; 8142 start += sectorsize; 8143 8144 ASSERT(nr_sectors); 8145 8146 if (--nr_sectors) { 8147 pgoff += sectorsize; 8148 goto next_block; 8149 } 8150 } 8151 8152 return err; 8153 } 8154 8155 static int btrfs_subio_endio_read(struct inode *inode, 8156 struct btrfs_io_bio *io_bio, int err) 8157 { 8158 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 8159 8160 if (skip_csum) { 8161 if 
(unlikely(err)) 8162 return __btrfs_correct_data_nocsum(inode, io_bio); 8163 else 8164 return 0; 8165 } else { 8166 return __btrfs_subio_endio_read(inode, io_bio, err); 8167 } 8168 } 8169 8170 static void btrfs_endio_direct_read(struct bio *bio) 8171 { 8172 struct btrfs_dio_private *dip = bio->bi_private; 8173 struct inode *inode = dip->inode; 8174 struct bio *dio_bio; 8175 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8176 int err = bio->bi_error; 8177 8178 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) 8179 err = btrfs_subio_endio_read(inode, io_bio, err); 8180 8181 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 8182 dip->logical_offset + dip->bytes - 1); 8183 dio_bio = dip->dio_bio; 8184 8185 kfree(dip); 8186 8187 dio_bio->bi_error = bio->bi_error; 8188 dio_end_io(dio_bio, bio->bi_error); 8189 8190 if (io_bio->end_io) 8191 io_bio->end_io(io_bio, err); 8192 bio_put(bio); 8193 } 8194 8195 static void btrfs_endio_direct_write_update_ordered(struct inode *inode, 8196 const u64 offset, 8197 const u64 bytes, 8198 const int uptodate) 8199 { 8200 struct btrfs_root *root = BTRFS_I(inode)->root; 8201 struct btrfs_ordered_extent *ordered = NULL; 8202 u64 ordered_offset = offset; 8203 u64 ordered_bytes = bytes; 8204 int ret; 8205 8206 again: 8207 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 8208 &ordered_offset, 8209 ordered_bytes, 8210 uptodate); 8211 if (!ret) 8212 goto out_test; 8213 8214 btrfs_init_work(&ordered->work, btrfs_endio_write_helper, 8215 finish_ordered_fn, NULL, NULL); 8216 btrfs_queue_work(root->fs_info->endio_write_workers, 8217 &ordered->work); 8218 out_test: 8219 /* 8220 * our bio might span multiple ordered extents. If we haven't 8221 * completed the accounting for the whole dio, go back and try again 8222 */ 8223 if (ordered_offset < offset + bytes) { 8224 ordered_bytes = offset + bytes - ordered_offset; 8225 ordered = NULL; 8226 goto again; 8227 } 8228 } 8229 8230 static void btrfs_endio_direct_write(struct bio *bio) 8231 { 8232 struct btrfs_dio_private *dip = bio->bi_private; 8233 struct bio *dio_bio = dip->dio_bio; 8234 8235 btrfs_endio_direct_write_update_ordered(dip->inode, 8236 dip->logical_offset, 8237 dip->bytes, 8238 !bio->bi_error); 8239 8240 kfree(dip); 8241 8242 dio_bio->bi_error = bio->bi_error; 8243 dio_end_io(dio_bio, bio->bi_error); 8244 bio_put(bio); 8245 } 8246 8247 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, 8248 struct bio *bio, int mirror_num, 8249 unsigned long bio_flags, u64 offset) 8250 { 8251 int ret; 8252 struct btrfs_root *root = BTRFS_I(inode)->root; 8253 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 8254 BUG_ON(ret); /* -ENOMEM */ 8255 return 0; 8256 } 8257 8258 static void btrfs_end_dio_bio(struct bio *bio) 8259 { 8260 struct btrfs_dio_private *dip = bio->bi_private; 8261 int err = bio->bi_error; 8262 8263 if (err) 8264 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 8265 "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", 8266 btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, 8267 (unsigned long long)bio->bi_iter.bi_sector, 8268 bio->bi_iter.bi_size, err); 8269 8270 if (dip->subio_endio) 8271 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); 8272 8273 if (err) { 8274 dip->errors = 1; 8275 8276 /* 8277 * before atomic variable goto zero, we must make sure 8278 * dip->errors is perceived to be set. 
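 *
 * The completion scheme here is the usual "last one out turns off the
 * lights" pattern.  A condensed sketch (field and helper names below are
 * illustrative only):
 *
 *	parent->errors = 1;
 *	smp_mb__before_atomic();		// publish errors before the decrement
 *	if (atomic_dec_and_test(&parent->pending))
 *		complete_parent(parent);	// only the final sub-bio gets here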
8279 */ 8280 smp_mb__before_atomic(); 8281 } 8282 8283 /* if there are more bios still pending for this dio, just exit */ 8284 if (!atomic_dec_and_test(&dip->pending_bios)) 8285 goto out; 8286 8287 if (dip->errors) { 8288 bio_io_error(dip->orig_bio); 8289 } else { 8290 dip->dio_bio->bi_error = 0; 8291 bio_endio(dip->orig_bio); 8292 } 8293 out: 8294 bio_put(bio); 8295 } 8296 8297 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 8298 u64 first_sector, gfp_t gfp_flags) 8299 { 8300 struct bio *bio; 8301 bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); 8302 if (bio) 8303 bio_associate_current(bio); 8304 return bio; 8305 } 8306 8307 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, 8308 struct inode *inode, 8309 struct btrfs_dio_private *dip, 8310 struct bio *bio, 8311 u64 file_offset) 8312 { 8313 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8314 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); 8315 int ret; 8316 8317 /* 8318 * We load all the csum data we need when we submit 8319 * the first bio to reduce the csum tree search and 8320 * contention. 8321 */ 8322 if (dip->logical_offset == file_offset) { 8323 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, 8324 file_offset); 8325 if (ret) 8326 return ret; 8327 } 8328 8329 if (bio == dip->orig_bio) 8330 return 0; 8331 8332 file_offset -= dip->logical_offset; 8333 file_offset >>= inode->i_sb->s_blocksize_bits; 8334 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); 8335 8336 return 0; 8337 } 8338 8339 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 8340 u64 file_offset, int skip_sum, 8341 int async_submit) 8342 { 8343 struct btrfs_dio_private *dip = bio->bi_private; 8344 bool write = bio_op(bio) == REQ_OP_WRITE; 8345 struct btrfs_root *root = BTRFS_I(inode)->root; 8346 int ret; 8347 8348 if (async_submit) 8349 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 8350 8351 bio_get(bio); 8352 8353 if (!write) { 8354 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 8355 BTRFS_WQ_ENDIO_DATA); 8356 if (ret) 8357 goto err; 8358 } 8359 8360 if (skip_sum) 8361 goto map; 8362 8363 if (write && async_submit) { 8364 ret = btrfs_wq_submit_bio(root->fs_info, 8365 inode, bio, 0, 0, file_offset, 8366 __btrfs_submit_bio_start_direct_io, 8367 __btrfs_submit_bio_done); 8368 goto err; 8369 } else if (write) { 8370 /* 8371 * If we aren't doing async submit, calculate the csum of the 8372 * bio now. 
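 *
 * Aside on the csum sharing done by btrfs_lookup_and_bind_dio_csum()
 * above: once the sums for the whole original bio are loaded, a split
 * bio simply points into that array.  Assuming 4K blocks and 4-byte
 * crc32c sums, a split starting 64K into the dio works out to:
 *
 *	block_index = (file_offset - dip->logical_offset) >> 12;  // 16
 *	split_csum  = orig_csums + block_index;                   // in u32 entries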
8373 */ 8374 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 8375 if (ret) 8376 goto err; 8377 } else { 8378 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, 8379 file_offset); 8380 if (ret) 8381 goto err; 8382 } 8383 map: 8384 ret = btrfs_map_bio(root, bio, 0, async_submit); 8385 err: 8386 bio_put(bio); 8387 return ret; 8388 } 8389 8390 static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, 8391 int skip_sum) 8392 { 8393 struct inode *inode = dip->inode; 8394 struct btrfs_root *root = BTRFS_I(inode)->root; 8395 struct bio *bio; 8396 struct bio *orig_bio = dip->orig_bio; 8397 struct bio_vec *bvec = orig_bio->bi_io_vec; 8398 u64 start_sector = orig_bio->bi_iter.bi_sector; 8399 u64 file_offset = dip->logical_offset; 8400 u64 submit_len = 0; 8401 u64 map_length; 8402 u32 blocksize = root->sectorsize; 8403 int async_submit = 0; 8404 int nr_sectors; 8405 int ret; 8406 int i; 8407 8408 map_length = orig_bio->bi_iter.bi_size; 8409 ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), 8410 start_sector << 9, &map_length, NULL, 0); 8411 if (ret) 8412 return -EIO; 8413 8414 if (map_length >= orig_bio->bi_iter.bi_size) { 8415 bio = orig_bio; 8416 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 8417 goto submit; 8418 } 8419 8420 /* async crcs make it difficult to collect full stripe writes. */ 8421 if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) 8422 async_submit = 0; 8423 else 8424 async_submit = 1; 8425 8426 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 8427 if (!bio) 8428 return -ENOMEM; 8429 8430 bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); 8431 bio->bi_private = dip; 8432 bio->bi_end_io = btrfs_end_dio_bio; 8433 btrfs_io_bio(bio)->logical = file_offset; 8434 atomic_inc(&dip->pending_bios); 8435 8436 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 8437 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); 8438 i = 0; 8439 next_block: 8440 if (unlikely(map_length < submit_len + blocksize || 8441 bio_add_page(bio, bvec->bv_page, blocksize, 8442 bvec->bv_offset + (i * blocksize)) < blocksize)) { 8443 /* 8444 * inc the count before we submit the bio so 8445 * we know the end IO handler won't happen before 8446 * we inc the count. 
Otherwise, the dip might get freed 8447 * before we're done setting it up 8448 */ 8449 atomic_inc(&dip->pending_bios); 8450 ret = __btrfs_submit_dio_bio(bio, inode, 8451 file_offset, skip_sum, 8452 async_submit); 8453 if (ret) { 8454 bio_put(bio); 8455 atomic_dec(&dip->pending_bios); 8456 goto out_err; 8457 } 8458 8459 start_sector += submit_len >> 9; 8460 file_offset += submit_len; 8461 8462 submit_len = 0; 8463 8464 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8465 start_sector, GFP_NOFS); 8466 if (!bio) 8467 goto out_err; 8468 bio_set_op_attrs(bio, bio_op(orig_bio), 8469 bio_flags(orig_bio)); 8470 bio->bi_private = dip; 8471 bio->bi_end_io = btrfs_end_dio_bio; 8472 btrfs_io_bio(bio)->logical = file_offset; 8473 8474 map_length = orig_bio->bi_iter.bi_size; 8475 ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), 8476 start_sector << 9, 8477 &map_length, NULL, 0); 8478 if (ret) { 8479 bio_put(bio); 8480 goto out_err; 8481 } 8482 8483 goto next_block; 8484 } else { 8485 submit_len += blocksize; 8486 if (--nr_sectors) { 8487 i++; 8488 goto next_block; 8489 } 8490 bvec++; 8491 } 8492 } 8493 8494 submit: 8495 ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, 8496 async_submit); 8497 if (!ret) 8498 return 0; 8499 8500 bio_put(bio); 8501 out_err: 8502 dip->errors = 1; 8503 /* 8504 * before atomic variable goto zero, we must 8505 * make sure dip->errors is perceived to be set. 8506 */ 8507 smp_mb__before_atomic(); 8508 if (atomic_dec_and_test(&dip->pending_bios)) 8509 bio_io_error(dip->orig_bio); 8510 8511 /* bio_end_io() will handle error, so we needn't return it */ 8512 return 0; 8513 } 8514 8515 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, 8516 loff_t file_offset) 8517 { 8518 struct btrfs_dio_private *dip = NULL; 8519 struct bio *io_bio = NULL; 8520 struct btrfs_io_bio *btrfs_bio; 8521 int skip_sum; 8522 bool write = (bio_op(dio_bio) == REQ_OP_WRITE); 8523 int ret = 0; 8524 8525 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 8526 8527 io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); 8528 if (!io_bio) { 8529 ret = -ENOMEM; 8530 goto free_ordered; 8531 } 8532 8533 dip = kzalloc(sizeof(*dip), GFP_NOFS); 8534 if (!dip) { 8535 ret = -ENOMEM; 8536 goto free_ordered; 8537 } 8538 8539 dip->private = dio_bio->bi_private; 8540 dip->inode = inode; 8541 dip->logical_offset = file_offset; 8542 dip->bytes = dio_bio->bi_iter.bi_size; 8543 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 8544 io_bio->bi_private = dip; 8545 dip->orig_bio = io_bio; 8546 dip->dio_bio = dio_bio; 8547 atomic_set(&dip->pending_bios, 0); 8548 btrfs_bio = btrfs_io_bio(io_bio); 8549 btrfs_bio->logical = file_offset; 8550 8551 if (write) { 8552 io_bio->bi_end_io = btrfs_endio_direct_write; 8553 } else { 8554 io_bio->bi_end_io = btrfs_endio_direct_read; 8555 dip->subio_endio = btrfs_subio_endio_read; 8556 } 8557 8558 /* 8559 * Reset the range for unsubmitted ordered extents (to a 0 length range) 8560 * even if we fail to submit a bio, because in such case we do the 8561 * corresponding error handling below and it must not be done a second 8562 * time by btrfs_direct_IO(). 
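 *
 * Rough picture of that bookkeeping: btrfs_get_blocks_direct() pushes
 * unsubmitted_oe_range_end forward every time it creates an ordered
 * extent, and the assignment below collapses the window
 *
 *	[unsubmitted_oe_range_start, unsubmitted_oe_range_end)
 *
 * to zero length once a dip has been set up to cover it, because from
 * that point the dip's own error paths do the cleanup.  Whatever is
 * still left in the window when btrfs_direct_IO() sees an error is the
 * part that never got this far and must be cleaned up there.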
8563 */ 8564 if (write) { 8565 struct btrfs_dio_data *dio_data = current->journal_info; 8566 8567 dio_data->unsubmitted_oe_range_end = dip->logical_offset + 8568 dip->bytes; 8569 dio_data->unsubmitted_oe_range_start = 8570 dio_data->unsubmitted_oe_range_end; 8571 } 8572 8573 ret = btrfs_submit_direct_hook(dip, skip_sum); 8574 if (!ret) 8575 return; 8576 8577 if (btrfs_bio->end_io) 8578 btrfs_bio->end_io(btrfs_bio, ret); 8579 8580 free_ordered: 8581 /* 8582 * If we arrived here it means either we failed to submit the dip 8583 * or we either failed to clone the dio_bio or failed to allocate the 8584 * dip. If we cloned the dio_bio and allocated the dip, we can just 8585 * call bio_endio against our io_bio so that we get proper resource 8586 * cleanup if we fail to submit the dip, otherwise, we must do the 8587 * same as btrfs_endio_direct_[write|read] because we can't call these 8588 * callbacks - they require an allocated dip and a clone of dio_bio. 8589 */ 8590 if (io_bio && dip) { 8591 io_bio->bi_error = -EIO; 8592 bio_endio(io_bio); 8593 /* 8594 * The end io callbacks free our dip, do the final put on io_bio 8595 * and all the cleanup and final put for dio_bio (through 8596 * dio_end_io()). 8597 */ 8598 dip = NULL; 8599 io_bio = NULL; 8600 } else { 8601 if (write) 8602 btrfs_endio_direct_write_update_ordered(inode, 8603 file_offset, 8604 dio_bio->bi_iter.bi_size, 8605 0); 8606 else 8607 unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 8608 file_offset + dio_bio->bi_iter.bi_size - 1); 8609 8610 dio_bio->bi_error = -EIO; 8611 /* 8612 * Releases and cleans up our dio_bio, no need to bio_put() 8613 * nor bio_endio()/bio_io_error() against dio_bio. 8614 */ 8615 dio_end_io(dio_bio, ret); 8616 } 8617 if (io_bio) 8618 bio_put(io_bio); 8619 kfree(dip); 8620 } 8621 8622 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, 8623 const struct iov_iter *iter, loff_t offset) 8624 { 8625 int seg; 8626 int i; 8627 unsigned blocksize_mask = root->sectorsize - 1; 8628 ssize_t retval = -EINVAL; 8629 8630 if (offset & blocksize_mask) 8631 goto out; 8632 8633 if (iov_iter_alignment(iter) & blocksize_mask) 8634 goto out; 8635 8636 /* If this is a write we don't need to check anymore */ 8637 if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) 8638 return 0; 8639 /* 8640 * Check to make sure we don't have duplicate iov_base's in this 8641 * iovec, if so return EINVAL, otherwise we'll get csum errors 8642 * when reading back. 
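 *
 * Taken together, the pre-flight test amounts to the standalone sketch
 * below ("mask" is sectorsize - 1, typically 4095; the function name is
 * made up for illustration):
 *
 *	static bool dio_iter_ok(loff_t off, const struct iovec *iov,
 *				unsigned long nr_segs, unsigned long mask)
 *	{
 *		if (off & mask)
 *			return false;			// file offset not block aligned
 *		for (unsigned long s = 0; s < nr_segs; s++) {
 *			if (((unsigned long)iov[s].iov_base | iov[s].iov_len) & mask)
 *				return false;		// buffer not block aligned
 *			for (unsigned long i = s + 1; i < nr_segs; i++)
 *				if (iov[s].iov_base == iov[i].iov_base)
 *					return false;	// duplicate buffer, would confuse csums
 *		}
 *		return true;
 *	}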
8643 */ 8644 for (seg = 0; seg < iter->nr_segs; seg++) { 8645 for (i = seg + 1; i < iter->nr_segs; i++) { 8646 if (iter->iov[seg].iov_base == iter->iov[i].iov_base) 8647 goto out; 8648 } 8649 } 8650 retval = 0; 8651 out: 8652 return retval; 8653 } 8654 8655 static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 8656 { 8657 struct file *file = iocb->ki_filp; 8658 struct inode *inode = file->f_mapping->host; 8659 struct btrfs_root *root = BTRFS_I(inode)->root; 8660 struct btrfs_dio_data dio_data = { 0 }; 8661 loff_t offset = iocb->ki_pos; 8662 size_t count = 0; 8663 int flags = 0; 8664 bool wakeup = true; 8665 bool relock = false; 8666 ssize_t ret; 8667 8668 if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) 8669 return 0; 8670 8671 inode_dio_begin(inode); 8672 smp_mb__after_atomic(); 8673 8674 /* 8675 * The generic stuff only does filemap_write_and_wait_range, which 8676 * isn't enough if we've written compressed pages to this area, so 8677 * we need to flush the dirty pages again to make absolutely sure 8678 * that any outstanding dirty pages are on disk. 8679 */ 8680 count = iov_iter_count(iter); 8681 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 8682 &BTRFS_I(inode)->runtime_flags)) 8683 filemap_fdatawrite_range(inode->i_mapping, offset, 8684 offset + count - 1); 8685 8686 if (iov_iter_rw(iter) == WRITE) { 8687 /* 8688 * If the write DIO is beyond the EOF, we need update 8689 * the isize, but it is protected by i_mutex. So we can 8690 * not unlock the i_mutex at this case. 8691 */ 8692 if (offset + count <= inode->i_size) { 8693 inode_unlock(inode); 8694 relock = true; 8695 } 8696 ret = btrfs_delalloc_reserve_space(inode, offset, count); 8697 if (ret) 8698 goto out; 8699 dio_data.outstanding_extents = div64_u64(count + 8700 BTRFS_MAX_EXTENT_SIZE - 1, 8701 BTRFS_MAX_EXTENT_SIZE); 8702 8703 /* 8704 * We need to know how many extents we reserved so that we can 8705 * do the accounting properly if we go over the number we 8706 * originally calculated. Abuse current->journal_info for this. 8707 */ 8708 dio_data.reserve = round_up(count, root->sectorsize); 8709 dio_data.unsubmitted_oe_range_start = (u64)offset; 8710 dio_data.unsubmitted_oe_range_end = (u64)offset; 8711 current->journal_info = &dio_data; 8712 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 8713 &BTRFS_I(inode)->runtime_flags)) { 8714 inode_dio_end(inode); 8715 flags = DIO_LOCKING | DIO_SKIP_HOLES; 8716 wakeup = false; 8717 } 8718 8719 ret = __blockdev_direct_IO(iocb, inode, 8720 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 8721 iter, btrfs_get_blocks_direct, NULL, 8722 btrfs_submit_direct, flags); 8723 if (iov_iter_rw(iter) == WRITE) { 8724 current->journal_info = NULL; 8725 if (ret < 0 && ret != -EIOCBQUEUED) { 8726 if (dio_data.reserve) 8727 btrfs_delalloc_release_space(inode, offset, 8728 dio_data.reserve); 8729 /* 8730 * On error we might have left some ordered extents 8731 * without submitting corresponding bios for them, so 8732 * cleanup them up to avoid other tasks getting them 8733 * and waiting for them to complete forever. 
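 *
 * Worked example of the reservation bookkeeping around this call
 * (numbers are illustrative): a 1M O_DIRECT write reserves
 * round_up(1M, sectorsize) = 1M of data space up front in
 * dio_data.reserve.  If __blockdev_direct_IO() only completes 512K,
 * the short-write branch below hands the unused half back:
 *
 *	// count = 1048576, ret = 524288
 *	btrfs_delalloc_release_space(inode, offset, count - ret);  // 512K returned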
8734 */ 8735 if (dio_data.unsubmitted_oe_range_start < 8736 dio_data.unsubmitted_oe_range_end) 8737 btrfs_endio_direct_write_update_ordered(inode, 8738 dio_data.unsubmitted_oe_range_start, 8739 dio_data.unsubmitted_oe_range_end - 8740 dio_data.unsubmitted_oe_range_start, 8741 0); 8742 } else if (ret >= 0 && (size_t)ret < count) 8743 btrfs_delalloc_release_space(inode, offset, 8744 count - (size_t)ret); 8745 } 8746 out: 8747 if (wakeup) 8748 inode_dio_end(inode); 8749 if (relock) 8750 inode_lock(inode); 8751 8752 return ret; 8753 } 8754 8755 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 8756 8757 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8758 __u64 start, __u64 len) 8759 { 8760 int ret; 8761 8762 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); 8763 if (ret) 8764 return ret; 8765 8766 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 8767 } 8768 8769 int btrfs_readpage(struct file *file, struct page *page) 8770 { 8771 struct extent_io_tree *tree; 8772 tree = &BTRFS_I(page->mapping->host)->io_tree; 8773 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 8774 } 8775 8776 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 8777 { 8778 struct extent_io_tree *tree; 8779 struct inode *inode = page->mapping->host; 8780 int ret; 8781 8782 if (current->flags & PF_MEMALLOC) { 8783 redirty_page_for_writepage(wbc, page); 8784 unlock_page(page); 8785 return 0; 8786 } 8787 8788 /* 8789 * If we are under memory pressure we will call this directly from the 8790 * VM, we need to make sure we have the inode referenced for the ordered 8791 * extent. If not just return like we didn't do anything. 8792 */ 8793 if (!igrab(inode)) { 8794 redirty_page_for_writepage(wbc, page); 8795 return AOP_WRITEPAGE_ACTIVATE; 8796 } 8797 tree = &BTRFS_I(page->mapping->host)->io_tree; 8798 ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); 8799 btrfs_add_delayed_iput(inode); 8800 return ret; 8801 } 8802 8803 static int btrfs_writepages(struct address_space *mapping, 8804 struct writeback_control *wbc) 8805 { 8806 struct extent_io_tree *tree; 8807 8808 tree = &BTRFS_I(mapping->host)->io_tree; 8809 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 8810 } 8811 8812 static int 8813 btrfs_readpages(struct file *file, struct address_space *mapping, 8814 struct list_head *pages, unsigned nr_pages) 8815 { 8816 struct extent_io_tree *tree; 8817 tree = &BTRFS_I(mapping->host)->io_tree; 8818 return extent_readpages(tree, mapping, pages, nr_pages, 8819 btrfs_get_extent); 8820 } 8821 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8822 { 8823 struct extent_io_tree *tree; 8824 struct extent_map_tree *map; 8825 int ret; 8826 8827 tree = &BTRFS_I(page->mapping->host)->io_tree; 8828 map = &BTRFS_I(page->mapping->host)->extent_tree; 8829 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 8830 if (ret == 1) { 8831 ClearPagePrivate(page); 8832 set_page_private(page, 0); 8833 put_page(page); 8834 } 8835 return ret; 8836 } 8837 8838 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8839 { 8840 if (PageWriteback(page) || PageDirty(page)) 8841 return 0; 8842 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 8843 } 8844 8845 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 8846 unsigned int length) 8847 { 8848 struct inode *inode = page->mapping->host; 8849 struct extent_io_tree *tree; 8850 struct btrfs_ordered_extent *ordered; 8851 struct 
extent_state *cached_state = NULL; 8852 u64 page_start = page_offset(page); 8853 u64 page_end = page_start + PAGE_SIZE - 1; 8854 u64 start; 8855 u64 end; 8856 int inode_evicting = inode->i_state & I_FREEING; 8857 8858 /* 8859 * we have the page locked, so new writeback can't start, 8860 * and the dirty bit won't be cleared while we are here. 8861 * 8862 * Wait for IO on this page so that we can safely clear 8863 * the PagePrivate2 bit and do ordered accounting 8864 */ 8865 wait_on_page_writeback(page); 8866 8867 tree = &BTRFS_I(inode)->io_tree; 8868 if (offset) { 8869 btrfs_releasepage(page, GFP_NOFS); 8870 return; 8871 } 8872 8873 if (!inode_evicting) 8874 lock_extent_bits(tree, page_start, page_end, &cached_state); 8875 again: 8876 start = page_start; 8877 ordered = btrfs_lookup_ordered_range(inode, start, 8878 page_end - start + 1); 8879 if (ordered) { 8880 end = min(page_end, ordered->file_offset + ordered->len - 1); 8881 /* 8882 * IO on this page will never be started, so we need 8883 * to account for any ordered extents now 8884 */ 8885 if (!inode_evicting) 8886 clear_extent_bit(tree, start, end, 8887 EXTENT_DIRTY | EXTENT_DELALLOC | 8888 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8889 EXTENT_DEFRAG, 1, 0, &cached_state, 8890 GFP_NOFS); 8891 /* 8892 * whoever cleared the private bit is responsible 8893 * for the finish_ordered_io 8894 */ 8895 if (TestClearPagePrivate2(page)) { 8896 struct btrfs_ordered_inode_tree *tree; 8897 u64 new_len; 8898 8899 tree = &BTRFS_I(inode)->ordered_tree; 8900 8901 spin_lock_irq(&tree->lock); 8902 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8903 new_len = start - ordered->file_offset; 8904 if (new_len < ordered->truncated_len) 8905 ordered->truncated_len = new_len; 8906 spin_unlock_irq(&tree->lock); 8907 8908 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8909 start, 8910 end - start + 1, 1)) 8911 btrfs_finish_ordered_io(ordered); 8912 } 8913 btrfs_put_ordered_extent(ordered); 8914 if (!inode_evicting) { 8915 cached_state = NULL; 8916 lock_extent_bits(tree, start, end, 8917 &cached_state); 8918 } 8919 8920 start = end + 1; 8921 if (start < page_end) 8922 goto again; 8923 } 8924 8925 /* 8926 * Qgroup reserved space handler 8927 * Page here will be either 8928 * 1) Already written to disk 8929 * In this case, its reserved space is released from data rsv map 8930 * and will be freed by delayed_ref handler finally. 8931 * So even we call qgroup_free_data(), it won't decrease reserved 8932 * space. 8933 * 2) Not written to disk 8934 * This means the reserved space should be freed here. 8935 */ 8936 btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); 8937 if (!inode_evicting) { 8938 clear_extent_bit(tree, page_start, page_end, 8939 EXTENT_LOCKED | EXTENT_DIRTY | 8940 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 8941 EXTENT_DEFRAG, 1, 1, 8942 &cached_state, GFP_NOFS); 8943 8944 __btrfs_releasepage(page, GFP_NOFS); 8945 } 8946 8947 ClearPageChecked(page); 8948 if (PagePrivate(page)) { 8949 ClearPagePrivate(page); 8950 set_page_private(page, 0); 8951 put_page(page); 8952 } 8953 } 8954 8955 /* 8956 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 8957 * called from a page fault handler when a page is first dirtied. Hence we must 8958 * be careful to check for EOF conditions here. We set the page up correctly 8959 * for a written page which means we get ENOSPC checking when writing into 8960 * holes and correct delalloc and unwritten extent mapping on filesystems that 8961 * support these features. 
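 *
 * As a worked example of the EOF handling further down: with 4K pages and
 * i_size = 10000, the page covering bytes 8192..12287 is the last one, so
 * everything from in-page offset 10000 & ~PAGE_MASK = 1808 onwards is
 * zeroed before the page is dirtied:
 *
 *	zero_start = size & ~PAGE_MASK;				// 1808
 *	memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);	// zero the tail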
8962 * 8963 * We are not allowed to take the i_mutex here so we have to play games to 8964 * protect against truncate races as the page could now be beyond EOF. Because 8965 * vmtruncate() writes the inode size before removing pages, once we have the 8966 * page lock we can determine safely if the page is beyond EOF. If it is not 8967 * beyond EOF, then the page is guaranteed safe against truncation until we 8968 * unlock the page. 8969 */ 8970 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 8971 { 8972 struct page *page = vmf->page; 8973 struct inode *inode = file_inode(vma->vm_file); 8974 struct btrfs_root *root = BTRFS_I(inode)->root; 8975 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8976 struct btrfs_ordered_extent *ordered; 8977 struct extent_state *cached_state = NULL; 8978 char *kaddr; 8979 unsigned long zero_start; 8980 loff_t size; 8981 int ret; 8982 int reserved = 0; 8983 u64 reserved_space; 8984 u64 page_start; 8985 u64 page_end; 8986 u64 end; 8987 8988 reserved_space = PAGE_SIZE; 8989 8990 sb_start_pagefault(inode->i_sb); 8991 page_start = page_offset(page); 8992 page_end = page_start + PAGE_SIZE - 1; 8993 end = page_end; 8994 8995 /* 8996 * Reserving delalloc space after obtaining the page lock can lead to 8997 * deadlock. For example, if a dirty page is locked by this function 8998 * and the call to btrfs_delalloc_reserve_space() ends up triggering 8999 * dirty page write out, then the btrfs_writepage() function could 9000 * end up waiting indefinitely to get a lock on the page currently 9001 * being processed by btrfs_page_mkwrite() function. 9002 */ 9003 ret = btrfs_delalloc_reserve_space(inode, page_start, 9004 reserved_space); 9005 if (!ret) { 9006 ret = file_update_time(vma->vm_file); 9007 reserved = 1; 9008 } 9009 if (ret) { 9010 if (ret == -ENOMEM) 9011 ret = VM_FAULT_OOM; 9012 else /* -ENOSPC, -EIO, etc */ 9013 ret = VM_FAULT_SIGBUS; 9014 if (reserved) 9015 goto out; 9016 goto out_noreserve; 9017 } 9018 9019 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 9020 again: 9021 lock_page(page); 9022 size = i_size_read(inode); 9023 9024 if ((page->mapping != inode->i_mapping) || 9025 (page_start >= size)) { 9026 /* page got truncated out from underneath us */ 9027 goto out_unlock; 9028 } 9029 wait_on_page_writeback(page); 9030 9031 lock_extent_bits(io_tree, page_start, page_end, &cached_state); 9032 set_page_extent_mapped(page); 9033 9034 /* 9035 * we can't set the delalloc bits if there are pending ordered 9036 * extents. 
Drop our locks and wait for them to finish 9037 */ 9038 ordered = btrfs_lookup_ordered_range(inode, page_start, page_end); 9039 if (ordered) { 9040 unlock_extent_cached(io_tree, page_start, page_end, 9041 &cached_state, GFP_NOFS); 9042 unlock_page(page); 9043 btrfs_start_ordered_extent(inode, ordered, 1); 9044 btrfs_put_ordered_extent(ordered); 9045 goto again; 9046 } 9047 9048 if (page->index == ((size - 1) >> PAGE_SHIFT)) { 9049 reserved_space = round_up(size - page_start, root->sectorsize); 9050 if (reserved_space < PAGE_SIZE) { 9051 end = page_start + reserved_space - 1; 9052 spin_lock(&BTRFS_I(inode)->lock); 9053 BTRFS_I(inode)->outstanding_extents++; 9054 spin_unlock(&BTRFS_I(inode)->lock); 9055 btrfs_delalloc_release_space(inode, page_start, 9056 PAGE_SIZE - reserved_space); 9057 } 9058 } 9059 9060 /* 9061 * XXX - page_mkwrite gets called every time the page is dirtied, even 9062 * if it was already dirty, so for space accounting reasons we need to 9063 * clear any delalloc bits for the range we are fixing to save. There 9064 * is probably a better way to do this, but for now keep consistent with 9065 * prepare_pages in the normal write path. 9066 */ 9067 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, 9068 EXTENT_DIRTY | EXTENT_DELALLOC | 9069 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 9070 0, 0, &cached_state, GFP_NOFS); 9071 9072 ret = btrfs_set_extent_delalloc(inode, page_start, end, 9073 &cached_state, 0); 9074 if (ret) { 9075 unlock_extent_cached(io_tree, page_start, page_end, 9076 &cached_state, GFP_NOFS); 9077 ret = VM_FAULT_SIGBUS; 9078 goto out_unlock; 9079 } 9080 ret = 0; 9081 9082 /* page is wholly or partially inside EOF */ 9083 if (page_start + PAGE_SIZE > size) 9084 zero_start = size & ~PAGE_MASK; 9085 else 9086 zero_start = PAGE_SIZE; 9087 9088 if (zero_start != PAGE_SIZE) { 9089 kaddr = kmap(page); 9090 memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); 9091 flush_dcache_page(page); 9092 kunmap(page); 9093 } 9094 ClearPageChecked(page); 9095 set_page_dirty(page); 9096 SetPageUptodate(page); 9097 9098 BTRFS_I(inode)->last_trans = root->fs_info->generation; 9099 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 9100 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 9101 9102 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 9103 9104 out_unlock: 9105 if (!ret) { 9106 sb_end_pagefault(inode->i_sb); 9107 return VM_FAULT_LOCKED; 9108 } 9109 unlock_page(page); 9110 out: 9111 btrfs_delalloc_release_space(inode, page_start, reserved_space); 9112 out_noreserve: 9113 sb_end_pagefault(inode->i_sb); 9114 return ret; 9115 } 9116 9117 static int btrfs_truncate(struct inode *inode) 9118 { 9119 struct btrfs_root *root = BTRFS_I(inode)->root; 9120 struct btrfs_block_rsv *rsv; 9121 int ret = 0; 9122 int err = 0; 9123 struct btrfs_trans_handle *trans; 9124 u64 mask = root->sectorsize - 1; 9125 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 9126 9127 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), 9128 (u64)-1); 9129 if (ret) 9130 return ret; 9131 9132 /* 9133 * Yes ladies and gentlemen, this is indeed ugly. The fact is we have 9134 * 3 things going on here 9135 * 9136 * 1) We need to reserve space for our orphan item and the space to 9137 * delete our orphan item. Lord knows we don't want to have a dangling 9138 * orphan item because we didn't reserve space to remove it. 9139 * 9140 * 2) We need to reserve space to update our inode. 
9141 * 9142 * 3) We need to have something to cache all the space that is going to 9143 * be free'd up by the truncate operation, but also have some slack 9144 * space reserved in case it uses space during the truncate (thank you 9145 * very much snapshotting). 9146 * 9147 * And we need these to all be separate. The fact is we can use a lot of 9148 * space doing the truncate, and we have no earthly idea how much space 9149 * we will use, so we need the truncate reservation to be separate so it 9150 * doesn't end up using space reserved for updating the inode or 9151 * removing the orphan item. We also need to be able to stop the 9152 * transaction and start a new one, which means we need to be able to 9153 * update the inode several times, and we have no idea of knowing how 9154 * many times that will be, so we can't just reserve 1 item for the 9155 * entirety of the operation, so that has to be done separately as well. 9156 * Then there is the orphan item, which does indeed need to be held on 9157 * to for the whole operation, and we need nobody to touch this reserved 9158 * space except the orphan code. 9159 * 9160 * So that leaves us with 9161 * 9162 * 1) root->orphan_block_rsv - for the orphan deletion. 9163 * 2) rsv - for the truncate reservation, which we will steal from the 9164 * transaction reservation. 9165 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for 9166 * updating the inode. 9167 */ 9168 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 9169 if (!rsv) 9170 return -ENOMEM; 9171 rsv->size = min_size; 9172 rsv->failfast = 1; 9173 9174 /* 9175 * 1 for the truncate slack space 9176 * 1 for updating the inode. 9177 */ 9178 trans = btrfs_start_transaction(root, 2); 9179 if (IS_ERR(trans)) { 9180 err = PTR_ERR(trans); 9181 goto out; 9182 } 9183 9184 /* Migrate the slack space for the truncate to our reserve */ 9185 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 9186 min_size, 0); 9187 BUG_ON(ret); 9188 9189 /* 9190 * So if we truncate and then write and fsync we normally would just 9191 * write the extents that changed, which is a problem if we need to 9192 * first truncate that entire inode. So set this flag so we write out 9193 * all of the extents in the inode to the sync log so we're completely 9194 * safe. 
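 *
 * Reduced to pseudo-C, the loop that follows looks roughly like this
 * (helper names are placeholders, not kernel functions):
 *
 *	for (;;) {
 *		ret = drop_some_file_extents(trans);	// -ENOSPC/-EAGAIN means "not done yet"
 *		if (ret != -ENOSPC && ret != -EAGAIN)
 *			break;				// finished, or a hard error
 *		update_inode(trans);
 *		end_transaction(trans);			// let freed space become usable
 *		trans = start_transaction(2);		// fresh 2-item reservation
 *		refill_truncate_rsv(trans, rsv, min_size);
 *	}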
9195 */ 9196 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 9197 trans->block_rsv = rsv; 9198 9199 while (1) { 9200 ret = btrfs_truncate_inode_items(trans, root, inode, 9201 inode->i_size, 9202 BTRFS_EXTENT_DATA_KEY); 9203 if (ret != -ENOSPC && ret != -EAGAIN) { 9204 err = ret; 9205 break; 9206 } 9207 9208 trans->block_rsv = &root->fs_info->trans_block_rsv; 9209 ret = btrfs_update_inode(trans, root, inode); 9210 if (ret) { 9211 err = ret; 9212 break; 9213 } 9214 9215 btrfs_end_transaction(trans, root); 9216 btrfs_btree_balance_dirty(root); 9217 9218 trans = btrfs_start_transaction(root, 2); 9219 if (IS_ERR(trans)) { 9220 ret = err = PTR_ERR(trans); 9221 trans = NULL; 9222 break; 9223 } 9224 9225 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 9226 rsv, min_size, 0); 9227 BUG_ON(ret); /* shouldn't happen */ 9228 trans->block_rsv = rsv; 9229 } 9230 9231 if (ret == 0 && inode->i_nlink > 0) { 9232 trans->block_rsv = root->orphan_block_rsv; 9233 ret = btrfs_orphan_del(trans, inode); 9234 if (ret) 9235 err = ret; 9236 } 9237 9238 if (trans) { 9239 trans->block_rsv = &root->fs_info->trans_block_rsv; 9240 ret = btrfs_update_inode(trans, root, inode); 9241 if (ret && !err) 9242 err = ret; 9243 9244 ret = btrfs_end_transaction(trans, root); 9245 btrfs_btree_balance_dirty(root); 9246 } 9247 out: 9248 btrfs_free_block_rsv(root, rsv); 9249 9250 if (ret && !err) 9251 err = ret; 9252 9253 return err; 9254 } 9255 9256 /* 9257 * create a new subvolume directory/inode (helper for the ioctl). 9258 */ 9259 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 9260 struct btrfs_root *new_root, 9261 struct btrfs_root *parent_root, 9262 u64 new_dirid) 9263 { 9264 struct inode *inode; 9265 int err; 9266 u64 index = 0; 9267 9268 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 9269 new_dirid, new_dirid, 9270 S_IFDIR | (~current_umask() & S_IRWXUGO), 9271 &index); 9272 if (IS_ERR(inode)) 9273 return PTR_ERR(inode); 9274 inode->i_op = &btrfs_dir_inode_operations; 9275 inode->i_fop = &btrfs_dir_file_operations; 9276 9277 set_nlink(inode, 1); 9278 btrfs_i_size_write(inode, 0); 9279 unlock_new_inode(inode); 9280 9281 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 9282 if (err) 9283 btrfs_err(new_root->fs_info, 9284 "error inheriting subvolume %llu properties: %d", 9285 new_root->root_key.objectid, err); 9286 9287 err = btrfs_update_inode(trans, new_root, inode); 9288 9289 iput(inode); 9290 return err; 9291 } 9292 9293 struct inode *btrfs_alloc_inode(struct super_block *sb) 9294 { 9295 struct btrfs_inode *ei; 9296 struct inode *inode; 9297 9298 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 9299 if (!ei) 9300 return NULL; 9301 9302 ei->root = NULL; 9303 ei->generation = 0; 9304 ei->last_trans = 0; 9305 ei->last_sub_trans = 0; 9306 ei->logged_trans = 0; 9307 ei->delalloc_bytes = 0; 9308 ei->defrag_bytes = 0; 9309 ei->disk_i_size = 0; 9310 ei->flags = 0; 9311 ei->csum_bytes = 0; 9312 ei->index_cnt = (u64)-1; 9313 ei->dir_index = 0; 9314 ei->last_unlink_trans = 0; 9315 ei->last_log_commit = 0; 9316 ei->delayed_iput_count = 0; 9317 9318 spin_lock_init(&ei->lock); 9319 ei->outstanding_extents = 0; 9320 ei->reserved_extents = 0; 9321 9322 ei->runtime_flags = 0; 9323 ei->force_compress = BTRFS_COMPRESS_NONE; 9324 9325 ei->delayed_node = NULL; 9326 9327 ei->i_otime.tv_sec = 0; 9328 ei->i_otime.tv_nsec = 0; 9329 9330 inode = &ei->vfs_inode; 9331 extent_map_tree_init(&ei->extent_tree); 9332 extent_io_tree_init(&ei->io_tree, &inode->i_data); 9333 
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 9334 ei->io_tree.track_uptodate = 1; 9335 ei->io_failure_tree.track_uptodate = 1; 9336 atomic_set(&ei->sync_writers, 0); 9337 mutex_init(&ei->log_mutex); 9338 mutex_init(&ei->delalloc_mutex); 9339 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 9340 INIT_LIST_HEAD(&ei->delalloc_inodes); 9341 INIT_LIST_HEAD(&ei->delayed_iput); 9342 RB_CLEAR_NODE(&ei->rb_node); 9343 init_rwsem(&ei->dio_sem); 9344 9345 return inode; 9346 } 9347 9348 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 9349 void btrfs_test_destroy_inode(struct inode *inode) 9350 { 9351 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 9352 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9353 } 9354 #endif 9355 9356 static void btrfs_i_callback(struct rcu_head *head) 9357 { 9358 struct inode *inode = container_of(head, struct inode, i_rcu); 9359 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 9360 } 9361 9362 void btrfs_destroy_inode(struct inode *inode) 9363 { 9364 struct btrfs_ordered_extent *ordered; 9365 struct btrfs_root *root = BTRFS_I(inode)->root; 9366 9367 WARN_ON(!hlist_empty(&inode->i_dentry)); 9368 WARN_ON(inode->i_data.nrpages); 9369 WARN_ON(BTRFS_I(inode)->outstanding_extents); 9370 WARN_ON(BTRFS_I(inode)->reserved_extents); 9371 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 9372 WARN_ON(BTRFS_I(inode)->csum_bytes); 9373 WARN_ON(BTRFS_I(inode)->defrag_bytes); 9374 9375 /* 9376 * This can happen where we create an inode, but somebody else also 9377 * created the same inode and we need to destroy the one we already 9378 * created. 9379 */ 9380 if (!root) 9381 goto free; 9382 9383 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 9384 &BTRFS_I(inode)->runtime_flags)) { 9385 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 9386 btrfs_ino(inode)); 9387 atomic_dec(&root->orphan_inodes); 9388 } 9389 9390 while (1) { 9391 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 9392 if (!ordered) 9393 break; 9394 else { 9395 btrfs_err(root->fs_info, 9396 "found ordered extent %llu %llu on inode cleanup", 9397 ordered->file_offset, ordered->len); 9398 btrfs_remove_ordered_extent(inode, ordered); 9399 btrfs_put_ordered_extent(ordered); 9400 btrfs_put_ordered_extent(ordered); 9401 } 9402 } 9403 btrfs_qgroup_check_reserved_leak(inode); 9404 inode_tree_del(inode); 9405 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 9406 free: 9407 call_rcu(&inode->i_rcu, btrfs_i_callback); 9408 } 9409 9410 int btrfs_drop_inode(struct inode *inode) 9411 { 9412 struct btrfs_root *root = BTRFS_I(inode)->root; 9413 9414 if (root == NULL) 9415 return 1; 9416 9417 /* the snap/subvol tree is on deleting */ 9418 if (btrfs_root_refs(&root->root_item) == 0) 9419 return 1; 9420 else 9421 return generic_drop_inode(inode); 9422 } 9423 9424 static void init_once(void *foo) 9425 { 9426 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 9427 9428 inode_init_once(&ei->vfs_inode); 9429 } 9430 9431 void btrfs_destroy_cachep(void) 9432 { 9433 /* 9434 * Make sure all delayed rcu free inodes are flushed before we 9435 * destroy cache. 
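 *
 * The ordering matters because btrfs_destroy_inode() above frees inodes
 * through call_rcu(&inode->i_rcu, btrfs_i_callback), and that callback
 * does a kmem_cache_free() into btrfs_inode_cachep.  Sketched:
 *
 *	call_rcu(...);			// queued inode frees may still be pending
 *	rcu_barrier();			// wait for every queued callback to run
 *	kmem_cache_destroy(cachep);	// only now is the cache safe to tear down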
9436 */ 9437 rcu_barrier(); 9438 kmem_cache_destroy(btrfs_inode_cachep); 9439 kmem_cache_destroy(btrfs_trans_handle_cachep); 9440 kmem_cache_destroy(btrfs_transaction_cachep); 9441 kmem_cache_destroy(btrfs_path_cachep); 9442 kmem_cache_destroy(btrfs_free_space_cachep); 9443 } 9444 9445 int btrfs_init_cachep(void) 9446 { 9447 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 9448 sizeof(struct btrfs_inode), 0, 9449 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, 9450 init_once); 9451 if (!btrfs_inode_cachep) 9452 goto fail; 9453 9454 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 9455 sizeof(struct btrfs_trans_handle), 0, 9456 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 9457 if (!btrfs_trans_handle_cachep) 9458 goto fail; 9459 9460 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", 9461 sizeof(struct btrfs_transaction), 0, 9462 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); 9463 if (!btrfs_transaction_cachep) 9464 goto fail; 9465 9466 btrfs_path_cachep = kmem_cache_create("btrfs_path", 9467 sizeof(struct btrfs_path), 0, 9468 SLAB_MEM_SPREAD, NULL); 9469 if (!btrfs_path_cachep) 9470 goto fail; 9471 9472 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 9473 sizeof(struct btrfs_free_space), 0, 9474 SLAB_MEM_SPREAD, NULL); 9475 if (!btrfs_free_space_cachep) 9476 goto fail; 9477 9478 return 0; 9479 fail: 9480 btrfs_destroy_cachep(); 9481 return -ENOMEM; 9482 } 9483 9484 static int btrfs_getattr(struct vfsmount *mnt, 9485 struct dentry *dentry, struct kstat *stat) 9486 { 9487 u64 delalloc_bytes; 9488 struct inode *inode = d_inode(dentry); 9489 u32 blocksize = inode->i_sb->s_blocksize; 9490 9491 generic_fillattr(inode, stat); 9492 stat->dev = BTRFS_I(inode)->root->anon_dev; 9493 9494 spin_lock(&BTRFS_I(inode)->lock); 9495 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 9496 spin_unlock(&BTRFS_I(inode)->lock); 9497 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 9498 ALIGN(delalloc_bytes, blocksize)) >> 9; 9499 return 0; 9500 } 9501 9502 static int btrfs_rename_exchange(struct inode *old_dir, 9503 struct dentry *old_dentry, 9504 struct inode *new_dir, 9505 struct dentry *new_dentry) 9506 { 9507 struct btrfs_trans_handle *trans; 9508 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9509 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9510 struct inode *new_inode = new_dentry->d_inode; 9511 struct inode *old_inode = old_dentry->d_inode; 9512 struct timespec ctime = current_time(old_inode); 9513 struct dentry *parent; 9514 u64 old_ino = btrfs_ino(old_inode); 9515 u64 new_ino = btrfs_ino(new_inode); 9516 u64 old_idx = 0; 9517 u64 new_idx = 0; 9518 u64 root_objectid; 9519 int ret; 9520 bool root_log_pinned = false; 9521 bool dest_log_pinned = false; 9522 9523 /* we only allow rename subvolume link between subvolumes */ 9524 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 9525 return -EXDEV; 9526 9527 /* close the race window with snapshot create/destroy ioctl */ 9528 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9529 down_read(&root->fs_info->subvol_sem); 9530 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 9531 down_read(&dest->fs_info->subvol_sem); 9532 9533 /* 9534 * We want to reserve the absolute worst case amount of items. So if 9535 * both inodes are subvols and we need to unlink them then that would 9536 * require 4 item modifications, but if they are both normal inodes it 9537 * would require 5 item modifications, so we'll assume their normal 9538 * inodes. 
So 5 * 2 is 10, plus 2 for the new links, so 12 total items 9539 * should cover the worst case number of items we'll modify. 9540 */ 9541 trans = btrfs_start_transaction(root, 12); 9542 if (IS_ERR(trans)) { 9543 ret = PTR_ERR(trans); 9544 goto out_notrans; 9545 } 9546 9547 /* 9548 * We need to find a free sequence number both in the source and 9549 * in the destination directory for the exchange. 9550 */ 9551 ret = btrfs_set_inode_index(new_dir, &old_idx); 9552 if (ret) 9553 goto out_fail; 9554 ret = btrfs_set_inode_index(old_dir, &new_idx); 9555 if (ret) 9556 goto out_fail; 9557 9558 BTRFS_I(old_inode)->dir_index = 0ULL; 9559 BTRFS_I(new_inode)->dir_index = 0ULL; 9560 9561 /* Reference for the source. */ 9562 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9563 /* force full log commit if subvolume involved. */ 9564 btrfs_set_log_full_commit(root->fs_info, trans); 9565 } else { 9566 btrfs_pin_log_trans(root); 9567 root_log_pinned = true; 9568 ret = btrfs_insert_inode_ref(trans, dest, 9569 new_dentry->d_name.name, 9570 new_dentry->d_name.len, 9571 old_ino, 9572 btrfs_ino(new_dir), old_idx); 9573 if (ret) 9574 goto out_fail; 9575 } 9576 9577 /* And now for the dest. */ 9578 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9579 /* force full log commit if subvolume involved. */ 9580 btrfs_set_log_full_commit(dest->fs_info, trans); 9581 } else { 9582 btrfs_pin_log_trans(dest); 9583 dest_log_pinned = true; 9584 ret = btrfs_insert_inode_ref(trans, root, 9585 old_dentry->d_name.name, 9586 old_dentry->d_name.len, 9587 new_ino, 9588 btrfs_ino(old_dir), new_idx); 9589 if (ret) 9590 goto out_fail; 9591 } 9592 9593 /* Update inode version and ctime/mtime. */ 9594 inode_inc_iversion(old_dir); 9595 inode_inc_iversion(new_dir); 9596 inode_inc_iversion(old_inode); 9597 inode_inc_iversion(new_inode); 9598 old_dir->i_ctime = old_dir->i_mtime = ctime; 9599 new_dir->i_ctime = new_dir->i_mtime = ctime; 9600 old_inode->i_ctime = ctime; 9601 new_inode->i_ctime = ctime; 9602 9603 if (old_dentry->d_parent != new_dentry->d_parent) { 9604 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 9605 btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); 9606 } 9607 9608 /* src is a subvolume */ 9609 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9610 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 9611 ret = btrfs_unlink_subvol(trans, root, old_dir, 9612 root_objectid, 9613 old_dentry->d_name.name, 9614 old_dentry->d_name.len); 9615 } else { /* src is an inode */ 9616 ret = __btrfs_unlink_inode(trans, root, old_dir, 9617 old_dentry->d_inode, 9618 old_dentry->d_name.name, 9619 old_dentry->d_name.len); 9620 if (!ret) 9621 ret = btrfs_update_inode(trans, root, old_inode); 9622 } 9623 if (ret) { 9624 btrfs_abort_transaction(trans, ret); 9625 goto out_fail; 9626 } 9627 9628 /* dest is a subvolume */ 9629 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { 9630 root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; 9631 ret = btrfs_unlink_subvol(trans, dest, new_dir, 9632 root_objectid, 9633 new_dentry->d_name.name, 9634 new_dentry->d_name.len); 9635 } else { /* dest is an inode */ 9636 ret = __btrfs_unlink_inode(trans, dest, new_dir, 9637 new_dentry->d_inode, 9638 new_dentry->d_name.name, 9639 new_dentry->d_name.len); 9640 if (!ret) 9641 ret = btrfs_update_inode(trans, dest, new_inode); 9642 } 9643 if (ret) { 9644 btrfs_abort_transaction(trans, ret); 9645 goto out_fail; 9646 } 9647 9648 ret = btrfs_add_link(trans, new_dir, old_inode, 9649 new_dentry->d_name.name, 9650 new_dentry->d_name.len, 0, old_idx); 9651 if (ret) { 
9652 btrfs_abort_transaction(trans, ret); 9653 goto out_fail; 9654 } 9655 9656 ret = btrfs_add_link(trans, old_dir, new_inode, 9657 old_dentry->d_name.name, 9658 old_dentry->d_name.len, 0, new_idx); 9659 if (ret) { 9660 btrfs_abort_transaction(trans, ret); 9661 goto out_fail; 9662 } 9663 9664 if (old_inode->i_nlink == 1) 9665 BTRFS_I(old_inode)->dir_index = old_idx; 9666 if (new_inode->i_nlink == 1) 9667 BTRFS_I(new_inode)->dir_index = new_idx; 9668 9669 if (root_log_pinned) { 9670 parent = new_dentry->d_parent; 9671 btrfs_log_new_name(trans, old_inode, old_dir, parent); 9672 btrfs_end_log_trans(root); 9673 root_log_pinned = false; 9674 } 9675 if (dest_log_pinned) { 9676 parent = old_dentry->d_parent; 9677 btrfs_log_new_name(trans, new_inode, new_dir, parent); 9678 btrfs_end_log_trans(dest); 9679 dest_log_pinned = false; 9680 } 9681 out_fail: 9682 /* 9683 * If we have pinned a log and an error happened, we unpin tasks 9684 * trying to sync the log and force them to fallback to a transaction 9685 * commit if the log currently contains any of the inodes involved in 9686 * this rename operation (to ensure we do not persist a log with an 9687 * inconsistent state for any of these inodes or leading to any 9688 * inconsistencies when replayed). If the transaction was aborted, the 9689 * abortion reason is propagated to userspace when attempting to commit 9690 * the transaction. If the log does not contain any of these inodes, we 9691 * allow the tasks to sync it. 9692 */ 9693 if (ret && (root_log_pinned || dest_log_pinned)) { 9694 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || 9695 btrfs_inode_in_log(new_dir, root->fs_info->generation) || 9696 btrfs_inode_in_log(old_inode, root->fs_info->generation) || 9697 (new_inode && 9698 btrfs_inode_in_log(new_inode, root->fs_info->generation))) 9699 btrfs_set_log_full_commit(root->fs_info, trans); 9700 9701 if (root_log_pinned) { 9702 btrfs_end_log_trans(root); 9703 root_log_pinned = false; 9704 } 9705 if (dest_log_pinned) { 9706 btrfs_end_log_trans(dest); 9707 dest_log_pinned = false; 9708 } 9709 } 9710 ret = btrfs_end_transaction(trans, root); 9711 out_notrans: 9712 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 9713 up_read(&dest->fs_info->subvol_sem); 9714 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9715 up_read(&root->fs_info->subvol_sem); 9716 9717 return ret; 9718 } 9719 9720 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, 9721 struct btrfs_root *root, 9722 struct inode *dir, 9723 struct dentry *dentry) 9724 { 9725 int ret; 9726 struct inode *inode; 9727 u64 objectid; 9728 u64 index; 9729 9730 ret = btrfs_find_free_ino(root, &objectid); 9731 if (ret) 9732 return ret; 9733 9734 inode = btrfs_new_inode(trans, root, dir, 9735 dentry->d_name.name, 9736 dentry->d_name.len, 9737 btrfs_ino(dir), 9738 objectid, 9739 S_IFCHR | WHITEOUT_MODE, 9740 &index); 9741 9742 if (IS_ERR(inode)) { 9743 ret = PTR_ERR(inode); 9744 return ret; 9745 } 9746 9747 inode->i_op = &btrfs_special_inode_operations; 9748 init_special_inode(inode, inode->i_mode, 9749 WHITEOUT_DEV); 9750 9751 ret = btrfs_init_inode_security(trans, inode, dir, 9752 &dentry->d_name); 9753 if (ret) 9754 goto out; 9755 9756 ret = btrfs_add_nondir(trans, dir, dentry, 9757 inode, 0, index); 9758 if (ret) 9759 goto out; 9760 9761 ret = btrfs_update_inode(trans, root, inode); 9762 out: 9763 unlock_new_inode(inode); 9764 if (ret) 9765 inode_dec_link_count(inode); 9766 iput(inode); 9767 9768 return ret; 9769 } 9770 9771 static int btrfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
9772 			 struct inode *new_dir, struct dentry *new_dentry,
9773 			 unsigned int flags)
9774 {
9775 	struct btrfs_trans_handle *trans;
9776 	unsigned int trans_num_items;
9777 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9778 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9779 	struct inode *new_inode = d_inode(new_dentry);
9780 	struct inode *old_inode = d_inode(old_dentry);
9781 	u64 index = 0;
9782 	u64 root_objectid;
9783 	int ret;
9784 	u64 old_ino = btrfs_ino(old_inode);
9785 	bool log_pinned = false;
9786 
9787 	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9788 		return -EPERM;
9789 
9790 	/* we only allow renaming subvolume links between subvolumes */
9791 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9792 		return -EXDEV;
9793 
9794 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9795 	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
9796 		return -ENOTEMPTY;
9797 
9798 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9799 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9800 		return -ENOTEMPTY;
9801 
9802 
9803 	/* check for collisions, even if the name isn't there */
9804 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9805 					     new_dentry->d_name.name,
9806 					     new_dentry->d_name.len);
9807 
9808 	if (ret) {
9809 		if (ret == -EEXIST) {
9810 			/* we shouldn't get
9811 			 * EEXIST without a new_inode */
9812 			if (WARN_ON(!new_inode)) {
9813 				return ret;
9814 			}
9815 		} else {
9816 			/* maybe -EOVERFLOW */
9817 			return ret;
9818 		}
9819 	}
9820 	ret = 0;
9821 
9822 	/*
9823 	 * We're using rename to replace one file with another. Start IO on it
9824 	 * now so we don't add too much work to the end of the transaction.
9825 	 */
9826 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9827 		filemap_flush(old_inode->i_mapping);
9828 
9829 	/* close the racy window with snapshot create/destroy ioctl */
9830 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9831 		down_read(&root->fs_info->subvol_sem);
9832 	/*
9833 	 * We want to reserve the absolute worst case amount of items. So if
9834 	 * both inodes are subvols and we need to unlink them then that would
9835 	 * require 4 item modifications, but if they are both normal inodes it
9836 	 * would require 5 item modifications, so we'll assume they are normal
9837 	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9838 	 * should cover the worst case number of items we'll modify.
9839 	 * If our rename has the whiteout flag, we need 5 more units for the
9840 	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9841 	 * when selinux is enabled).
9842 	 */
9843 	trans_num_items = 11;
9844 	if (flags & RENAME_WHITEOUT)
9845 		trans_num_items += 5;
9846 	trans = btrfs_start_transaction(root, trans_num_items);
9847 	if (IS_ERR(trans)) {
9848 		ret = PTR_ERR(trans);
9849 		goto out_notrans;
9850 	}
9851 
9852 	if (dest != root)
9853 		btrfs_record_root_in_trans(trans, dest);
9854 
9855 	ret = btrfs_set_inode_index(new_dir, &index);
9856 	if (ret)
9857 		goto out_fail;
9858 
9859 	BTRFS_I(old_inode)->dir_index = 0ULL;
9860 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9861 		/* force full log commit if subvolume involved.
*/ 9862 btrfs_set_log_full_commit(root->fs_info, trans); 9863 } else { 9864 btrfs_pin_log_trans(root); 9865 log_pinned = true; 9866 ret = btrfs_insert_inode_ref(trans, dest, 9867 new_dentry->d_name.name, 9868 new_dentry->d_name.len, 9869 old_ino, 9870 btrfs_ino(new_dir), index); 9871 if (ret) 9872 goto out_fail; 9873 } 9874 9875 inode_inc_iversion(old_dir); 9876 inode_inc_iversion(new_dir); 9877 inode_inc_iversion(old_inode); 9878 old_dir->i_ctime = old_dir->i_mtime = 9879 new_dir->i_ctime = new_dir->i_mtime = 9880 old_inode->i_ctime = current_time(old_dir); 9881 9882 if (old_dentry->d_parent != new_dentry->d_parent) 9883 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 9884 9885 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9886 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 9887 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 9888 old_dentry->d_name.name, 9889 old_dentry->d_name.len); 9890 } else { 9891 ret = __btrfs_unlink_inode(trans, root, old_dir, 9892 d_inode(old_dentry), 9893 old_dentry->d_name.name, 9894 old_dentry->d_name.len); 9895 if (!ret) 9896 ret = btrfs_update_inode(trans, root, old_inode); 9897 } 9898 if (ret) { 9899 btrfs_abort_transaction(trans, ret); 9900 goto out_fail; 9901 } 9902 9903 if (new_inode) { 9904 inode_inc_iversion(new_inode); 9905 new_inode->i_ctime = current_time(new_inode); 9906 if (unlikely(btrfs_ino(new_inode) == 9907 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 9908 root_objectid = BTRFS_I(new_inode)->location.objectid; 9909 ret = btrfs_unlink_subvol(trans, dest, new_dir, 9910 root_objectid, 9911 new_dentry->d_name.name, 9912 new_dentry->d_name.len); 9913 BUG_ON(new_inode->i_nlink == 0); 9914 } else { 9915 ret = btrfs_unlink_inode(trans, dest, new_dir, 9916 d_inode(new_dentry), 9917 new_dentry->d_name.name, 9918 new_dentry->d_name.len); 9919 } 9920 if (!ret && new_inode->i_nlink == 0) 9921 ret = btrfs_orphan_add(trans, d_inode(new_dentry)); 9922 if (ret) { 9923 btrfs_abort_transaction(trans, ret); 9924 goto out_fail; 9925 } 9926 } 9927 9928 ret = btrfs_add_link(trans, new_dir, old_inode, 9929 new_dentry->d_name.name, 9930 new_dentry->d_name.len, 0, index); 9931 if (ret) { 9932 btrfs_abort_transaction(trans, ret); 9933 goto out_fail; 9934 } 9935 9936 if (old_inode->i_nlink == 1) 9937 BTRFS_I(old_inode)->dir_index = index; 9938 9939 if (log_pinned) { 9940 struct dentry *parent = new_dentry->d_parent; 9941 9942 btrfs_log_new_name(trans, old_inode, old_dir, parent); 9943 btrfs_end_log_trans(root); 9944 log_pinned = false; 9945 } 9946 9947 if (flags & RENAME_WHITEOUT) { 9948 ret = btrfs_whiteout_for_rename(trans, root, old_dir, 9949 old_dentry); 9950 9951 if (ret) { 9952 btrfs_abort_transaction(trans, ret); 9953 goto out_fail; 9954 } 9955 } 9956 out_fail: 9957 /* 9958 * If we have pinned the log and an error happened, we unpin tasks 9959 * trying to sync the log and force them to fallback to a transaction 9960 * commit if the log currently contains any of the inodes involved in 9961 * this rename operation (to ensure we do not persist a log with an 9962 * inconsistent state for any of these inodes or leading to any 9963 * inconsistencies when replayed). If the transaction was aborted, the 9964 * abortion reason is propagated to userspace when attempting to commit 9965 * the transaction. If the log does not contain any of these inodes, we 9966 * allow the tasks to sync it. 
9967 */ 9968 if (ret && log_pinned) { 9969 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || 9970 btrfs_inode_in_log(new_dir, root->fs_info->generation) || 9971 btrfs_inode_in_log(old_inode, root->fs_info->generation) || 9972 (new_inode && 9973 btrfs_inode_in_log(new_inode, root->fs_info->generation))) 9974 btrfs_set_log_full_commit(root->fs_info, trans); 9975 9976 btrfs_end_log_trans(root); 9977 log_pinned = false; 9978 } 9979 btrfs_end_transaction(trans, root); 9980 out_notrans: 9981 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9982 up_read(&root->fs_info->subvol_sem); 9983 9984 return ret; 9985 } 9986 9987 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, 9988 struct inode *new_dir, struct dentry *new_dentry, 9989 unsigned int flags) 9990 { 9991 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 9992 return -EINVAL; 9993 9994 if (flags & RENAME_EXCHANGE) 9995 return btrfs_rename_exchange(old_dir, old_dentry, new_dir, 9996 new_dentry); 9997 9998 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); 9999 } 10000 10001 static void btrfs_run_delalloc_work(struct btrfs_work *work) 10002 { 10003 struct btrfs_delalloc_work *delalloc_work; 10004 struct inode *inode; 10005 10006 delalloc_work = container_of(work, struct btrfs_delalloc_work, 10007 work); 10008 inode = delalloc_work->inode; 10009 filemap_flush(inode->i_mapping); 10010 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 10011 &BTRFS_I(inode)->runtime_flags)) 10012 filemap_flush(inode->i_mapping); 10013 10014 if (delalloc_work->delay_iput) 10015 btrfs_add_delayed_iput(inode); 10016 else 10017 iput(inode); 10018 complete(&delalloc_work->completion); 10019 } 10020 10021 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 10022 int delay_iput) 10023 { 10024 struct btrfs_delalloc_work *work; 10025 10026 work = kmalloc(sizeof(*work), GFP_NOFS); 10027 if (!work) 10028 return NULL; 10029 10030 init_completion(&work->completion); 10031 INIT_LIST_HEAD(&work->list); 10032 work->inode = inode; 10033 work->delay_iput = delay_iput; 10034 WARN_ON_ONCE(!inode); 10035 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, 10036 btrfs_run_delalloc_work, NULL, NULL); 10037 10038 return work; 10039 } 10040 10041 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) 10042 { 10043 wait_for_completion(&work->completion); 10044 kfree(work); 10045 } 10046 10047 /* 10048 * some fairly slow code that needs optimization. This walks the list 10049 * of all the inodes with pending delalloc and forces them to disk. 
10050 */ 10051 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, 10052 int nr) 10053 { 10054 struct btrfs_inode *binode; 10055 struct inode *inode; 10056 struct btrfs_delalloc_work *work, *next; 10057 struct list_head works; 10058 struct list_head splice; 10059 int ret = 0; 10060 10061 INIT_LIST_HEAD(&works); 10062 INIT_LIST_HEAD(&splice); 10063 10064 mutex_lock(&root->delalloc_mutex); 10065 spin_lock(&root->delalloc_lock); 10066 list_splice_init(&root->delalloc_inodes, &splice); 10067 while (!list_empty(&splice)) { 10068 binode = list_entry(splice.next, struct btrfs_inode, 10069 delalloc_inodes); 10070 10071 list_move_tail(&binode->delalloc_inodes, 10072 &root->delalloc_inodes); 10073 inode = igrab(&binode->vfs_inode); 10074 if (!inode) { 10075 cond_resched_lock(&root->delalloc_lock); 10076 continue; 10077 } 10078 spin_unlock(&root->delalloc_lock); 10079 10080 work = btrfs_alloc_delalloc_work(inode, delay_iput); 10081 if (!work) { 10082 if (delay_iput) 10083 btrfs_add_delayed_iput(inode); 10084 else 10085 iput(inode); 10086 ret = -ENOMEM; 10087 goto out; 10088 } 10089 list_add_tail(&work->list, &works); 10090 btrfs_queue_work(root->fs_info->flush_workers, 10091 &work->work); 10092 ret++; 10093 if (nr != -1 && ret >= nr) 10094 goto out; 10095 cond_resched(); 10096 spin_lock(&root->delalloc_lock); 10097 } 10098 spin_unlock(&root->delalloc_lock); 10099 10100 out: 10101 list_for_each_entry_safe(work, next, &works, list) { 10102 list_del_init(&work->list); 10103 btrfs_wait_and_free_delalloc_work(work); 10104 } 10105 10106 if (!list_empty_careful(&splice)) { 10107 spin_lock(&root->delalloc_lock); 10108 list_splice_tail(&splice, &root->delalloc_inodes); 10109 spin_unlock(&root->delalloc_lock); 10110 } 10111 mutex_unlock(&root->delalloc_mutex); 10112 return ret; 10113 } 10114 10115 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 10116 { 10117 int ret; 10118 10119 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 10120 return -EROFS; 10121 10122 ret = __start_delalloc_inodes(root, delay_iput, -1); 10123 if (ret > 0) 10124 ret = 0; 10125 /* 10126 * the filemap_flush will queue IO into the worker threads, but 10127 * we have to make sure the IO is actually started and that 10128 * ordered extents get created before we return 10129 */ 10130 atomic_inc(&root->fs_info->async_submit_draining); 10131 while (atomic_read(&root->fs_info->nr_async_submits) || 10132 atomic_read(&root->fs_info->async_delalloc_pages)) { 10133 wait_event(root->fs_info->async_submit_wait, 10134 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 10135 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 10136 } 10137 atomic_dec(&root->fs_info->async_submit_draining); 10138 return ret; 10139 } 10140 10141 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, 10142 int nr) 10143 { 10144 struct btrfs_root *root; 10145 struct list_head splice; 10146 int ret; 10147 10148 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 10149 return -EROFS; 10150 10151 INIT_LIST_HEAD(&splice); 10152 10153 mutex_lock(&fs_info->delalloc_root_mutex); 10154 spin_lock(&fs_info->delalloc_root_lock); 10155 list_splice_init(&fs_info->delalloc_roots, &splice); 10156 while (!list_empty(&splice) && nr) { 10157 root = list_first_entry(&splice, struct btrfs_root, 10158 delalloc_root); 10159 root = btrfs_grab_fs_root(root); 10160 BUG_ON(!root); 10161 list_move_tail(&root->delalloc_root, 10162 &fs_info->delalloc_roots); 10163 
spin_unlock(&fs_info->delalloc_root_lock); 10164 10165 ret = __start_delalloc_inodes(root, delay_iput, nr); 10166 btrfs_put_fs_root(root); 10167 if (ret < 0) 10168 goto out; 10169 10170 if (nr != -1) { 10171 nr -= ret; 10172 WARN_ON(nr < 0); 10173 } 10174 spin_lock(&fs_info->delalloc_root_lock); 10175 } 10176 spin_unlock(&fs_info->delalloc_root_lock); 10177 10178 ret = 0; 10179 atomic_inc(&fs_info->async_submit_draining); 10180 while (atomic_read(&fs_info->nr_async_submits) || 10181 atomic_read(&fs_info->async_delalloc_pages)) { 10182 wait_event(fs_info->async_submit_wait, 10183 (atomic_read(&fs_info->nr_async_submits) == 0 && 10184 atomic_read(&fs_info->async_delalloc_pages) == 0)); 10185 } 10186 atomic_dec(&fs_info->async_submit_draining); 10187 out: 10188 if (!list_empty_careful(&splice)) { 10189 spin_lock(&fs_info->delalloc_root_lock); 10190 list_splice_tail(&splice, &fs_info->delalloc_roots); 10191 spin_unlock(&fs_info->delalloc_root_lock); 10192 } 10193 mutex_unlock(&fs_info->delalloc_root_mutex); 10194 return ret; 10195 } 10196 10197 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 10198 const char *symname) 10199 { 10200 struct btrfs_trans_handle *trans; 10201 struct btrfs_root *root = BTRFS_I(dir)->root; 10202 struct btrfs_path *path; 10203 struct btrfs_key key; 10204 struct inode *inode = NULL; 10205 int err; 10206 int drop_inode = 0; 10207 u64 objectid; 10208 u64 index = 0; 10209 int name_len; 10210 int datasize; 10211 unsigned long ptr; 10212 struct btrfs_file_extent_item *ei; 10213 struct extent_buffer *leaf; 10214 10215 name_len = strlen(symname); 10216 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 10217 return -ENAMETOOLONG; 10218 10219 /* 10220 * 2 items for inode item and ref 10221 * 2 items for dir items 10222 * 1 item for updating parent inode item 10223 * 1 item for the inline extent item 10224 * 1 item for xattr if selinux is on 10225 */ 10226 trans = btrfs_start_transaction(root, 7); 10227 if (IS_ERR(trans)) 10228 return PTR_ERR(trans); 10229 10230 err = btrfs_find_free_ino(root, &objectid); 10231 if (err) 10232 goto out_unlock; 10233 10234 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 10235 dentry->d_name.len, btrfs_ino(dir), objectid, 10236 S_IFLNK|S_IRWXUGO, &index); 10237 if (IS_ERR(inode)) { 10238 err = PTR_ERR(inode); 10239 goto out_unlock; 10240 } 10241 10242 /* 10243 * If the active LSM wants to access the inode during 10244 * d_instantiate it needs these. Smack checks to see 10245 * if the filesystem supports xattrs by looking at the 10246 * ops vector. 
10247 	 */
10248 	inode->i_fop = &btrfs_file_operations;
10249 	inode->i_op = &btrfs_file_inode_operations;
10250 	inode->i_mapping->a_ops = &btrfs_aops;
10251 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10252 
10253 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10254 	if (err)
10255 		goto out_unlock_inode;
10256 
10257 	path = btrfs_alloc_path();
10258 	if (!path) {
10259 		err = -ENOMEM;
10260 		goto out_unlock_inode;
10261 	}
10262 	key.objectid = btrfs_ino(inode);
10263 	key.offset = 0;
10264 	key.type = BTRFS_EXTENT_DATA_KEY;
10265 	datasize = btrfs_file_extent_calc_inline_size(name_len);
10266 	err = btrfs_insert_empty_item(trans, root, path, &key,
10267 				      datasize);
10268 	if (err) {
10269 		btrfs_free_path(path);
10270 		goto out_unlock_inode;
10271 	}
10272 	leaf = path->nodes[0];
10273 	ei = btrfs_item_ptr(leaf, path->slots[0],
10274 			    struct btrfs_file_extent_item);
10275 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10276 	btrfs_set_file_extent_type(leaf, ei,
10277 				   BTRFS_FILE_EXTENT_INLINE);
10278 	btrfs_set_file_extent_encryption(leaf, ei, 0);
10279 	btrfs_set_file_extent_compression(leaf, ei, 0);
10280 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10281 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10282 
10283 	ptr = btrfs_file_extent_inline_start(ei);
10284 	write_extent_buffer(leaf, symname, ptr, name_len);
10285 	btrfs_mark_buffer_dirty(leaf);
10286 	btrfs_free_path(path);
10287 
10288 	inode->i_op = &btrfs_symlink_inode_operations;
10289 	inode_nohighmem(inode);
10290 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
10291 	inode_set_bytes(inode, name_len);
10292 	btrfs_i_size_write(inode, name_len);
10293 	err = btrfs_update_inode(trans, root, inode);
10294 	/*
10295 	 * Last step: add the directory indexes for our symlink inode. We do
10296 	 * this last to avoid extra cleanup of these indexes if an error
10297 	 * happens elsewhere above.
10298 */ 10299 if (!err) 10300 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 10301 if (err) { 10302 drop_inode = 1; 10303 goto out_unlock_inode; 10304 } 10305 10306 unlock_new_inode(inode); 10307 d_instantiate(dentry, inode); 10308 10309 out_unlock: 10310 btrfs_end_transaction(trans, root); 10311 if (drop_inode) { 10312 inode_dec_link_count(inode); 10313 iput(inode); 10314 } 10315 btrfs_btree_balance_dirty(root); 10316 return err; 10317 10318 out_unlock_inode: 10319 drop_inode = 1; 10320 unlock_new_inode(inode); 10321 goto out_unlock; 10322 } 10323 10324 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 10325 u64 start, u64 num_bytes, u64 min_size, 10326 loff_t actual_len, u64 *alloc_hint, 10327 struct btrfs_trans_handle *trans) 10328 { 10329 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 10330 struct extent_map *em; 10331 struct btrfs_root *root = BTRFS_I(inode)->root; 10332 struct btrfs_key ins; 10333 u64 cur_offset = start; 10334 u64 i_size; 10335 u64 cur_bytes; 10336 u64 last_alloc = (u64)-1; 10337 int ret = 0; 10338 bool own_trans = true; 10339 u64 end = start + num_bytes - 1; 10340 10341 if (trans) 10342 own_trans = false; 10343 while (num_bytes > 0) { 10344 if (own_trans) { 10345 trans = btrfs_start_transaction(root, 3); 10346 if (IS_ERR(trans)) { 10347 ret = PTR_ERR(trans); 10348 break; 10349 } 10350 } 10351 10352 cur_bytes = min_t(u64, num_bytes, SZ_256M); 10353 cur_bytes = max(cur_bytes, min_size); 10354 /* 10355 * If we are severely fragmented we could end up with really 10356 * small allocations, so if the allocator is returning small 10357 * chunks lets make its job easier by only searching for those 10358 * sized chunks. 10359 */ 10360 cur_bytes = min(cur_bytes, last_alloc); 10361 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, 10362 min_size, 0, *alloc_hint, &ins, 1, 0); 10363 if (ret) { 10364 if (own_trans) 10365 btrfs_end_transaction(trans, root); 10366 break; 10367 } 10368 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); 10369 10370 last_alloc = ins.offset; 10371 ret = insert_reserved_file_extent(trans, inode, 10372 cur_offset, ins.objectid, 10373 ins.offset, ins.offset, 10374 ins.offset, 0, 0, 0, 10375 BTRFS_FILE_EXTENT_PREALLOC); 10376 if (ret) { 10377 btrfs_free_reserved_extent(root, ins.objectid, 10378 ins.offset, 0); 10379 btrfs_abort_transaction(trans, ret); 10380 if (own_trans) 10381 btrfs_end_transaction(trans, root); 10382 break; 10383 } 10384 10385 btrfs_drop_extent_cache(inode, cur_offset, 10386 cur_offset + ins.offset -1, 0); 10387 10388 em = alloc_extent_map(); 10389 if (!em) { 10390 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 10391 &BTRFS_I(inode)->runtime_flags); 10392 goto next; 10393 } 10394 10395 em->start = cur_offset; 10396 em->orig_start = cur_offset; 10397 em->len = ins.offset; 10398 em->block_start = ins.objectid; 10399 em->block_len = ins.offset; 10400 em->orig_block_len = ins.offset; 10401 em->ram_bytes = ins.offset; 10402 em->bdev = root->fs_info->fs_devices->latest_bdev; 10403 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 10404 em->generation = trans->transid; 10405 10406 while (1) { 10407 write_lock(&em_tree->lock); 10408 ret = add_extent_mapping(em_tree, em, 1); 10409 write_unlock(&em_tree->lock); 10410 if (ret != -EEXIST) 10411 break; 10412 btrfs_drop_extent_cache(inode, cur_offset, 10413 cur_offset + ins.offset - 1, 10414 0); 10415 } 10416 free_extent_map(em); 10417 next: 10418 num_bytes -= ins.offset; 10419 cur_offset += ins.offset; 10420 *alloc_hint = ins.objectid + 
ins.offset; 10421 10422 inode_inc_iversion(inode); 10423 inode->i_ctime = current_time(inode); 10424 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 10425 if (!(mode & FALLOC_FL_KEEP_SIZE) && 10426 (actual_len > inode->i_size) && 10427 (cur_offset > inode->i_size)) { 10428 if (cur_offset > actual_len) 10429 i_size = actual_len; 10430 else 10431 i_size = cur_offset; 10432 i_size_write(inode, i_size); 10433 btrfs_ordered_update_i_size(inode, i_size, NULL); 10434 } 10435 10436 ret = btrfs_update_inode(trans, root, inode); 10437 10438 if (ret) { 10439 btrfs_abort_transaction(trans, ret); 10440 if (own_trans) 10441 btrfs_end_transaction(trans, root); 10442 break; 10443 } 10444 10445 if (own_trans) 10446 btrfs_end_transaction(trans, root); 10447 } 10448 if (cur_offset < end) 10449 btrfs_free_reserved_data_space(inode, cur_offset, 10450 end - cur_offset + 1); 10451 return ret; 10452 } 10453 10454 int btrfs_prealloc_file_range(struct inode *inode, int mode, 10455 u64 start, u64 num_bytes, u64 min_size, 10456 loff_t actual_len, u64 *alloc_hint) 10457 { 10458 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10459 min_size, actual_len, alloc_hint, 10460 NULL); 10461 } 10462 10463 int btrfs_prealloc_file_range_trans(struct inode *inode, 10464 struct btrfs_trans_handle *trans, int mode, 10465 u64 start, u64 num_bytes, u64 min_size, 10466 loff_t actual_len, u64 *alloc_hint) 10467 { 10468 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 10469 min_size, actual_len, alloc_hint, trans); 10470 } 10471 10472 static int btrfs_set_page_dirty(struct page *page) 10473 { 10474 return __set_page_dirty_nobuffers(page); 10475 } 10476 10477 static int btrfs_permission(struct inode *inode, int mask) 10478 { 10479 struct btrfs_root *root = BTRFS_I(inode)->root; 10480 umode_t mode = inode->i_mode; 10481 10482 if (mask & MAY_WRITE && 10483 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 10484 if (btrfs_root_readonly(root)) 10485 return -EROFS; 10486 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 10487 return -EACCES; 10488 } 10489 return generic_permission(inode, mask); 10490 } 10491 10492 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 10493 { 10494 struct btrfs_trans_handle *trans; 10495 struct btrfs_root *root = BTRFS_I(dir)->root; 10496 struct inode *inode = NULL; 10497 u64 objectid; 10498 u64 index; 10499 int ret = 0; 10500 10501 /* 10502 * 5 units required for adding orphan entry 10503 */ 10504 trans = btrfs_start_transaction(root, 5); 10505 if (IS_ERR(trans)) 10506 return PTR_ERR(trans); 10507 10508 ret = btrfs_find_free_ino(root, &objectid); 10509 if (ret) 10510 goto out; 10511 10512 inode = btrfs_new_inode(trans, root, dir, NULL, 0, 10513 btrfs_ino(dir), objectid, mode, &index); 10514 if (IS_ERR(inode)) { 10515 ret = PTR_ERR(inode); 10516 inode = NULL; 10517 goto out; 10518 } 10519 10520 inode->i_fop = &btrfs_file_operations; 10521 inode->i_op = &btrfs_file_inode_operations; 10522 10523 inode->i_mapping->a_ops = &btrfs_aops; 10524 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 10525 10526 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 10527 if (ret) 10528 goto out_inode; 10529 10530 ret = btrfs_update_inode(trans, root, inode); 10531 if (ret) 10532 goto out_inode; 10533 ret = btrfs_orphan_add(trans, inode); 10534 if (ret) 10535 goto out_inode; 10536 10537 /* 10538 * We set number of links to 0 in btrfs_new_inode(), and here we set 10539 * it to 1 because d_tmpfile() will issue a warning if the count is 0, 10540 * through: 
10541 * 10542 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 10543 */ 10544 set_nlink(inode, 1); 10545 unlock_new_inode(inode); 10546 d_tmpfile(dentry, inode); 10547 mark_inode_dirty(inode); 10548 10549 out: 10550 btrfs_end_transaction(trans, root); 10551 if (ret) 10552 iput(inode); 10553 btrfs_balance_delayed_items(root); 10554 btrfs_btree_balance_dirty(root); 10555 return ret; 10556 10557 out_inode: 10558 unlock_new_inode(inode); 10559 goto out; 10560 10561 } 10562 10563 static const struct inode_operations btrfs_dir_inode_operations = { 10564 .getattr = btrfs_getattr, 10565 .lookup = btrfs_lookup, 10566 .create = btrfs_create, 10567 .unlink = btrfs_unlink, 10568 .link = btrfs_link, 10569 .mkdir = btrfs_mkdir, 10570 .rmdir = btrfs_rmdir, 10571 .rename = btrfs_rename2, 10572 .symlink = btrfs_symlink, 10573 .setattr = btrfs_setattr, 10574 .mknod = btrfs_mknod, 10575 .listxattr = btrfs_listxattr, 10576 .permission = btrfs_permission, 10577 .get_acl = btrfs_get_acl, 10578 .set_acl = btrfs_set_acl, 10579 .update_time = btrfs_update_time, 10580 .tmpfile = btrfs_tmpfile, 10581 }; 10582 static const struct inode_operations btrfs_dir_ro_inode_operations = { 10583 .lookup = btrfs_lookup, 10584 .permission = btrfs_permission, 10585 .get_acl = btrfs_get_acl, 10586 .set_acl = btrfs_set_acl, 10587 .update_time = btrfs_update_time, 10588 }; 10589 10590 static const struct file_operations btrfs_dir_file_operations = { 10591 .llseek = generic_file_llseek, 10592 .read = generic_read_dir, 10593 .iterate_shared = btrfs_real_readdir, 10594 .unlocked_ioctl = btrfs_ioctl, 10595 #ifdef CONFIG_COMPAT 10596 .compat_ioctl = btrfs_compat_ioctl, 10597 #endif 10598 .release = btrfs_release_file, 10599 .fsync = btrfs_sync_file, 10600 }; 10601 10602 static const struct extent_io_ops btrfs_extent_io_ops = { 10603 .fill_delalloc = run_delalloc_range, 10604 .submit_bio_hook = btrfs_submit_bio_hook, 10605 .merge_bio_hook = btrfs_merge_bio_hook, 10606 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 10607 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 10608 .writepage_start_hook = btrfs_writepage_start_hook, 10609 .set_bit_hook = btrfs_set_bit_hook, 10610 .clear_bit_hook = btrfs_clear_bit_hook, 10611 .merge_extent_hook = btrfs_merge_extent_hook, 10612 .split_extent_hook = btrfs_split_extent_hook, 10613 }; 10614 10615 /* 10616 * btrfs doesn't support the bmap operation because swapfiles 10617 * use bmap to make a mapping of extents in the file. They assume 10618 * these extents won't change over the life of the file and they 10619 * use the bmap result to do IO directly to the drive. 10620 * 10621 * the btrfs bmap call would return logical addresses that aren't 10622 * suitable for IO and they also will change frequently as COW 10623 * operations happen. So, swapfile + btrfs == corruption. 10624 * 10625 * For now we're avoiding this by dropping bmap. 
10626 */ 10627 static const struct address_space_operations btrfs_aops = { 10628 .readpage = btrfs_readpage, 10629 .writepage = btrfs_writepage, 10630 .writepages = btrfs_writepages, 10631 .readpages = btrfs_readpages, 10632 .direct_IO = btrfs_direct_IO, 10633 .invalidatepage = btrfs_invalidatepage, 10634 .releasepage = btrfs_releasepage, 10635 .set_page_dirty = btrfs_set_page_dirty, 10636 .error_remove_page = generic_error_remove_page, 10637 }; 10638 10639 static const struct address_space_operations btrfs_symlink_aops = { 10640 .readpage = btrfs_readpage, 10641 .writepage = btrfs_writepage, 10642 .invalidatepage = btrfs_invalidatepage, 10643 .releasepage = btrfs_releasepage, 10644 }; 10645 10646 static const struct inode_operations btrfs_file_inode_operations = { 10647 .getattr = btrfs_getattr, 10648 .setattr = btrfs_setattr, 10649 .listxattr = btrfs_listxattr, 10650 .permission = btrfs_permission, 10651 .fiemap = btrfs_fiemap, 10652 .get_acl = btrfs_get_acl, 10653 .set_acl = btrfs_set_acl, 10654 .update_time = btrfs_update_time, 10655 }; 10656 static const struct inode_operations btrfs_special_inode_operations = { 10657 .getattr = btrfs_getattr, 10658 .setattr = btrfs_setattr, 10659 .permission = btrfs_permission, 10660 .listxattr = btrfs_listxattr, 10661 .get_acl = btrfs_get_acl, 10662 .set_acl = btrfs_set_acl, 10663 .update_time = btrfs_update_time, 10664 }; 10665 static const struct inode_operations btrfs_symlink_inode_operations = { 10666 .readlink = generic_readlink, 10667 .get_link = page_get_link, 10668 .getattr = btrfs_getattr, 10669 .setattr = btrfs_setattr, 10670 .permission = btrfs_permission, 10671 .listxattr = btrfs_listxattr, 10672 .update_time = btrfs_update_time, 10673 }; 10674 10675 const struct dentry_operations btrfs_dentry_operations = { 10676 .d_delete = btrfs_dentry_delete, 10677 .d_release = btrfs_dentry_release, 10678 }; 10679
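
A minimal userspace sketch, assuming a kernel with renameat2() support (Linux 3.15+) and hypothetical paths "a", "b" and "c" that already exist, showing how the flags validated by btrfs_rename2() above are exercised from user space; the rename2() helper name is illustrative, and the raw syscall() is used because older glibc versions lack a renameat2() wrapper:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)	/* fail instead of replacing the target */
#define RENAME_EXCHANGE  (1 << 1)	/* atomically swap source and target */
#define RENAME_WHITEOUT  (1 << 2)	/* leave a whiteout object at the source */
#endif

/* illustrative helper: forward to the renameat2 syscall with AT_FDCWD */
static long rename2(const char *oldpath, const char *newpath, unsigned int flags)
{
	return syscall(SYS_renameat2, AT_FDCWD, oldpath, AT_FDCWD, newpath, flags);
}

int main(void)
{
	/* RENAME_EXCHANGE: atomic swap, dispatched to btrfs_rename_exchange() */
	if (rename2("a", "b", RENAME_EXCHANGE) < 0)
		perror("RENAME_EXCHANGE");

	/*
	 * RENAME_WHITEOUT: rename and leave a whiteout character device (0, 0)
	 * at the old name, created by btrfs_whiteout_for_rename() above.
	 */
	if (rename2("a", "c", RENAME_WHITEOUT) < 0)
		perror("RENAME_WHITEOUT");

	/* RENAME_NOREPLACE: fail with EEXIST rather than replacing "c" */
	if (rename2("b", "c", RENAME_NOREPLACE) < 0 && errno == EEXIST)
		fprintf(stderr, "target exists, not replaced\n");

	return 0;
}

Note that the VFS rejects RENAME_EXCHANGE combined with RENAME_NOREPLACE or RENAME_WHITEOUT with -EINVAL before the filesystem's rename handler is ever called, which is why btrfs_rename2() only needs to check for RENAME_EXCHANGE and can pass the remaining flags straight to btrfs_rename().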