/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"

/*
 * Pair of (key, root) pointers bundled into one argument.
 * NOTE(review): presumably handed to the iget test/set callbacks; the
 * users are not visible in this part of the file -- confirm.
 */
struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

/* operation tables that are defined further down in this file */
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

/*
 * slab caches; the first two are private to this file, the others have
 * external linkage
 */
static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

/*
 * Translation table from the S_IF* file type bits, shifted right by
 * S_SHIFT, to the matching BTRFS_FT_* value.  Slots without a
 * designator are zero initialised.
 */
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

/* forward declarations for helpers that are used before their definition */
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);

static int btrfs_dirty_inode(struct inode *inode);

static int
btrfs_init_inode_security(struct btrfs_trans_handle *trans, 112 struct inode *inode, struct inode *dir, 113 const struct qstr *qstr) 114 { 115 int err; 116 117 err = btrfs_init_acl(trans, inode, dir); 118 if (!err) 119 err = btrfs_xattr_security_init(trans, inode, dir, qstr); 120 return err; 121 } 122 123 /* 124 * this does all the hard work for inserting an inline extent into 125 * the btree. The caller should have done a btrfs_drop_extents so that 126 * no overlapping inline items exist in the btree 127 */ 128 static int insert_inline_extent(struct btrfs_trans_handle *trans, 129 struct btrfs_path *path, int extent_inserted, 130 struct btrfs_root *root, struct inode *inode, 131 u64 start, size_t size, size_t compressed_size, 132 int compress_type, 133 struct page **compressed_pages) 134 { 135 struct extent_buffer *leaf; 136 struct page *page = NULL; 137 char *kaddr; 138 unsigned long ptr; 139 struct btrfs_file_extent_item *ei; 140 int err = 0; 141 int ret; 142 size_t cur_size = size; 143 unsigned long offset; 144 145 if (compressed_size && compressed_pages) 146 cur_size = compressed_size; 147 148 inode_add_bytes(inode, size); 149 150 if (!extent_inserted) { 151 struct btrfs_key key; 152 size_t datasize; 153 154 key.objectid = btrfs_ino(inode); 155 key.offset = start; 156 key.type = BTRFS_EXTENT_DATA_KEY; 157 158 datasize = btrfs_file_extent_calc_inline_size(cur_size); 159 path->leave_spinning = 1; 160 ret = btrfs_insert_empty_item(trans, root, path, &key, 161 datasize); 162 if (ret) { 163 err = ret; 164 goto fail; 165 } 166 } 167 leaf = path->nodes[0]; 168 ei = btrfs_item_ptr(leaf, path->slots[0], 169 struct btrfs_file_extent_item); 170 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 171 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 172 btrfs_set_file_extent_encryption(leaf, ei, 0); 173 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 174 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 175 ptr = btrfs_file_extent_inline_start(ei); 
176 177 if (compress_type != BTRFS_COMPRESS_NONE) { 178 struct page *cpage; 179 int i = 0; 180 while (compressed_size > 0) { 181 cpage = compressed_pages[i]; 182 cur_size = min_t(unsigned long, compressed_size, 183 PAGE_CACHE_SIZE); 184 185 kaddr = kmap_atomic(cpage); 186 write_extent_buffer(leaf, kaddr, ptr, cur_size); 187 kunmap_atomic(kaddr); 188 189 i++; 190 ptr += cur_size; 191 compressed_size -= cur_size; 192 } 193 btrfs_set_file_extent_compression(leaf, ei, 194 compress_type); 195 } else { 196 page = find_get_page(inode->i_mapping, 197 start >> PAGE_CACHE_SHIFT); 198 btrfs_set_file_extent_compression(leaf, ei, 0); 199 kaddr = kmap_atomic(page); 200 offset = start & (PAGE_CACHE_SIZE - 1); 201 write_extent_buffer(leaf, kaddr + offset, ptr, size); 202 kunmap_atomic(kaddr); 203 page_cache_release(page); 204 } 205 btrfs_mark_buffer_dirty(leaf); 206 btrfs_release_path(path); 207 208 /* 209 * we're an inline extent, so nobody can 210 * extend the file past i_size without locking 211 * a page we already have locked. 212 * 213 * We must do any isize and inode updates 214 * before we unlock the pages. Otherwise we 215 * could end up racing with unlink. 216 */ 217 BTRFS_I(inode)->disk_i_size = inode->i_size; 218 ret = btrfs_update_inode(trans, root, inode); 219 220 return ret; 221 fail: 222 return err; 223 } 224 225 226 /* 227 * conditionally insert an inline extent into the file. This 228 * does the checks required to make sure the data is small enough 229 * to fit as an inline extent. 
230 */ 231 static noinline int cow_file_range_inline(struct btrfs_root *root, 232 struct inode *inode, u64 start, 233 u64 end, size_t compressed_size, 234 int compress_type, 235 struct page **compressed_pages) 236 { 237 struct btrfs_trans_handle *trans; 238 u64 isize = i_size_read(inode); 239 u64 actual_end = min(end + 1, isize); 240 u64 inline_len = actual_end - start; 241 u64 aligned_end = ALIGN(end, root->sectorsize); 242 u64 data_len = inline_len; 243 int ret; 244 struct btrfs_path *path; 245 int extent_inserted = 0; 246 u32 extent_item_size; 247 248 if (compressed_size) 249 data_len = compressed_size; 250 251 if (start > 0 || 252 actual_end > PAGE_CACHE_SIZE || 253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || 254 (!compressed_size && 255 (actual_end & (root->sectorsize - 1)) == 0) || 256 end + 1 < isize || 257 data_len > root->fs_info->max_inline) { 258 return 1; 259 } 260 261 path = btrfs_alloc_path(); 262 if (!path) 263 return -ENOMEM; 264 265 trans = btrfs_join_transaction(root); 266 if (IS_ERR(trans)) { 267 btrfs_free_path(path); 268 return PTR_ERR(trans); 269 } 270 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 271 272 if (compressed_size && compressed_pages) 273 extent_item_size = btrfs_file_extent_calc_inline_size( 274 compressed_size); 275 else 276 extent_item_size = btrfs_file_extent_calc_inline_size( 277 inline_len); 278 279 ret = __btrfs_drop_extents(trans, root, inode, path, 280 start, aligned_end, NULL, 281 1, 1, extent_item_size, &extent_inserted); 282 if (ret) { 283 btrfs_abort_transaction(trans, root, ret); 284 goto out; 285 } 286 287 if (isize > actual_end) 288 inline_len = min_t(u64, isize, actual_end); 289 ret = insert_inline_extent(trans, path, extent_inserted, 290 root, inode, start, 291 inline_len, compressed_size, 292 compress_type, compressed_pages); 293 if (ret && ret != -ENOSPC) { 294 btrfs_abort_transaction(trans, root, ret); 295 goto out; 296 } else if (ret == -ENOSPC) { 297 ret = 1; 298 goto out; 299 } 300 301 
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 302 btrfs_delalloc_release_metadata(inode, end + 1 - start); 303 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 304 out: 305 btrfs_free_path(path); 306 btrfs_end_transaction(trans, root); 307 return ret; 308 } 309 310 struct async_extent { 311 u64 start; 312 u64 ram_size; 313 u64 compressed_size; 314 struct page **pages; 315 unsigned long nr_pages; 316 int compress_type; 317 struct list_head list; 318 }; 319 320 struct async_cow { 321 struct inode *inode; 322 struct btrfs_root *root; 323 struct page *locked_page; 324 u64 start; 325 u64 end; 326 struct list_head extents; 327 struct btrfs_work work; 328 }; 329 330 static noinline int add_async_extent(struct async_cow *cow, 331 u64 start, u64 ram_size, 332 u64 compressed_size, 333 struct page **pages, 334 unsigned long nr_pages, 335 int compress_type) 336 { 337 struct async_extent *async_extent; 338 339 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 340 BUG_ON(!async_extent); /* -ENOMEM */ 341 async_extent->start = start; 342 async_extent->ram_size = ram_size; 343 async_extent->compressed_size = compressed_size; 344 async_extent->pages = pages; 345 async_extent->nr_pages = nr_pages; 346 async_extent->compress_type = compress_type; 347 list_add_tail(&async_extent->list, &cow->extents); 348 return 0; 349 } 350 351 static inline int inode_need_compress(struct inode *inode) 352 { 353 struct btrfs_root *root = BTRFS_I(inode)->root; 354 355 /* force compress */ 356 if (btrfs_test_opt(root, FORCE_COMPRESS)) 357 return 1; 358 /* bad compression ratios */ 359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 360 return 0; 361 if (btrfs_test_opt(root, COMPRESS) || 362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || 363 BTRFS_I(inode)->force_compress) 364 return 1; 365 return 0; 366 } 367 368 /* 369 * we create compressed extents in two phases. 
The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 *
 * @inode:       inode whose range [@start, @end] is being written
 * @locked_page: page that is redirtied on the uncompressed fallback when
 *               its offset lies inside the remaining range
 * @async_cow:   work item whose extents list receives the results
 * @num_added:   incremented once per async_extent queued here; it stays
 *               untouched when the whole range became an inline extent
 *               or inline creation failed
 *
 * The return value is whatever the last step left in ret (0, 1 from
 * cow_file_range_inline(), or an error code).  async_cow_start() in this
 * file ignores it and only looks at *@num_added.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;
	int redirty = 0;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < 16 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	/*
	 * skip compression for a small file range(<=blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if ((end - start + 1) <= blocksize &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	actual_end = min_t(u64, isize, end + 1);
again:
	/*
	 * Re-entered from the bottom of the function with an advanced start
	 * when one compressed extent did not cover the whole range.
	 */
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode)) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		/* a per-inode forced type overrides the fs-wide default */
		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			/* only the error case adds EXTENT_DO_ACCOUNTING */
			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags, PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			/*
			 * more of the range is left; the pages array now
			 * belongs to the queued async_extent, so forget it
			 * before going around again
			 */
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
		/* the label below is also jumped to from the early checks */
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		if (redirty)
			extent_range_redirty_for_io(inode, start, end);
		/* a NULL pages pointer marks the extent as uncompressed */
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return ret;

free_pages_out:
	/* inline path: drop the temporary compressed pages and the array */
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
641 */ 642 static noinline int submit_compressed_extents(struct inode *inode, 643 struct async_cow *async_cow) 644 { 645 struct async_extent *async_extent; 646 u64 alloc_hint = 0; 647 struct btrfs_key ins; 648 struct extent_map *em; 649 struct btrfs_root *root = BTRFS_I(inode)->root; 650 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 651 struct extent_io_tree *io_tree; 652 int ret = 0; 653 654 if (list_empty(&async_cow->extents)) 655 return 0; 656 657 again: 658 while (!list_empty(&async_cow->extents)) { 659 async_extent = list_entry(async_cow->extents.next, 660 struct async_extent, list); 661 list_del(&async_extent->list); 662 663 io_tree = &BTRFS_I(inode)->io_tree; 664 665 retry: 666 /* did the compression code fall back to uncompressed IO? */ 667 if (!async_extent->pages) { 668 int page_started = 0; 669 unsigned long nr_written = 0; 670 671 lock_extent(io_tree, async_extent->start, 672 async_extent->start + 673 async_extent->ram_size - 1); 674 675 /* allocate blocks */ 676 ret = cow_file_range(inode, async_cow->locked_page, 677 async_extent->start, 678 async_extent->start + 679 async_extent->ram_size - 1, 680 &page_started, &nr_written, 0); 681 682 /* JDM XXX */ 683 684 /* 685 * if page_started, cow_file_range inserted an 686 * inline extent and took care of all the unlocking 687 * and IO for us. Otherwise, we need to submit 688 * all those pages down to the drive. 
689 */ 690 if (!page_started && !ret) 691 extent_write_locked_range(io_tree, 692 inode, async_extent->start, 693 async_extent->start + 694 async_extent->ram_size - 1, 695 btrfs_get_extent, 696 WB_SYNC_ALL); 697 else if (ret) 698 unlock_page(async_cow->locked_page); 699 kfree(async_extent); 700 cond_resched(); 701 continue; 702 } 703 704 lock_extent(io_tree, async_extent->start, 705 async_extent->start + async_extent->ram_size - 1); 706 707 ret = btrfs_reserve_extent(root, 708 async_extent->compressed_size, 709 async_extent->compressed_size, 710 0, alloc_hint, &ins, 1, 1); 711 if (ret) { 712 int i; 713 714 for (i = 0; i < async_extent->nr_pages; i++) { 715 WARN_ON(async_extent->pages[i]->mapping); 716 page_cache_release(async_extent->pages[i]); 717 } 718 kfree(async_extent->pages); 719 async_extent->nr_pages = 0; 720 async_extent->pages = NULL; 721 722 if (ret == -ENOSPC) { 723 unlock_extent(io_tree, async_extent->start, 724 async_extent->start + 725 async_extent->ram_size - 1); 726 727 /* 728 * we need to redirty the pages if we decide to 729 * fallback to uncompressed IO, otherwise we 730 * will not submit these pages down to lower 731 * layers. 
732 */ 733 extent_range_redirty_for_io(inode, 734 async_extent->start, 735 async_extent->start + 736 async_extent->ram_size - 1); 737 738 goto retry; 739 } 740 goto out_free; 741 } 742 743 /* 744 * here we're doing allocation and writeback of the 745 * compressed pages 746 */ 747 btrfs_drop_extent_cache(inode, async_extent->start, 748 async_extent->start + 749 async_extent->ram_size - 1, 0); 750 751 em = alloc_extent_map(); 752 if (!em) { 753 ret = -ENOMEM; 754 goto out_free_reserve; 755 } 756 em->start = async_extent->start; 757 em->len = async_extent->ram_size; 758 em->orig_start = em->start; 759 em->mod_start = em->start; 760 em->mod_len = em->len; 761 762 em->block_start = ins.objectid; 763 em->block_len = ins.offset; 764 em->orig_block_len = ins.offset; 765 em->ram_bytes = async_extent->ram_size; 766 em->bdev = root->fs_info->fs_devices->latest_bdev; 767 em->compress_type = async_extent->compress_type; 768 set_bit(EXTENT_FLAG_PINNED, &em->flags); 769 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 770 em->generation = -1; 771 772 while (1) { 773 write_lock(&em_tree->lock); 774 ret = add_extent_mapping(em_tree, em, 1); 775 write_unlock(&em_tree->lock); 776 if (ret != -EEXIST) { 777 free_extent_map(em); 778 break; 779 } 780 btrfs_drop_extent_cache(inode, async_extent->start, 781 async_extent->start + 782 async_extent->ram_size - 1, 0); 783 } 784 785 if (ret) 786 goto out_free_reserve; 787 788 ret = btrfs_add_ordered_extent_compress(inode, 789 async_extent->start, 790 ins.objectid, 791 async_extent->ram_size, 792 ins.offset, 793 BTRFS_ORDERED_COMPRESSED, 794 async_extent->compress_type); 795 if (ret) { 796 btrfs_drop_extent_cache(inode, async_extent->start, 797 async_extent->start + 798 async_extent->ram_size - 1, 0); 799 goto out_free_reserve; 800 } 801 802 /* 803 * clear dirty, set writeback and unlock the pages. 
804 */ 805 extent_clear_unlock_delalloc(inode, async_extent->start, 806 async_extent->start + 807 async_extent->ram_size - 1, 808 NULL, EXTENT_LOCKED | EXTENT_DELALLOC, 809 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 810 PAGE_SET_WRITEBACK); 811 ret = btrfs_submit_compressed_write(inode, 812 async_extent->start, 813 async_extent->ram_size, 814 ins.objectid, 815 ins.offset, async_extent->pages, 816 async_extent->nr_pages); 817 alloc_hint = ins.objectid + ins.offset; 818 kfree(async_extent); 819 if (ret) 820 goto out; 821 cond_resched(); 822 } 823 ret = 0; 824 out: 825 return ret; 826 out_free_reserve: 827 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 828 out_free: 829 extent_clear_unlock_delalloc(inode, async_extent->start, 830 async_extent->start + 831 async_extent->ram_size - 1, 832 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 833 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 834 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 835 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 836 kfree(async_extent); 837 goto again; 838 } 839 840 static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 841 u64 num_bytes) 842 { 843 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 844 struct extent_map *em; 845 u64 alloc_hint = 0; 846 847 read_lock(&em_tree->lock); 848 em = search_extent_mapping(em_tree, start, num_bytes); 849 if (em) { 850 /* 851 * if block start isn't an actual block number then find the 852 * first block in this inode and use that as a hint. If that 853 * block is also bogus then just don't worry about it. 
854 */ 855 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 856 free_extent_map(em); 857 em = search_extent_mapping(em_tree, 0, 0); 858 if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 859 alloc_hint = em->block_start; 860 if (em) 861 free_extent_map(em); 862 } else { 863 alloc_hint = em->block_start; 864 free_extent_map(em); 865 } 866 } 867 read_unlock(&em_tree->lock); 868 869 return alloc_hint; 870 } 871 872 /* 873 * when extent_io.c finds a delayed allocation range in the file, 874 * the call backs end up in this code. The basic idea is to 875 * allocate extents on disk for the range, and create ordered data structs 876 * in ram to track those extents. 877 * 878 * locked_page is the page that writepage had locked already. We use 879 * it to make sure we don't do extra locks or unlocks. 880 * 881 * *page_started is set to one if we unlock locked_page and do everything 882 * required to start IO on it. It may be clean and already done with 883 * IO when we return. 884 */ 885 static noinline int cow_file_range(struct inode *inode, 886 struct page *locked_page, 887 u64 start, u64 end, int *page_started, 888 unsigned long *nr_written, 889 int unlock) 890 { 891 struct btrfs_root *root = BTRFS_I(inode)->root; 892 u64 alloc_hint = 0; 893 u64 num_bytes; 894 unsigned long ram_size; 895 u64 disk_num_bytes; 896 u64 cur_alloc_size; 897 u64 blocksize = root->sectorsize; 898 struct btrfs_key ins; 899 struct extent_map *em; 900 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 901 int ret = 0; 902 903 if (btrfs_is_free_space_inode(inode)) { 904 WARN_ON_ONCE(1); 905 ret = -EINVAL; 906 goto out_unlock; 907 } 908 909 num_bytes = ALIGN(end - start + 1, blocksize); 910 num_bytes = max(blocksize, num_bytes); 911 disk_num_bytes = num_bytes; 912 913 /* if this is a small write inside eof, kick off defrag */ 914 if (num_bytes < 64 * 1024 && 915 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 916 btrfs_add_inode_defrag(NULL, inode); 917 918 if (start == 0) { 919 /* 
lets try to make an inline extent */ 920 ret = cow_file_range_inline(root, inode, start, end, 0, 0, 921 NULL); 922 if (ret == 0) { 923 extent_clear_unlock_delalloc(inode, start, end, NULL, 924 EXTENT_LOCKED | EXTENT_DELALLOC | 925 EXTENT_DEFRAG, PAGE_UNLOCK | 926 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 927 PAGE_END_WRITEBACK); 928 929 *nr_written = *nr_written + 930 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 931 *page_started = 1; 932 goto out; 933 } else if (ret < 0) { 934 goto out_unlock; 935 } 936 } 937 938 BUG_ON(disk_num_bytes > 939 btrfs_super_total_bytes(root->fs_info->super_copy)); 940 941 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 942 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 943 944 while (disk_num_bytes > 0) { 945 unsigned long op; 946 947 cur_alloc_size = disk_num_bytes; 948 ret = btrfs_reserve_extent(root, cur_alloc_size, 949 root->sectorsize, 0, alloc_hint, 950 &ins, 1, 1); 951 if (ret < 0) 952 goto out_unlock; 953 954 em = alloc_extent_map(); 955 if (!em) { 956 ret = -ENOMEM; 957 goto out_reserve; 958 } 959 em->start = start; 960 em->orig_start = em->start; 961 ram_size = ins.offset; 962 em->len = ins.offset; 963 em->mod_start = em->start; 964 em->mod_len = em->len; 965 966 em->block_start = ins.objectid; 967 em->block_len = ins.offset; 968 em->orig_block_len = ins.offset; 969 em->ram_bytes = ram_size; 970 em->bdev = root->fs_info->fs_devices->latest_bdev; 971 set_bit(EXTENT_FLAG_PINNED, &em->flags); 972 em->generation = -1; 973 974 while (1) { 975 write_lock(&em_tree->lock); 976 ret = add_extent_mapping(em_tree, em, 1); 977 write_unlock(&em_tree->lock); 978 if (ret != -EEXIST) { 979 free_extent_map(em); 980 break; 981 } 982 btrfs_drop_extent_cache(inode, start, 983 start + ram_size - 1, 0); 984 } 985 if (ret) 986 goto out_reserve; 987 988 cur_alloc_size = ins.offset; 989 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 990 ram_size, cur_alloc_size, 0); 991 if (ret) 992 goto 
out_drop_extent_cache; 993 994 if (root->root_key.objectid == 995 BTRFS_DATA_RELOC_TREE_OBJECTID) { 996 ret = btrfs_reloc_clone_csums(inode, start, 997 cur_alloc_size); 998 if (ret) 999 goto out_drop_extent_cache; 1000 } 1001 1002 if (disk_num_bytes < cur_alloc_size) 1003 break; 1004 1005 /* we're not doing compressed IO, don't unlock the first 1006 * page (which the caller expects to stay locked), don't 1007 * clear any dirty bits and don't set any writeback bits 1008 * 1009 * Do set the Private2 bit so we know this page was properly 1010 * setup for writepage 1011 */ 1012 op = unlock ? PAGE_UNLOCK : 0; 1013 op |= PAGE_SET_PRIVATE2; 1014 1015 extent_clear_unlock_delalloc(inode, start, 1016 start + ram_size - 1, locked_page, 1017 EXTENT_LOCKED | EXTENT_DELALLOC, 1018 op); 1019 disk_num_bytes -= cur_alloc_size; 1020 num_bytes -= cur_alloc_size; 1021 alloc_hint = ins.objectid + ins.offset; 1022 start += cur_alloc_size; 1023 } 1024 out: 1025 return ret; 1026 1027 out_drop_extent_cache: 1028 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); 1029 out_reserve: 1030 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1031 out_unlock: 1032 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1033 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 1034 EXTENT_DELALLOC | EXTENT_DEFRAG, 1035 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 1036 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 1037 goto out; 1038 } 1039 1040 /* 1041 * work queue call back to started compression on a file and pages 1042 */ 1043 static noinline void async_cow_start(struct btrfs_work *work) 1044 { 1045 struct async_cow *async_cow; 1046 int num_added = 0; 1047 async_cow = container_of(work, struct async_cow, work); 1048 1049 compress_file_range(async_cow->inode, async_cow->locked_page, 1050 async_cow->start, async_cow->end, async_cow, 1051 &num_added); 1052 if (num_added == 0) { 1053 btrfs_add_delayed_iput(async_cow->inode); 1054 async_cow->inode = NULL; 1055 } 1056 } 1057 1058 /* 1059 * work queue 
call back to submit previously compressed pages 1060 */ 1061 static noinline void async_cow_submit(struct btrfs_work *work) 1062 { 1063 struct async_cow *async_cow; 1064 struct btrfs_root *root; 1065 unsigned long nr_pages; 1066 1067 async_cow = container_of(work, struct async_cow, work); 1068 1069 root = async_cow->root; 1070 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1071 PAGE_CACHE_SHIFT; 1072 1073 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 1074 5 * 1024 * 1024 && 1075 waitqueue_active(&root->fs_info->async_submit_wait)) 1076 wake_up(&root->fs_info->async_submit_wait); 1077 1078 if (async_cow->inode) 1079 submit_compressed_extents(async_cow->inode, async_cow); 1080 } 1081 1082 static noinline void async_cow_free(struct btrfs_work *work) 1083 { 1084 struct async_cow *async_cow; 1085 async_cow = container_of(work, struct async_cow, work); 1086 if (async_cow->inode) 1087 btrfs_add_delayed_iput(async_cow->inode); 1088 kfree(async_cow); 1089 } 1090 1091 static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1092 u64 start, u64 end, int *page_started, 1093 unsigned long *nr_written) 1094 { 1095 struct async_cow *async_cow; 1096 struct btrfs_root *root = BTRFS_I(inode)->root; 1097 unsigned long nr_pages; 1098 u64 cur_end; 1099 int limit = 10 * 1024 * 1024; 1100 1101 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1102 1, 0, NULL, GFP_NOFS); 1103 while (start < end) { 1104 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 1105 BUG_ON(!async_cow); /* -ENOMEM */ 1106 async_cow->inode = igrab(inode); 1107 async_cow->root = root; 1108 async_cow->locked_page = locked_page; 1109 async_cow->start = start; 1110 1111 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && 1112 !btrfs_test_opt(root, FORCE_COMPRESS)) 1113 cur_end = end; 1114 else 1115 cur_end = min(end, start + 512 * 1024 - 1); 1116 1117 async_cow->end = cur_end; 1118 INIT_LIST_HEAD(&async_cow->extents); 1119 1120 
btrfs_init_work(&async_cow->work, 1121 btrfs_delalloc_helper, 1122 async_cow_start, async_cow_submit, 1123 async_cow_free); 1124 1125 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1126 PAGE_CACHE_SHIFT; 1127 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1128 1129 btrfs_queue_work(root->fs_info->delalloc_workers, 1130 &async_cow->work); 1131 1132 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1133 wait_event(root->fs_info->async_submit_wait, 1134 (atomic_read(&root->fs_info->async_delalloc_pages) < 1135 limit)); 1136 } 1137 1138 while (atomic_read(&root->fs_info->async_submit_draining) && 1139 atomic_read(&root->fs_info->async_delalloc_pages)) { 1140 wait_event(root->fs_info->async_submit_wait, 1141 (atomic_read(&root->fs_info->async_delalloc_pages) == 1142 0)); 1143 } 1144 1145 *nr_written += nr_pages; 1146 start = cur_end + 1; 1147 } 1148 *page_started = 1; 1149 return 0; 1150 } 1151 1152 static noinline int csum_exist_in_range(struct btrfs_root *root, 1153 u64 bytenr, u64 num_bytes) 1154 { 1155 int ret; 1156 struct btrfs_ordered_sum *sums; 1157 LIST_HEAD(list); 1158 1159 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1160 bytenr + num_bytes - 1, &list, 0); 1161 if (ret == 0 && list_empty(&list)) 1162 return 0; 1163 1164 while (!list_empty(&list)) { 1165 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1166 list_del(&sums->list); 1167 kfree(sums); 1168 } 1169 return 1; 1170 } 1171 1172 /* 1173 * when nowcow writeback call back. This checks for snapshots or COW copies 1174 * of the extents that exist in the file, and COWs the file as required. 
1175 * 1176 * If no cow copies or snapshots exist, we write directly to the existing 1177 * blocks on disk 1178 */ 1179 static noinline int run_delalloc_nocow(struct inode *inode, 1180 struct page *locked_page, 1181 u64 start, u64 end, int *page_started, int force, 1182 unsigned long *nr_written) 1183 { 1184 struct btrfs_root *root = BTRFS_I(inode)->root; 1185 struct btrfs_trans_handle *trans; 1186 struct extent_buffer *leaf; 1187 struct btrfs_path *path; 1188 struct btrfs_file_extent_item *fi; 1189 struct btrfs_key found_key; 1190 u64 cow_start; 1191 u64 cur_offset; 1192 u64 extent_end; 1193 u64 extent_offset; 1194 u64 disk_bytenr; 1195 u64 num_bytes; 1196 u64 disk_num_bytes; 1197 u64 ram_bytes; 1198 int extent_type; 1199 int ret, err; 1200 int type; 1201 int nocow; 1202 int check_prev = 1; 1203 bool nolock; 1204 u64 ino = btrfs_ino(inode); 1205 1206 path = btrfs_alloc_path(); 1207 if (!path) { 1208 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1209 EXTENT_LOCKED | EXTENT_DELALLOC | 1210 EXTENT_DO_ACCOUNTING | 1211 EXTENT_DEFRAG, PAGE_UNLOCK | 1212 PAGE_CLEAR_DIRTY | 1213 PAGE_SET_WRITEBACK | 1214 PAGE_END_WRITEBACK); 1215 return -ENOMEM; 1216 } 1217 1218 nolock = btrfs_is_free_space_inode(inode); 1219 1220 if (nolock) 1221 trans = btrfs_join_transaction_nolock(root); 1222 else 1223 trans = btrfs_join_transaction(root); 1224 1225 if (IS_ERR(trans)) { 1226 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1227 EXTENT_LOCKED | EXTENT_DELALLOC | 1228 EXTENT_DO_ACCOUNTING | 1229 EXTENT_DEFRAG, PAGE_UNLOCK | 1230 PAGE_CLEAR_DIRTY | 1231 PAGE_SET_WRITEBACK | 1232 PAGE_END_WRITEBACK); 1233 btrfs_free_path(path); 1234 return PTR_ERR(trans); 1235 } 1236 1237 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1238 1239 cow_start = (u64)-1; 1240 cur_offset = start; 1241 while (1) { 1242 ret = btrfs_lookup_file_extent(trans, root, path, ino, 1243 cur_offset, 0); 1244 if (ret < 0) 1245 goto error; 1246 if (ret > 0 && path->slots[0] > 0 && 
check_prev) { 1247 leaf = path->nodes[0]; 1248 btrfs_item_key_to_cpu(leaf, &found_key, 1249 path->slots[0] - 1); 1250 if (found_key.objectid == ino && 1251 found_key.type == BTRFS_EXTENT_DATA_KEY) 1252 path->slots[0]--; 1253 } 1254 check_prev = 0; 1255 next_slot: 1256 leaf = path->nodes[0]; 1257 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1258 ret = btrfs_next_leaf(root, path); 1259 if (ret < 0) 1260 goto error; 1261 if (ret > 0) 1262 break; 1263 leaf = path->nodes[0]; 1264 } 1265 1266 nocow = 0; 1267 disk_bytenr = 0; 1268 num_bytes = 0; 1269 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1270 1271 if (found_key.objectid > ino || 1272 found_key.type > BTRFS_EXTENT_DATA_KEY || 1273 found_key.offset > end) 1274 break; 1275 1276 if (found_key.offset > cur_offset) { 1277 extent_end = found_key.offset; 1278 extent_type = 0; 1279 goto out_check; 1280 } 1281 1282 fi = btrfs_item_ptr(leaf, path->slots[0], 1283 struct btrfs_file_extent_item); 1284 extent_type = btrfs_file_extent_type(leaf, fi); 1285 1286 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1287 if (extent_type == BTRFS_FILE_EXTENT_REG || 1288 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1289 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1290 extent_offset = btrfs_file_extent_offset(leaf, fi); 1291 extent_end = found_key.offset + 1292 btrfs_file_extent_num_bytes(leaf, fi); 1293 disk_num_bytes = 1294 btrfs_file_extent_disk_num_bytes(leaf, fi); 1295 if (extent_end <= start) { 1296 path->slots[0]++; 1297 goto next_slot; 1298 } 1299 if (disk_bytenr == 0) 1300 goto out_check; 1301 if (btrfs_file_extent_compression(leaf, fi) || 1302 btrfs_file_extent_encryption(leaf, fi) || 1303 btrfs_file_extent_other_encoding(leaf, fi)) 1304 goto out_check; 1305 if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1306 goto out_check; 1307 if (btrfs_extent_readonly(root, disk_bytenr)) 1308 goto out_check; 1309 if (btrfs_cross_ref_exist(trans, root, ino, 1310 found_key.offset - 1311 extent_offset, 
disk_bytenr)) 1312 goto out_check; 1313 disk_bytenr += extent_offset; 1314 disk_bytenr += cur_offset - found_key.offset; 1315 num_bytes = min(end + 1, extent_end) - cur_offset; 1316 /* 1317 * if there are pending snapshots for this root, 1318 * we fall into common COW way. 1319 */ 1320 if (!nolock) { 1321 err = btrfs_start_nocow_write(root); 1322 if (!err) 1323 goto out_check; 1324 } 1325 /* 1326 * force cow if csum exists in the range. 1327 * this ensure that csum for a given extent are 1328 * either valid or do not exist. 1329 */ 1330 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1331 goto out_check; 1332 nocow = 1; 1333 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1334 extent_end = found_key.offset + 1335 btrfs_file_extent_inline_len(leaf, 1336 path->slots[0], fi); 1337 extent_end = ALIGN(extent_end, root->sectorsize); 1338 } else { 1339 BUG_ON(1); 1340 } 1341 out_check: 1342 if (extent_end <= start) { 1343 path->slots[0]++; 1344 if (!nolock && nocow) 1345 btrfs_end_nocow_write(root); 1346 goto next_slot; 1347 } 1348 if (!nocow) { 1349 if (cow_start == (u64)-1) 1350 cow_start = cur_offset; 1351 cur_offset = extent_end; 1352 if (cur_offset > end) 1353 break; 1354 path->slots[0]++; 1355 goto next_slot; 1356 } 1357 1358 btrfs_release_path(path); 1359 if (cow_start != (u64)-1) { 1360 ret = cow_file_range(inode, locked_page, 1361 cow_start, found_key.offset - 1, 1362 page_started, nr_written, 1); 1363 if (ret) { 1364 if (!nolock && nocow) 1365 btrfs_end_nocow_write(root); 1366 goto error; 1367 } 1368 cow_start = (u64)-1; 1369 } 1370 1371 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1372 struct extent_map *em; 1373 struct extent_map_tree *em_tree; 1374 em_tree = &BTRFS_I(inode)->extent_tree; 1375 em = alloc_extent_map(); 1376 BUG_ON(!em); /* -ENOMEM */ 1377 em->start = cur_offset; 1378 em->orig_start = found_key.offset - extent_offset; 1379 em->len = num_bytes; 1380 em->block_len = num_bytes; 1381 em->block_start = disk_bytenr; 1382 
em->orig_block_len = disk_num_bytes; 1383 em->ram_bytes = ram_bytes; 1384 em->bdev = root->fs_info->fs_devices->latest_bdev; 1385 em->mod_start = em->start; 1386 em->mod_len = em->len; 1387 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1388 set_bit(EXTENT_FLAG_FILLING, &em->flags); 1389 em->generation = -1; 1390 while (1) { 1391 write_lock(&em_tree->lock); 1392 ret = add_extent_mapping(em_tree, em, 1); 1393 write_unlock(&em_tree->lock); 1394 if (ret != -EEXIST) { 1395 free_extent_map(em); 1396 break; 1397 } 1398 btrfs_drop_extent_cache(inode, em->start, 1399 em->start + em->len - 1, 0); 1400 } 1401 type = BTRFS_ORDERED_PREALLOC; 1402 } else { 1403 type = BTRFS_ORDERED_NOCOW; 1404 } 1405 1406 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1407 num_bytes, num_bytes, type); 1408 BUG_ON(ret); /* -ENOMEM */ 1409 1410 if (root->root_key.objectid == 1411 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1412 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1413 num_bytes); 1414 if (ret) { 1415 if (!nolock && nocow) 1416 btrfs_end_nocow_write(root); 1417 goto error; 1418 } 1419 } 1420 1421 extent_clear_unlock_delalloc(inode, cur_offset, 1422 cur_offset + num_bytes - 1, 1423 locked_page, EXTENT_LOCKED | 1424 EXTENT_DELALLOC, PAGE_UNLOCK | 1425 PAGE_SET_PRIVATE2); 1426 if (!nolock && nocow) 1427 btrfs_end_nocow_write(root); 1428 cur_offset = extent_end; 1429 if (cur_offset > end) 1430 break; 1431 } 1432 btrfs_release_path(path); 1433 1434 if (cur_offset <= end && cow_start == (u64)-1) { 1435 cow_start = cur_offset; 1436 cur_offset = end; 1437 } 1438 1439 if (cow_start != (u64)-1) { 1440 ret = cow_file_range(inode, locked_page, cow_start, end, 1441 page_started, nr_written, 1); 1442 if (ret) 1443 goto error; 1444 } 1445 1446 error: 1447 err = btrfs_end_transaction(trans, root); 1448 if (!ret) 1449 ret = err; 1450 1451 if (ret && cur_offset < end) 1452 extent_clear_unlock_delalloc(inode, cur_offset, end, 1453 locked_page, EXTENT_LOCKED | 1454 EXTENT_DELALLOC | EXTENT_DEFRAG | 1455 
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | 1456 PAGE_CLEAR_DIRTY | 1457 PAGE_SET_WRITEBACK | 1458 PAGE_END_WRITEBACK); 1459 btrfs_free_path(path); 1460 return ret; 1461 } 1462 1463 static inline int need_force_cow(struct inode *inode, u64 start, u64 end) 1464 { 1465 1466 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 1467 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) 1468 return 0; 1469 1470 /* 1471 * @defrag_bytes is a hint value, no spinlock held here, 1472 * if is not zero, it means the file is defragging. 1473 * Force cow if given extent needs to be defragged. 1474 */ 1475 if (BTRFS_I(inode)->defrag_bytes && 1476 test_range_bit(&BTRFS_I(inode)->io_tree, start, end, 1477 EXTENT_DEFRAG, 0, NULL)) 1478 return 1; 1479 1480 return 0; 1481 } 1482 1483 /* 1484 * extent_io.c call back to do delayed allocation processing 1485 */ 1486 static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1487 u64 start, u64 end, int *page_started, 1488 unsigned long *nr_written) 1489 { 1490 int ret; 1491 int force_cow = need_force_cow(inode, start, end); 1492 1493 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { 1494 ret = run_delalloc_nocow(inode, locked_page, start, end, 1495 page_started, 1, nr_written); 1496 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { 1497 ret = run_delalloc_nocow(inode, locked_page, start, end, 1498 page_started, 0, nr_written); 1499 } else if (!inode_need_compress(inode)) { 1500 ret = cow_file_range(inode, locked_page, start, end, 1501 page_started, nr_written, 1); 1502 } else { 1503 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1504 &BTRFS_I(inode)->runtime_flags); 1505 ret = cow_file_range_async(inode, locked_page, start, end, 1506 page_started, nr_written); 1507 } 1508 return ret; 1509 } 1510 1511 static void btrfs_split_extent_hook(struct inode *inode, 1512 struct extent_state *orig, u64 split) 1513 { 1514 /* not delalloc, ignore it */ 1515 if (!(orig->state & EXTENT_DELALLOC)) 1516 return; 1517 
1518 spin_lock(&BTRFS_I(inode)->lock); 1519 BTRFS_I(inode)->outstanding_extents++; 1520 spin_unlock(&BTRFS_I(inode)->lock); 1521 } 1522 1523 /* 1524 * extent_io.c merge_extent_hook, used to track merged delayed allocation 1525 * extents so we can keep track of new extents that are just merged onto old 1526 * extents, such as when we are doing sequential writes, so we can properly 1527 * account for the metadata space we'll need. 1528 */ 1529 static void btrfs_merge_extent_hook(struct inode *inode, 1530 struct extent_state *new, 1531 struct extent_state *other) 1532 { 1533 /* not delalloc, ignore it */ 1534 if (!(other->state & EXTENT_DELALLOC)) 1535 return; 1536 1537 spin_lock(&BTRFS_I(inode)->lock); 1538 BTRFS_I(inode)->outstanding_extents--; 1539 spin_unlock(&BTRFS_I(inode)->lock); 1540 } 1541 1542 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 1543 struct inode *inode) 1544 { 1545 spin_lock(&root->delalloc_lock); 1546 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1547 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1548 &root->delalloc_inodes); 1549 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1550 &BTRFS_I(inode)->runtime_flags); 1551 root->nr_delalloc_inodes++; 1552 if (root->nr_delalloc_inodes == 1) { 1553 spin_lock(&root->fs_info->delalloc_root_lock); 1554 BUG_ON(!list_empty(&root->delalloc_root)); 1555 list_add_tail(&root->delalloc_root, 1556 &root->fs_info->delalloc_roots); 1557 spin_unlock(&root->fs_info->delalloc_root_lock); 1558 } 1559 } 1560 spin_unlock(&root->delalloc_lock); 1561 } 1562 1563 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 1564 struct inode *inode) 1565 { 1566 spin_lock(&root->delalloc_lock); 1567 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1568 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1569 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1570 &BTRFS_I(inode)->runtime_flags); 1571 root->nr_delalloc_inodes--; 1572 if (!root->nr_delalloc_inodes) { 1573 spin_lock(&root->fs_info->delalloc_root_lock); 
1574 BUG_ON(list_empty(&root->delalloc_root)); 1575 list_del_init(&root->delalloc_root); 1576 spin_unlock(&root->fs_info->delalloc_root_lock); 1577 } 1578 } 1579 spin_unlock(&root->delalloc_lock); 1580 } 1581 1582 /* 1583 * extent_io.c set_bit_hook, used to track delayed allocation 1584 * bytes in this file, and to maintain the list of inodes that 1585 * have pending delalloc work to be done. 1586 */ 1587 static void btrfs_set_bit_hook(struct inode *inode, 1588 struct extent_state *state, unsigned long *bits) 1589 { 1590 1591 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 1592 WARN_ON(1); 1593 /* 1594 * set_bit and clear bit hooks normally require _irqsave/restore 1595 * but in this case, we are only testing for the DELALLOC 1596 * bit, which is only set or cleared with irqs on 1597 */ 1598 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1599 struct btrfs_root *root = BTRFS_I(inode)->root; 1600 u64 len = state->end + 1 - state->start; 1601 bool do_list = !btrfs_is_free_space_inode(inode); 1602 1603 if (*bits & EXTENT_FIRST_DELALLOC) { 1604 *bits &= ~EXTENT_FIRST_DELALLOC; 1605 } else { 1606 spin_lock(&BTRFS_I(inode)->lock); 1607 BTRFS_I(inode)->outstanding_extents++; 1608 spin_unlock(&BTRFS_I(inode)->lock); 1609 } 1610 1611 __percpu_counter_add(&root->fs_info->delalloc_bytes, len, 1612 root->fs_info->delalloc_batch); 1613 spin_lock(&BTRFS_I(inode)->lock); 1614 BTRFS_I(inode)->delalloc_bytes += len; 1615 if (*bits & EXTENT_DEFRAG) 1616 BTRFS_I(inode)->defrag_bytes += len; 1617 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1618 &BTRFS_I(inode)->runtime_flags)) 1619 btrfs_add_delalloc_inodes(root, inode); 1620 spin_unlock(&BTRFS_I(inode)->lock); 1621 } 1622 } 1623 1624 /* 1625 * extent_io.c clear_bit_hook, see set_bit_hook for why 1626 */ 1627 static void btrfs_clear_bit_hook(struct inode *inode, 1628 struct extent_state *state, 1629 unsigned long *bits) 1630 { 1631 u64 len = state->end + 1 - state->start; 1632 1633 
spin_lock(&BTRFS_I(inode)->lock); 1634 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) 1635 BTRFS_I(inode)->defrag_bytes -= len; 1636 spin_unlock(&BTRFS_I(inode)->lock); 1637 1638 /* 1639 * set_bit and clear bit hooks normally require _irqsave/restore 1640 * but in this case, we are only testing for the DELALLOC 1641 * bit, which is only set or cleared with irqs on 1642 */ 1643 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1644 struct btrfs_root *root = BTRFS_I(inode)->root; 1645 bool do_list = !btrfs_is_free_space_inode(inode); 1646 1647 if (*bits & EXTENT_FIRST_DELALLOC) { 1648 *bits &= ~EXTENT_FIRST_DELALLOC; 1649 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1650 spin_lock(&BTRFS_I(inode)->lock); 1651 BTRFS_I(inode)->outstanding_extents--; 1652 spin_unlock(&BTRFS_I(inode)->lock); 1653 } 1654 1655 /* 1656 * We don't reserve metadata space for space cache inodes so we 1657 * don't need to call dellalloc_release_metadata if there is an 1658 * error. 
1659 */ 1660 if (*bits & EXTENT_DO_ACCOUNTING && 1661 root != root->fs_info->tree_root) 1662 btrfs_delalloc_release_metadata(inode, len); 1663 1664 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1665 && do_list && !(state->state & EXTENT_NORESERVE)) 1666 btrfs_free_reserved_data_space(inode, len); 1667 1668 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1669 root->fs_info->delalloc_batch); 1670 spin_lock(&BTRFS_I(inode)->lock); 1671 BTRFS_I(inode)->delalloc_bytes -= len; 1672 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1673 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1674 &BTRFS_I(inode)->runtime_flags)) 1675 btrfs_del_delalloc_inode(root, inode); 1676 spin_unlock(&BTRFS_I(inode)->lock); 1677 } 1678 } 1679 1680 /* 1681 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1682 * we don't create bios that span stripes or chunks 1683 */ 1684 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, 1685 size_t size, struct bio *bio, 1686 unsigned long bio_flags) 1687 { 1688 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1689 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 1690 u64 length = 0; 1691 u64 map_length; 1692 int ret; 1693 1694 if (bio_flags & EXTENT_BIO_COMPRESSED) 1695 return 0; 1696 1697 length = bio->bi_iter.bi_size; 1698 map_length = length; 1699 ret = btrfs_map_block(root->fs_info, rw, logical, 1700 &map_length, NULL, 0); 1701 /* Will always return 0 with map_multi == NULL */ 1702 BUG_ON(ret < 0); 1703 if (map_length < length + size) 1704 return 1; 1705 return 0; 1706 } 1707 1708 /* 1709 * in order to insert checksums into the metadata in large chunks, 1710 * we wait until bio submission time. All the pages in the bio are 1711 * checksummed and sums are attached onto the ordered extent record. 
1712 * 1713 * At IO completion time the cums attached on the ordered extent record 1714 * are inserted into the btree 1715 */ 1716 static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1717 struct bio *bio, int mirror_num, 1718 unsigned long bio_flags, 1719 u64 bio_offset) 1720 { 1721 struct btrfs_root *root = BTRFS_I(inode)->root; 1722 int ret = 0; 1723 1724 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1725 BUG_ON(ret); /* -ENOMEM */ 1726 return 0; 1727 } 1728 1729 /* 1730 * in order to insert checksums into the metadata in large chunks, 1731 * we wait until bio submission time. All the pages in the bio are 1732 * checksummed and sums are attached onto the ordered extent record. 1733 * 1734 * At IO completion time the cums attached on the ordered extent record 1735 * are inserted into the btree 1736 */ 1737 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1738 int mirror_num, unsigned long bio_flags, 1739 u64 bio_offset) 1740 { 1741 struct btrfs_root *root = BTRFS_I(inode)->root; 1742 int ret; 1743 1744 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); 1745 if (ret) 1746 bio_endio(bio, ret); 1747 return ret; 1748 } 1749 1750 /* 1751 * extent_io.c submission hook. 
This does the right thing for csum calculation 1752 * on write, or reading the csums from the tree before a read 1753 */ 1754 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1755 int mirror_num, unsigned long bio_flags, 1756 u64 bio_offset) 1757 { 1758 struct btrfs_root *root = BTRFS_I(inode)->root; 1759 int ret = 0; 1760 int skip_sum; 1761 int metadata = 0; 1762 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1763 1764 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1765 1766 if (btrfs_is_free_space_inode(inode)) 1767 metadata = 2; 1768 1769 if (!(rw & REQ_WRITE)) { 1770 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1771 if (ret) 1772 goto out; 1773 1774 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1775 ret = btrfs_submit_compressed_read(inode, bio, 1776 mirror_num, 1777 bio_flags); 1778 goto out; 1779 } else if (!skip_sum) { 1780 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1781 if (ret) 1782 goto out; 1783 } 1784 goto mapit; 1785 } else if (async && !skip_sum) { 1786 /* csum items have already been cloned */ 1787 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1788 goto mapit; 1789 /* we're doing a write, do the async checksumming */ 1790 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1791 inode, rw, bio, mirror_num, 1792 bio_flags, bio_offset, 1793 __btrfs_submit_bio_start, 1794 __btrfs_submit_bio_done); 1795 goto out; 1796 } else if (!skip_sum) { 1797 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1798 if (ret) 1799 goto out; 1800 } 1801 1802 mapit: 1803 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); 1804 1805 out: 1806 if (ret < 0) 1807 bio_endio(bio, ret); 1808 return ret; 1809 } 1810 1811 /* 1812 * given a list of ordered sums record them in the inode. This happens 1813 * at IO completion time based on sums calculated at bio submission time. 
1814 */ 1815 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 1816 struct inode *inode, u64 file_offset, 1817 struct list_head *list) 1818 { 1819 struct btrfs_ordered_sum *sum; 1820 1821 list_for_each_entry(sum, list, list) { 1822 trans->adding_csums = 1; 1823 btrfs_csum_file_blocks(trans, 1824 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1825 trans->adding_csums = 0; 1826 } 1827 return 0; 1828 } 1829 1830 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1831 struct extent_state **cached_state) 1832 { 1833 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); 1834 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1835 cached_state, GFP_NOFS); 1836 } 1837 1838 /* see btrfs_writepage_start_hook for details on why this is required */ 1839 struct btrfs_writepage_fixup { 1840 struct page *page; 1841 struct btrfs_work work; 1842 }; 1843 1844 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 1845 { 1846 struct btrfs_writepage_fixup *fixup; 1847 struct btrfs_ordered_extent *ordered; 1848 struct extent_state *cached_state = NULL; 1849 struct page *page; 1850 struct inode *inode; 1851 u64 page_start; 1852 u64 page_end; 1853 int ret; 1854 1855 fixup = container_of(work, struct btrfs_writepage_fixup, work); 1856 page = fixup->page; 1857 again: 1858 lock_page(page); 1859 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 1860 ClearPageChecked(page); 1861 goto out_page; 1862 } 1863 1864 inode = page->mapping->host; 1865 page_start = page_offset(page); 1866 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1867 1868 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, 1869 &cached_state); 1870 1871 /* already ordered? 
We're done */ 1872 if (PagePrivate2(page)) 1873 goto out; 1874 1875 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1876 if (ordered) { 1877 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 1878 page_end, &cached_state, GFP_NOFS); 1879 unlock_page(page); 1880 btrfs_start_ordered_extent(inode, ordered, 1); 1881 btrfs_put_ordered_extent(ordered); 1882 goto again; 1883 } 1884 1885 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1886 if (ret) { 1887 mapping_set_error(page->mapping, ret); 1888 end_extent_writepage(page, ret, page_start, page_end); 1889 ClearPageChecked(page); 1890 goto out; 1891 } 1892 1893 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1894 ClearPageChecked(page); 1895 set_page_dirty(page); 1896 out: 1897 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 1898 &cached_state, GFP_NOFS); 1899 out_page: 1900 unlock_page(page); 1901 page_cache_release(page); 1902 kfree(fixup); 1903 } 1904 1905 /* 1906 * There are a few paths in the higher layers of the kernel that directly 1907 * set the page dirty bit without asking the filesystem if it is a 1908 * good idea. This causes problems because we want to make sure COW 1909 * properly happens and the data=ordered rules are followed. 1910 * 1911 * In our case any range that doesn't have the ORDERED bit set 1912 * hasn't been properly setup for IO. We kick off an async process 1913 * to fix it up. The async helper will wait for ordered extents, set 1914 * the delalloc bit and make it safe to write the page. 
1915 */ 1916 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 1917 { 1918 struct inode *inode = page->mapping->host; 1919 struct btrfs_writepage_fixup *fixup; 1920 struct btrfs_root *root = BTRFS_I(inode)->root; 1921 1922 /* this page is properly in the ordered list */ 1923 if (TestClearPagePrivate2(page)) 1924 return 0; 1925 1926 if (PageChecked(page)) 1927 return -EAGAIN; 1928 1929 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 1930 if (!fixup) 1931 return -EAGAIN; 1932 1933 SetPageChecked(page); 1934 page_cache_get(page); 1935 btrfs_init_work(&fixup->work, btrfs_fixup_helper, 1936 btrfs_writepage_fixup_worker, NULL, NULL); 1937 fixup->page = page; 1938 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); 1939 return -EBUSY; 1940 } 1941 1942 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 1943 struct inode *inode, u64 file_pos, 1944 u64 disk_bytenr, u64 disk_num_bytes, 1945 u64 num_bytes, u64 ram_bytes, 1946 u8 compression, u8 encryption, 1947 u16 other_encoding, int extent_type) 1948 { 1949 struct btrfs_root *root = BTRFS_I(inode)->root; 1950 struct btrfs_file_extent_item *fi; 1951 struct btrfs_path *path; 1952 struct extent_buffer *leaf; 1953 struct btrfs_key ins; 1954 int extent_inserted = 0; 1955 int ret; 1956 1957 path = btrfs_alloc_path(); 1958 if (!path) 1959 return -ENOMEM; 1960 1961 /* 1962 * we may be replacing one extent in the tree with another. 1963 * The new extent is pinned in the extent map, and we don't want 1964 * to drop it from the cache until it is completely in the btree. 1965 * 1966 * So, tell btrfs_drop_extents to leave this extent in the cache. 1967 * the caller is expected to unpin it and allow it to be merged 1968 * with the others. 
1969 */ 1970 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, 1971 file_pos + num_bytes, NULL, 0, 1972 1, sizeof(*fi), &extent_inserted); 1973 if (ret) 1974 goto out; 1975 1976 if (!extent_inserted) { 1977 ins.objectid = btrfs_ino(inode); 1978 ins.offset = file_pos; 1979 ins.type = BTRFS_EXTENT_DATA_KEY; 1980 1981 path->leave_spinning = 1; 1982 ret = btrfs_insert_empty_item(trans, root, path, &ins, 1983 sizeof(*fi)); 1984 if (ret) 1985 goto out; 1986 } 1987 leaf = path->nodes[0]; 1988 fi = btrfs_item_ptr(leaf, path->slots[0], 1989 struct btrfs_file_extent_item); 1990 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1991 btrfs_set_file_extent_type(leaf, fi, extent_type); 1992 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 1993 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 1994 btrfs_set_file_extent_offset(leaf, fi, 0); 1995 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1996 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 1997 btrfs_set_file_extent_compression(leaf, fi, compression); 1998 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1999 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 2000 2001 btrfs_mark_buffer_dirty(leaf); 2002 btrfs_release_path(path); 2003 2004 inode_add_bytes(inode, num_bytes); 2005 2006 ins.objectid = disk_bytenr; 2007 ins.offset = disk_num_bytes; 2008 ins.type = BTRFS_EXTENT_ITEM_KEY; 2009 ret = btrfs_alloc_reserved_file_extent(trans, root, 2010 root->root_key.objectid, 2011 btrfs_ino(inode), file_pos, &ins); 2012 out: 2013 btrfs_free_path(path); 2014 2015 return ret; 2016 } 2017 2018 /* snapshot-aware defrag */ 2019 struct sa_defrag_extent_backref { 2020 struct rb_node node; 2021 struct old_sa_defrag_extent *old; 2022 u64 root_id; 2023 u64 inum; 2024 u64 file_pos; 2025 u64 extent_offset; 2026 u64 num_bytes; 2027 u64 generation; 2028 }; 2029 2030 struct old_sa_defrag_extent { 2031 struct list_head list; 2032 struct new_sa_defrag_extent *new; 2033 
2034 u64 extent_offset; 2035 u64 bytenr; 2036 u64 offset; 2037 u64 len; 2038 int count; 2039 }; 2040 2041 struct new_sa_defrag_extent { 2042 struct rb_root root; 2043 struct list_head head; 2044 struct btrfs_path *path; 2045 struct inode *inode; 2046 u64 file_pos; 2047 u64 len; 2048 u64 bytenr; 2049 u64 disk_len; 2050 u8 compress_type; 2051 }; 2052 2053 static int backref_comp(struct sa_defrag_extent_backref *b1, 2054 struct sa_defrag_extent_backref *b2) 2055 { 2056 if (b1->root_id < b2->root_id) 2057 return -1; 2058 else if (b1->root_id > b2->root_id) 2059 return 1; 2060 2061 if (b1->inum < b2->inum) 2062 return -1; 2063 else if (b1->inum > b2->inum) 2064 return 1; 2065 2066 if (b1->file_pos < b2->file_pos) 2067 return -1; 2068 else if (b1->file_pos > b2->file_pos) 2069 return 1; 2070 2071 /* 2072 * [------------------------------] ===> (a range of space) 2073 * |<--->| |<---->| =============> (fs/file tree A) 2074 * |<---------------------------->| ===> (fs/file tree B) 2075 * 2076 * A range of space can refer to two file extents in one tree while 2077 * refer to only one file extent in another tree. 2078 * 2079 * So we may process a disk offset more than one time(two extents in A) 2080 * and locate at the same extent(one extent in B), then insert two same 2081 * backrefs(both refer to the extent in B). 
2082 */ 2083 return 0; 2084 } 2085 2086 static void backref_insert(struct rb_root *root, 2087 struct sa_defrag_extent_backref *backref) 2088 { 2089 struct rb_node **p = &root->rb_node; 2090 struct rb_node *parent = NULL; 2091 struct sa_defrag_extent_backref *entry; 2092 int ret; 2093 2094 while (*p) { 2095 parent = *p; 2096 entry = rb_entry(parent, struct sa_defrag_extent_backref, node); 2097 2098 ret = backref_comp(backref, entry); 2099 if (ret < 0) 2100 p = &(*p)->rb_left; 2101 else 2102 p = &(*p)->rb_right; 2103 } 2104 2105 rb_link_node(&backref->node, parent, p); 2106 rb_insert_color(&backref->node, root); 2107 } 2108 2109 /* 2110 * Note the backref might has changed, and in this case we just return 0. 2111 */ 2112 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, 2113 void *ctx) 2114 { 2115 struct btrfs_file_extent_item *extent; 2116 struct btrfs_fs_info *fs_info; 2117 struct old_sa_defrag_extent *old = ctx; 2118 struct new_sa_defrag_extent *new = old->new; 2119 struct btrfs_path *path = new->path; 2120 struct btrfs_key key; 2121 struct btrfs_root *root; 2122 struct sa_defrag_extent_backref *backref; 2123 struct extent_buffer *leaf; 2124 struct inode *inode = new->inode; 2125 int slot; 2126 int ret; 2127 u64 extent_offset; 2128 u64 num_bytes; 2129 2130 if (BTRFS_I(inode)->root->root_key.objectid == root_id && 2131 inum == btrfs_ino(inode)) 2132 return 0; 2133 2134 key.objectid = root_id; 2135 key.type = BTRFS_ROOT_ITEM_KEY; 2136 key.offset = (u64)-1; 2137 2138 fs_info = BTRFS_I(inode)->root->fs_info; 2139 root = btrfs_read_fs_root_no_name(fs_info, &key); 2140 if (IS_ERR(root)) { 2141 if (PTR_ERR(root) == -ENOENT) 2142 return 0; 2143 WARN_ON(1); 2144 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", 2145 inum, offset, root_id); 2146 return PTR_ERR(root); 2147 } 2148 2149 key.objectid = inum; 2150 key.type = BTRFS_EXTENT_DATA_KEY; 2151 if (offset > (u64)-1 << 32) 2152 key.offset = 0; 2153 else 2154 key.offset = offset; 2155 2156 ret = 
btrfs_search_slot(NULL, root, &key, path, 0, 0); 2157 if (WARN_ON(ret < 0)) 2158 return ret; 2159 ret = 0; 2160 2161 while (1) { 2162 cond_resched(); 2163 2164 leaf = path->nodes[0]; 2165 slot = path->slots[0]; 2166 2167 if (slot >= btrfs_header_nritems(leaf)) { 2168 ret = btrfs_next_leaf(root, path); 2169 if (ret < 0) { 2170 goto out; 2171 } else if (ret > 0) { 2172 ret = 0; 2173 goto out; 2174 } 2175 continue; 2176 } 2177 2178 path->slots[0]++; 2179 2180 btrfs_item_key_to_cpu(leaf, &key, slot); 2181 2182 if (key.objectid > inum) 2183 goto out; 2184 2185 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) 2186 continue; 2187 2188 extent = btrfs_item_ptr(leaf, slot, 2189 struct btrfs_file_extent_item); 2190 2191 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) 2192 continue; 2193 2194 /* 2195 * 'offset' refers to the exact key.offset, 2196 * NOT the 'offset' field in btrfs_extent_data_ref, ie. 2197 * (key.offset - extent_offset). 2198 */ 2199 if (key.offset != offset) 2200 continue; 2201 2202 extent_offset = btrfs_file_extent_offset(leaf, extent); 2203 num_bytes = btrfs_file_extent_num_bytes(leaf, extent); 2204 2205 if (extent_offset >= old->extent_offset + old->offset + 2206 old->len || extent_offset + num_bytes <= 2207 old->extent_offset + old->offset) 2208 continue; 2209 break; 2210 } 2211 2212 backref = kmalloc(sizeof(*backref), GFP_NOFS); 2213 if (!backref) { 2214 ret = -ENOENT; 2215 goto out; 2216 } 2217 2218 backref->root_id = root_id; 2219 backref->inum = inum; 2220 backref->file_pos = offset; 2221 backref->num_bytes = num_bytes; 2222 backref->extent_offset = extent_offset; 2223 backref->generation = btrfs_file_extent_generation(leaf, extent); 2224 backref->old = old; 2225 backref_insert(&new->root, backref); 2226 old->count++; 2227 out: 2228 btrfs_release_path(path); 2229 WARN_ON(ret); 2230 return ret; 2231 } 2232 2233 static noinline bool record_extent_backrefs(struct btrfs_path *path, 2234 struct new_sa_defrag_extent *new) 2235 { 
2236 struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; 2237 struct old_sa_defrag_extent *old, *tmp; 2238 int ret; 2239 2240 new->path = path; 2241 2242 list_for_each_entry_safe(old, tmp, &new->head, list) { 2243 ret = iterate_inodes_from_logical(old->bytenr + 2244 old->extent_offset, fs_info, 2245 path, record_one_backref, 2246 old); 2247 if (ret < 0 && ret != -ENOENT) 2248 return false; 2249 2250 /* no backref to be processed for this extent */ 2251 if (!old->count) { 2252 list_del(&old->list); 2253 kfree(old); 2254 } 2255 } 2256 2257 if (list_empty(&new->head)) 2258 return false; 2259 2260 return true; 2261 } 2262 2263 static int relink_is_mergable(struct extent_buffer *leaf, 2264 struct btrfs_file_extent_item *fi, 2265 struct new_sa_defrag_extent *new) 2266 { 2267 if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) 2268 return 0; 2269 2270 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 2271 return 0; 2272 2273 if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) 2274 return 0; 2275 2276 if (btrfs_file_extent_encryption(leaf, fi) || 2277 btrfs_file_extent_other_encoding(leaf, fi)) 2278 return 0; 2279 2280 return 1; 2281 } 2282 2283 /* 2284 * Note the backref might has changed, and in this case we just return 0. 
2285 */ 2286 static noinline int relink_extent_backref(struct btrfs_path *path, 2287 struct sa_defrag_extent_backref *prev, 2288 struct sa_defrag_extent_backref *backref) 2289 { 2290 struct btrfs_file_extent_item *extent; 2291 struct btrfs_file_extent_item *item; 2292 struct btrfs_ordered_extent *ordered; 2293 struct btrfs_trans_handle *trans; 2294 struct btrfs_fs_info *fs_info; 2295 struct btrfs_root *root; 2296 struct btrfs_key key; 2297 struct extent_buffer *leaf; 2298 struct old_sa_defrag_extent *old = backref->old; 2299 struct new_sa_defrag_extent *new = old->new; 2300 struct inode *src_inode = new->inode; 2301 struct inode *inode; 2302 struct extent_state *cached = NULL; 2303 int ret = 0; 2304 u64 start; 2305 u64 len; 2306 u64 lock_start; 2307 u64 lock_end; 2308 bool merge = false; 2309 int index; 2310 2311 if (prev && prev->root_id == backref->root_id && 2312 prev->inum == backref->inum && 2313 prev->file_pos + prev->num_bytes == backref->file_pos) 2314 merge = true; 2315 2316 /* step 1: get root */ 2317 key.objectid = backref->root_id; 2318 key.type = BTRFS_ROOT_ITEM_KEY; 2319 key.offset = (u64)-1; 2320 2321 fs_info = BTRFS_I(src_inode)->root->fs_info; 2322 index = srcu_read_lock(&fs_info->subvol_srcu); 2323 2324 root = btrfs_read_fs_root_no_name(fs_info, &key); 2325 if (IS_ERR(root)) { 2326 srcu_read_unlock(&fs_info->subvol_srcu, index); 2327 if (PTR_ERR(root) == -ENOENT) 2328 return 0; 2329 return PTR_ERR(root); 2330 } 2331 2332 if (btrfs_root_readonly(root)) { 2333 srcu_read_unlock(&fs_info->subvol_srcu, index); 2334 return 0; 2335 } 2336 2337 /* step 2: get inode */ 2338 key.objectid = backref->inum; 2339 key.type = BTRFS_INODE_ITEM_KEY; 2340 key.offset = 0; 2341 2342 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 2343 if (IS_ERR(inode)) { 2344 srcu_read_unlock(&fs_info->subvol_srcu, index); 2345 return 0; 2346 } 2347 2348 srcu_read_unlock(&fs_info->subvol_srcu, index); 2349 2350 /* step 3: relink backref */ 2351 lock_start = backref->file_pos; 2352 
lock_end = backref->file_pos + backref->num_bytes - 1; 2353 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2354 0, &cached); 2355 2356 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); 2357 if (ordered) { 2358 btrfs_put_ordered_extent(ordered); 2359 goto out_unlock; 2360 } 2361 2362 trans = btrfs_join_transaction(root); 2363 if (IS_ERR(trans)) { 2364 ret = PTR_ERR(trans); 2365 goto out_unlock; 2366 } 2367 2368 key.objectid = backref->inum; 2369 key.type = BTRFS_EXTENT_DATA_KEY; 2370 key.offset = backref->file_pos; 2371 2372 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2373 if (ret < 0) { 2374 goto out_free_path; 2375 } else if (ret > 0) { 2376 ret = 0; 2377 goto out_free_path; 2378 } 2379 2380 extent = btrfs_item_ptr(path->nodes[0], path->slots[0], 2381 struct btrfs_file_extent_item); 2382 2383 if (btrfs_file_extent_generation(path->nodes[0], extent) != 2384 backref->generation) 2385 goto out_free_path; 2386 2387 btrfs_release_path(path); 2388 2389 start = backref->file_pos; 2390 if (backref->extent_offset < old->extent_offset + old->offset) 2391 start += old->extent_offset + old->offset - 2392 backref->extent_offset; 2393 2394 len = min(backref->extent_offset + backref->num_bytes, 2395 old->extent_offset + old->offset + old->len); 2396 len -= max(backref->extent_offset, old->extent_offset + old->offset); 2397 2398 ret = btrfs_drop_extents(trans, root, inode, start, 2399 start + len, 1); 2400 if (ret) 2401 goto out_free_path; 2402 again: 2403 key.objectid = btrfs_ino(inode); 2404 key.type = BTRFS_EXTENT_DATA_KEY; 2405 key.offset = start; 2406 2407 path->leave_spinning = 1; 2408 if (merge) { 2409 struct btrfs_file_extent_item *fi; 2410 u64 extent_len; 2411 struct btrfs_key found_key; 2412 2413 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2414 if (ret < 0) 2415 goto out_free_path; 2416 2417 path->slots[0]--; 2418 leaf = path->nodes[0]; 2419 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2420 2421 fi = 
btrfs_item_ptr(leaf, path->slots[0], 2422 struct btrfs_file_extent_item); 2423 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 2424 2425 if (extent_len + found_key.offset == start && 2426 relink_is_mergable(leaf, fi, new)) { 2427 btrfs_set_file_extent_num_bytes(leaf, fi, 2428 extent_len + len); 2429 btrfs_mark_buffer_dirty(leaf); 2430 inode_add_bytes(inode, len); 2431 2432 ret = 1; 2433 goto out_free_path; 2434 } else { 2435 merge = false; 2436 btrfs_release_path(path); 2437 goto again; 2438 } 2439 } 2440 2441 ret = btrfs_insert_empty_item(trans, root, path, &key, 2442 sizeof(*extent)); 2443 if (ret) { 2444 btrfs_abort_transaction(trans, root, ret); 2445 goto out_free_path; 2446 } 2447 2448 leaf = path->nodes[0]; 2449 item = btrfs_item_ptr(leaf, path->slots[0], 2450 struct btrfs_file_extent_item); 2451 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); 2452 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); 2453 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); 2454 btrfs_set_file_extent_num_bytes(leaf, item, len); 2455 btrfs_set_file_extent_ram_bytes(leaf, item, new->len); 2456 btrfs_set_file_extent_generation(leaf, item, trans->transid); 2457 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 2458 btrfs_set_file_extent_compression(leaf, item, new->compress_type); 2459 btrfs_set_file_extent_encryption(leaf, item, 0); 2460 btrfs_set_file_extent_other_encoding(leaf, item, 0); 2461 2462 btrfs_mark_buffer_dirty(leaf); 2463 inode_add_bytes(inode, len); 2464 btrfs_release_path(path); 2465 2466 ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2467 new->disk_len, 0, 2468 backref->root_id, backref->inum, 2469 new->file_pos, 0); /* start - extent_offset */ 2470 if (ret) { 2471 btrfs_abort_transaction(trans, root, ret); 2472 goto out_free_path; 2473 } 2474 2475 ret = 1; 2476 out_free_path: 2477 btrfs_release_path(path); 2478 path->leave_spinning = 0; 2479 btrfs_end_transaction(trans, root); 2480 out_unlock: 2481 
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2482 &cached, GFP_NOFS); 2483 iput(inode); 2484 return ret; 2485 } 2486 2487 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) 2488 { 2489 struct old_sa_defrag_extent *old, *tmp; 2490 2491 if (!new) 2492 return; 2493 2494 list_for_each_entry_safe(old, tmp, &new->head, list) { 2495 list_del(&old->list); 2496 kfree(old); 2497 } 2498 kfree(new); 2499 } 2500 2501 static void relink_file_extents(struct new_sa_defrag_extent *new) 2502 { 2503 struct btrfs_path *path; 2504 struct sa_defrag_extent_backref *backref; 2505 struct sa_defrag_extent_backref *prev = NULL; 2506 struct inode *inode; 2507 struct btrfs_root *root; 2508 struct rb_node *node; 2509 int ret; 2510 2511 inode = new->inode; 2512 root = BTRFS_I(inode)->root; 2513 2514 path = btrfs_alloc_path(); 2515 if (!path) 2516 return; 2517 2518 if (!record_extent_backrefs(path, new)) { 2519 btrfs_free_path(path); 2520 goto out; 2521 } 2522 btrfs_release_path(path); 2523 2524 while (1) { 2525 node = rb_first(&new->root); 2526 if (!node) 2527 break; 2528 rb_erase(node, &new->root); 2529 2530 backref = rb_entry(node, struct sa_defrag_extent_backref, node); 2531 2532 ret = relink_extent_backref(path, prev, backref); 2533 WARN_ON(ret < 0); 2534 2535 kfree(prev); 2536 2537 if (ret == 1) 2538 prev = backref; 2539 else 2540 prev = NULL; 2541 cond_resched(); 2542 } 2543 kfree(prev); 2544 2545 btrfs_free_path(path); 2546 out: 2547 free_sa_defrag_extent(new); 2548 2549 atomic_dec(&root->fs_info->defrag_running); 2550 wake_up(&root->fs_info->transaction_wait); 2551 } 2552 2553 static struct new_sa_defrag_extent * 2554 record_old_file_extents(struct inode *inode, 2555 struct btrfs_ordered_extent *ordered) 2556 { 2557 struct btrfs_root *root = BTRFS_I(inode)->root; 2558 struct btrfs_path *path; 2559 struct btrfs_key key; 2560 struct old_sa_defrag_extent *old; 2561 struct new_sa_defrag_extent *new; 2562 int ret; 2563 2564 new = kmalloc(sizeof(*new), 
GFP_NOFS); 2565 if (!new) 2566 return NULL; 2567 2568 new->inode = inode; 2569 new->file_pos = ordered->file_offset; 2570 new->len = ordered->len; 2571 new->bytenr = ordered->start; 2572 new->disk_len = ordered->disk_len; 2573 new->compress_type = ordered->compress_type; 2574 new->root = RB_ROOT; 2575 INIT_LIST_HEAD(&new->head); 2576 2577 path = btrfs_alloc_path(); 2578 if (!path) 2579 goto out_kfree; 2580 2581 key.objectid = btrfs_ino(inode); 2582 key.type = BTRFS_EXTENT_DATA_KEY; 2583 key.offset = new->file_pos; 2584 2585 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2586 if (ret < 0) 2587 goto out_free_path; 2588 if (ret > 0 && path->slots[0] > 0) 2589 path->slots[0]--; 2590 2591 /* find out all the old extents for the file range */ 2592 while (1) { 2593 struct btrfs_file_extent_item *extent; 2594 struct extent_buffer *l; 2595 int slot; 2596 u64 num_bytes; 2597 u64 offset; 2598 u64 end; 2599 u64 disk_bytenr; 2600 u64 extent_offset; 2601 2602 l = path->nodes[0]; 2603 slot = path->slots[0]; 2604 2605 if (slot >= btrfs_header_nritems(l)) { 2606 ret = btrfs_next_leaf(root, path); 2607 if (ret < 0) 2608 goto out_free_path; 2609 else if (ret > 0) 2610 break; 2611 continue; 2612 } 2613 2614 btrfs_item_key_to_cpu(l, &key, slot); 2615 2616 if (key.objectid != btrfs_ino(inode)) 2617 break; 2618 if (key.type != BTRFS_EXTENT_DATA_KEY) 2619 break; 2620 if (key.offset >= new->file_pos + new->len) 2621 break; 2622 2623 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); 2624 2625 num_bytes = btrfs_file_extent_num_bytes(l, extent); 2626 if (key.offset + num_bytes < new->file_pos) 2627 goto next; 2628 2629 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); 2630 if (!disk_bytenr) 2631 goto next; 2632 2633 extent_offset = btrfs_file_extent_offset(l, extent); 2634 2635 old = kmalloc(sizeof(*old), GFP_NOFS); 2636 if (!old) 2637 goto out_free_path; 2638 2639 offset = max(new->file_pos, key.offset); 2640 end = min(new->file_pos + new->len, key.offset + 
num_bytes); 2641 2642 old->bytenr = disk_bytenr; 2643 old->extent_offset = extent_offset; 2644 old->offset = offset - key.offset; 2645 old->len = end - offset; 2646 old->new = new; 2647 old->count = 0; 2648 list_add_tail(&old->list, &new->head); 2649 next: 2650 path->slots[0]++; 2651 cond_resched(); 2652 } 2653 2654 btrfs_free_path(path); 2655 atomic_inc(&root->fs_info->defrag_running); 2656 2657 return new; 2658 2659 out_free_path: 2660 btrfs_free_path(path); 2661 out_kfree: 2662 free_sa_defrag_extent(new); 2663 return NULL; 2664 } 2665 2666 static void btrfs_release_delalloc_bytes(struct btrfs_root *root, 2667 u64 start, u64 len) 2668 { 2669 struct btrfs_block_group_cache *cache; 2670 2671 cache = btrfs_lookup_block_group(root->fs_info, start); 2672 ASSERT(cache); 2673 2674 spin_lock(&cache->lock); 2675 cache->delalloc_bytes -= len; 2676 spin_unlock(&cache->lock); 2677 2678 btrfs_put_block_group(cache); 2679 } 2680 2681 /* as ordered data IO finishes, this gets called so we can finish 2682 * an ordered extent if the range of bytes in the file it covers are 2683 * fully written. 
2684 */ 2685 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2686 { 2687 struct inode *inode = ordered_extent->inode; 2688 struct btrfs_root *root = BTRFS_I(inode)->root; 2689 struct btrfs_trans_handle *trans = NULL; 2690 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2691 struct extent_state *cached_state = NULL; 2692 struct new_sa_defrag_extent *new = NULL; 2693 int compress_type = 0; 2694 int ret = 0; 2695 u64 logical_len = ordered_extent->len; 2696 bool nolock; 2697 bool truncated = false; 2698 2699 nolock = btrfs_is_free_space_inode(inode); 2700 2701 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2702 ret = -EIO; 2703 goto out; 2704 } 2705 2706 btrfs_free_io_failure_record(inode, ordered_extent->file_offset, 2707 ordered_extent->file_offset + 2708 ordered_extent->len - 1); 2709 2710 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2711 truncated = true; 2712 logical_len = ordered_extent->truncated_len; 2713 /* Truncated the entire extent, don't bother adding */ 2714 if (!logical_len) 2715 goto out; 2716 } 2717 2718 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2719 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2720 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2721 if (nolock) 2722 trans = btrfs_join_transaction_nolock(root); 2723 else 2724 trans = btrfs_join_transaction(root); 2725 if (IS_ERR(trans)) { 2726 ret = PTR_ERR(trans); 2727 trans = NULL; 2728 goto out; 2729 } 2730 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2731 ret = btrfs_update_inode_fallback(trans, root, inode); 2732 if (ret) /* -ENOMEM or corruption */ 2733 btrfs_abort_transaction(trans, root, ret); 2734 goto out; 2735 } 2736 2737 lock_extent_bits(io_tree, ordered_extent->file_offset, 2738 ordered_extent->file_offset + ordered_extent->len - 1, 2739 0, &cached_state); 2740 2741 ret = test_range_bit(io_tree, ordered_extent->file_offset, 2742 ordered_extent->file_offset + 
ordered_extent->len - 1, 2743 EXTENT_DEFRAG, 1, cached_state); 2744 if (ret) { 2745 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2746 if (0 && last_snapshot >= BTRFS_I(inode)->generation) 2747 /* the inode is shared */ 2748 new = record_old_file_extents(inode, ordered_extent); 2749 2750 clear_extent_bit(io_tree, ordered_extent->file_offset, 2751 ordered_extent->file_offset + ordered_extent->len - 1, 2752 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); 2753 } 2754 2755 if (nolock) 2756 trans = btrfs_join_transaction_nolock(root); 2757 else 2758 trans = btrfs_join_transaction(root); 2759 if (IS_ERR(trans)) { 2760 ret = PTR_ERR(trans); 2761 trans = NULL; 2762 goto out_unlock; 2763 } 2764 2765 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2766 2767 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2768 compress_type = ordered_extent->compress_type; 2769 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2770 BUG_ON(compress_type); 2771 ret = btrfs_mark_extent_written(trans, inode, 2772 ordered_extent->file_offset, 2773 ordered_extent->file_offset + 2774 logical_len); 2775 } else { 2776 BUG_ON(root == root->fs_info->tree_root); 2777 ret = insert_reserved_file_extent(trans, inode, 2778 ordered_extent->file_offset, 2779 ordered_extent->start, 2780 ordered_extent->disk_len, 2781 logical_len, logical_len, 2782 compress_type, 0, 0, 2783 BTRFS_FILE_EXTENT_REG); 2784 if (!ret) 2785 btrfs_release_delalloc_bytes(root, 2786 ordered_extent->start, 2787 ordered_extent->disk_len); 2788 } 2789 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 2790 ordered_extent->file_offset, ordered_extent->len, 2791 trans->transid); 2792 if (ret < 0) { 2793 btrfs_abort_transaction(trans, root, ret); 2794 goto out_unlock; 2795 } 2796 2797 add_pending_csums(trans, inode, ordered_extent->file_offset, 2798 &ordered_extent->list); 2799 2800 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2801 ret = btrfs_update_inode_fallback(trans, root, 
inode); 2802 if (ret) { /* -ENOMEM or corruption */ 2803 btrfs_abort_transaction(trans, root, ret); 2804 goto out_unlock; 2805 } 2806 ret = 0; 2807 out_unlock: 2808 unlock_extent_cached(io_tree, ordered_extent->file_offset, 2809 ordered_extent->file_offset + 2810 ordered_extent->len - 1, &cached_state, GFP_NOFS); 2811 out: 2812 if (root != root->fs_info->tree_root) 2813 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 2814 if (trans) 2815 btrfs_end_transaction(trans, root); 2816 2817 if (ret || truncated) { 2818 u64 start, end; 2819 2820 if (truncated) 2821 start = ordered_extent->file_offset + logical_len; 2822 else 2823 start = ordered_extent->file_offset; 2824 end = ordered_extent->file_offset + ordered_extent->len - 1; 2825 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); 2826 2827 /* Drop the cache for the part of the extent we didn't write. */ 2828 btrfs_drop_extent_cache(inode, start, end, 0); 2829 2830 /* 2831 * If the ordered extent had an IOERR or something else went 2832 * wrong we need to return the space for this ordered extent 2833 * back to the allocator. We only free the extent in the 2834 * truncated case if we didn't write out the extent at all. 2835 */ 2836 if ((ret || !logical_len) && 2837 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2838 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 2839 btrfs_free_reserved_extent(root, ordered_extent->start, 2840 ordered_extent->disk_len, 1); 2841 } 2842 2843 2844 /* 2845 * This needs to be done to make sure anybody waiting knows we are done 2846 * updating everything for this ordered extent. 
2847 */ 2848 btrfs_remove_ordered_extent(inode, ordered_extent); 2849 2850 /* for snapshot-aware defrag */ 2851 if (new) { 2852 if (ret) { 2853 free_sa_defrag_extent(new); 2854 atomic_dec(&root->fs_info->defrag_running); 2855 } else { 2856 relink_file_extents(new); 2857 } 2858 } 2859 2860 /* once for us */ 2861 btrfs_put_ordered_extent(ordered_extent); 2862 /* once for the tree */ 2863 btrfs_put_ordered_extent(ordered_extent); 2864 2865 return ret; 2866 } 2867 2868 static void finish_ordered_fn(struct btrfs_work *work) 2869 { 2870 struct btrfs_ordered_extent *ordered_extent; 2871 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 2872 btrfs_finish_ordered_io(ordered_extent); 2873 } 2874 2875 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 2876 struct extent_state *state, int uptodate) 2877 { 2878 struct inode *inode = page->mapping->host; 2879 struct btrfs_root *root = BTRFS_I(inode)->root; 2880 struct btrfs_ordered_extent *ordered_extent = NULL; 2881 struct btrfs_workqueue *wq; 2882 btrfs_work_func_t func; 2883 2884 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2885 2886 ClearPagePrivate2(page); 2887 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2888 end - start + 1, uptodate)) 2889 return 0; 2890 2891 if (btrfs_is_free_space_inode(inode)) { 2892 wq = root->fs_info->endio_freespace_worker; 2893 func = btrfs_freespace_write_helper; 2894 } else { 2895 wq = root->fs_info->endio_write_workers; 2896 func = btrfs_endio_write_helper; 2897 } 2898 2899 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, 2900 NULL); 2901 btrfs_queue_work(wq, &ordered_extent->work); 2902 2903 return 0; 2904 } 2905 2906 static int __readpage_endio_check(struct inode *inode, 2907 struct btrfs_io_bio *io_bio, 2908 int icsum, struct page *page, 2909 int pgoff, u64 start, size_t len) 2910 { 2911 char *kaddr; 2912 u32 csum_expected; 2913 u32 csum = ~(u32)0; 2914 static 
DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 2915 DEFAULT_RATELIMIT_BURST); 2916 2917 csum_expected = *(((u32 *)io_bio->csum) + icsum); 2918 2919 kaddr = kmap_atomic(page); 2920 csum = btrfs_csum_data(kaddr + pgoff, csum, len); 2921 btrfs_csum_final(csum, (char *)&csum); 2922 if (csum != csum_expected) 2923 goto zeroit; 2924 2925 kunmap_atomic(kaddr); 2926 return 0; 2927 zeroit: 2928 if (__ratelimit(&_rs)) 2929 btrfs_info(BTRFS_I(inode)->root->fs_info, 2930 "csum failed ino %llu off %llu csum %u expected csum %u", 2931 btrfs_ino(inode), start, csum, csum_expected); 2932 memset(kaddr + pgoff, 1, len); 2933 flush_dcache_page(page); 2934 kunmap_atomic(kaddr); 2935 if (csum_expected == 0) 2936 return 0; 2937 return -EIO; 2938 } 2939 2940 /* 2941 * when reads are done, we need to check csums to verify the data is correct 2942 * if there's a match, we allow the bio to finish. If not, the code in 2943 * extent_io.c will try to find good copies for us. 2944 */ 2945 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 2946 u64 phy_offset, struct page *page, 2947 u64 start, u64 end, int mirror) 2948 { 2949 size_t offset = start - page_offset(page); 2950 struct inode *inode = page->mapping->host; 2951 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2952 struct btrfs_root *root = BTRFS_I(inode)->root; 2953 2954 if (PageChecked(page)) { 2955 ClearPageChecked(page); 2956 return 0; 2957 } 2958 2959 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2960 return 0; 2961 2962 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2963 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2964 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2965 GFP_NOFS); 2966 return 0; 2967 } 2968 2969 phy_offset >>= inode->i_sb->s_blocksize_bits; 2970 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, 2971 start, (size_t)(end - start + 1)); 2972 } 2973 2974 struct delayed_iput { 2975 struct list_head list; 2976 
struct inode *inode; 2977 }; 2978 2979 /* JDM: If this is fs-wide, why can't we add a pointer to 2980 * btrfs_inode instead and avoid the allocation? */ 2981 void btrfs_add_delayed_iput(struct inode *inode) 2982 { 2983 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2984 struct delayed_iput *delayed; 2985 2986 if (atomic_add_unless(&inode->i_count, -1, 1)) 2987 return; 2988 2989 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2990 delayed->inode = inode; 2991 2992 spin_lock(&fs_info->delayed_iput_lock); 2993 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2994 spin_unlock(&fs_info->delayed_iput_lock); 2995 } 2996 2997 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2998 { 2999 LIST_HEAD(list); 3000 struct btrfs_fs_info *fs_info = root->fs_info; 3001 struct delayed_iput *delayed; 3002 int empty; 3003 3004 spin_lock(&fs_info->delayed_iput_lock); 3005 empty = list_empty(&fs_info->delayed_iputs); 3006 spin_unlock(&fs_info->delayed_iput_lock); 3007 if (empty) 3008 return; 3009 3010 spin_lock(&fs_info->delayed_iput_lock); 3011 list_splice_init(&fs_info->delayed_iputs, &list); 3012 spin_unlock(&fs_info->delayed_iput_lock); 3013 3014 while (!list_empty(&list)) { 3015 delayed = list_entry(list.next, struct delayed_iput, list); 3016 list_del(&delayed->list); 3017 iput(delayed->inode); 3018 kfree(delayed); 3019 } 3020 } 3021 3022 /* 3023 * This is called in transaction commit time. If there are no orphan 3024 * files in the subvolume, it removes orphan item and frees block_rsv 3025 * structure. 
3026 */ 3027 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 3028 struct btrfs_root *root) 3029 { 3030 struct btrfs_block_rsv *block_rsv; 3031 int ret; 3032 3033 if (atomic_read(&root->orphan_inodes) || 3034 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 3035 return; 3036 3037 spin_lock(&root->orphan_lock); 3038 if (atomic_read(&root->orphan_inodes)) { 3039 spin_unlock(&root->orphan_lock); 3040 return; 3041 } 3042 3043 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 3044 spin_unlock(&root->orphan_lock); 3045 return; 3046 } 3047 3048 block_rsv = root->orphan_block_rsv; 3049 root->orphan_block_rsv = NULL; 3050 spin_unlock(&root->orphan_lock); 3051 3052 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && 3053 btrfs_root_refs(&root->root_item) > 0) { 3054 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 3055 root->root_key.objectid); 3056 if (ret) 3057 btrfs_abort_transaction(trans, root, ret); 3058 else 3059 clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, 3060 &root->state); 3061 } 3062 3063 if (block_rsv) { 3064 WARN_ON(block_rsv->size > 0); 3065 btrfs_free_block_rsv(root, block_rsv); 3066 } 3067 } 3068 3069 /* 3070 * This creates an orphan entry for the given inode in case something goes 3071 * wrong in the middle of an unlink/truncate. 3072 * 3073 * NOTE: caller of this function should reserve 5 units of metadata for 3074 * this function. 
3075 */ 3076 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 3077 { 3078 struct btrfs_root *root = BTRFS_I(inode)->root; 3079 struct btrfs_block_rsv *block_rsv = NULL; 3080 int reserve = 0; 3081 int insert = 0; 3082 int ret; 3083 3084 if (!root->orphan_block_rsv) { 3085 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 3086 if (!block_rsv) 3087 return -ENOMEM; 3088 } 3089 3090 spin_lock(&root->orphan_lock); 3091 if (!root->orphan_block_rsv) { 3092 root->orphan_block_rsv = block_rsv; 3093 } else if (block_rsv) { 3094 btrfs_free_block_rsv(root, block_rsv); 3095 block_rsv = NULL; 3096 } 3097 3098 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3099 &BTRFS_I(inode)->runtime_flags)) { 3100 #if 0 3101 /* 3102 * For proper ENOSPC handling, we should do orphan 3103 * cleanup when mounting. But this introduces backward 3104 * compatibility issue. 3105 */ 3106 if (!xchg(&root->orphan_item_inserted, 1)) 3107 insert = 2; 3108 else 3109 insert = 1; 3110 #endif 3111 insert = 1; 3112 atomic_inc(&root->orphan_inodes); 3113 } 3114 3115 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3116 &BTRFS_I(inode)->runtime_flags)) 3117 reserve = 1; 3118 spin_unlock(&root->orphan_lock); 3119 3120 /* grab metadata reservation from transaction handle */ 3121 if (reserve) { 3122 ret = btrfs_orphan_reserve_metadata(trans, inode); 3123 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? 
JDM */ 3124 } 3125 3126 /* insert an orphan item to track this unlinked/truncated file */ 3127 if (insert >= 1) { 3128 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 3129 if (ret) { 3130 atomic_dec(&root->orphan_inodes); 3131 if (reserve) { 3132 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3133 &BTRFS_I(inode)->runtime_flags); 3134 btrfs_orphan_release_metadata(inode); 3135 } 3136 if (ret != -EEXIST) { 3137 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3138 &BTRFS_I(inode)->runtime_flags); 3139 btrfs_abort_transaction(trans, root, ret); 3140 return ret; 3141 } 3142 } 3143 ret = 0; 3144 } 3145 3146 /* insert an orphan item to track subvolume contains orphan files */ 3147 if (insert >= 2) { 3148 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 3149 root->root_key.objectid); 3150 if (ret && ret != -EEXIST) { 3151 btrfs_abort_transaction(trans, root, ret); 3152 return ret; 3153 } 3154 } 3155 return 0; 3156 } 3157 3158 /* 3159 * We have done the truncate/delete so we can go ahead and remove the orphan 3160 * item for this particular inode. 
3161 */ 3162 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3163 struct inode *inode) 3164 { 3165 struct btrfs_root *root = BTRFS_I(inode)->root; 3166 int delete_item = 0; 3167 int release_rsv = 0; 3168 int ret = 0; 3169 3170 spin_lock(&root->orphan_lock); 3171 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3172 &BTRFS_I(inode)->runtime_flags)) 3173 delete_item = 1; 3174 3175 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3176 &BTRFS_I(inode)->runtime_flags)) 3177 release_rsv = 1; 3178 spin_unlock(&root->orphan_lock); 3179 3180 if (delete_item) { 3181 atomic_dec(&root->orphan_inodes); 3182 if (trans) 3183 ret = btrfs_del_orphan_item(trans, root, 3184 btrfs_ino(inode)); 3185 } 3186 3187 if (release_rsv) 3188 btrfs_orphan_release_metadata(inode); 3189 3190 return ret; 3191 } 3192 3193 /* 3194 * this cleans up any orphans that may be left on the list from the last use 3195 * of this root. 3196 */ 3197 int btrfs_orphan_cleanup(struct btrfs_root *root) 3198 { 3199 struct btrfs_path *path; 3200 struct extent_buffer *leaf; 3201 struct btrfs_key key, found_key; 3202 struct btrfs_trans_handle *trans; 3203 struct inode *inode; 3204 u64 last_objectid = 0; 3205 int ret = 0, nr_unlink = 0, nr_truncate = 0; 3206 3207 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3208 return 0; 3209 3210 path = btrfs_alloc_path(); 3211 if (!path) { 3212 ret = -ENOMEM; 3213 goto out; 3214 } 3215 path->reada = -1; 3216 3217 key.objectid = BTRFS_ORPHAN_OBJECTID; 3218 key.type = BTRFS_ORPHAN_ITEM_KEY; 3219 key.offset = (u64)-1; 3220 3221 while (1) { 3222 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3223 if (ret < 0) 3224 goto out; 3225 3226 /* 3227 * if ret == 0 means we found what we were searching for, which 3228 * is weird, but possible, so only screw with path if we didn't 3229 * find the key and see if we have stuff that matches 3230 */ 3231 if (ret > 0) { 3232 ret = 0; 3233 if (path->slots[0] == 0) 3234 break; 3235 path->slots[0]--; 
3236 } 3237 3238 /* pull out the item */ 3239 leaf = path->nodes[0]; 3240 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3241 3242 /* make sure the item matches what we want */ 3243 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3244 break; 3245 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) 3246 break; 3247 3248 /* release the path since we're done with it */ 3249 btrfs_release_path(path); 3250 3251 /* 3252 * this is where we are basically btrfs_lookup, without the 3253 * crossing root thing. we store the inode number in the 3254 * offset of the orphan item. 3255 */ 3256 3257 if (found_key.offset == last_objectid) { 3258 btrfs_err(root->fs_info, 3259 "Error removing orphan entry, stopping orphan cleanup"); 3260 ret = -EINVAL; 3261 goto out; 3262 } 3263 3264 last_objectid = found_key.offset; 3265 3266 found_key.objectid = found_key.offset; 3267 found_key.type = BTRFS_INODE_ITEM_KEY; 3268 found_key.offset = 0; 3269 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 3270 ret = PTR_ERR_OR_ZERO(inode); 3271 if (ret && ret != -ESTALE) 3272 goto out; 3273 3274 if (ret == -ESTALE && root == root->fs_info->tree_root) { 3275 struct btrfs_root *dead_root; 3276 struct btrfs_fs_info *fs_info = root->fs_info; 3277 int is_dead_root = 0; 3278 3279 /* 3280 * this is an orphan in the tree root. Currently these 3281 * could come from 2 sources: 3282 * a) a snapshot deletion in progress 3283 * b) a free space cache inode 3284 * We need to distinguish those two, as the snapshot 3285 * orphan must not get deleted. 
3286 * find_dead_roots already ran before us, so if this 3287 * is a snapshot deletion, we should find the root 3288 * in the dead_roots list 3289 */ 3290 spin_lock(&fs_info->trans_lock); 3291 list_for_each_entry(dead_root, &fs_info->dead_roots, 3292 root_list) { 3293 if (dead_root->root_key.objectid == 3294 found_key.objectid) { 3295 is_dead_root = 1; 3296 break; 3297 } 3298 } 3299 spin_unlock(&fs_info->trans_lock); 3300 if (is_dead_root) { 3301 /* prevent this orphan from being found again */ 3302 key.offset = found_key.objectid - 1; 3303 continue; 3304 } 3305 } 3306 /* 3307 * Inode is already gone but the orphan item is still there, 3308 * kill the orphan item. 3309 */ 3310 if (ret == -ESTALE) { 3311 trans = btrfs_start_transaction(root, 1); 3312 if (IS_ERR(trans)) { 3313 ret = PTR_ERR(trans); 3314 goto out; 3315 } 3316 btrfs_debug(root->fs_info, "auto deleting %Lu", 3317 found_key.objectid); 3318 ret = btrfs_del_orphan_item(trans, root, 3319 found_key.objectid); 3320 btrfs_end_transaction(trans, root); 3321 if (ret) 3322 goto out; 3323 continue; 3324 } 3325 3326 /* 3327 * add this inode to the orphan list so btrfs_orphan_del does 3328 * the proper thing when we hit it 3329 */ 3330 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3331 &BTRFS_I(inode)->runtime_flags); 3332 atomic_inc(&root->orphan_inodes); 3333 3334 /* if we have links, this was a truncate, lets do that */ 3335 if (inode->i_nlink) { 3336 if (WARN_ON(!S_ISREG(inode->i_mode))) { 3337 iput(inode); 3338 continue; 3339 } 3340 nr_truncate++; 3341 3342 /* 1 for the orphan item deletion. 
*/ 3343 trans = btrfs_start_transaction(root, 1); 3344 if (IS_ERR(trans)) { 3345 iput(inode); 3346 ret = PTR_ERR(trans); 3347 goto out; 3348 } 3349 ret = btrfs_orphan_add(trans, inode); 3350 btrfs_end_transaction(trans, root); 3351 if (ret) { 3352 iput(inode); 3353 goto out; 3354 } 3355 3356 ret = btrfs_truncate(inode); 3357 if (ret) 3358 btrfs_orphan_del(NULL, inode); 3359 } else { 3360 nr_unlink++; 3361 } 3362 3363 /* this will do delete_inode and everything for us */ 3364 iput(inode); 3365 if (ret) 3366 goto out; 3367 } 3368 /* release the path since we're done with it */ 3369 btrfs_release_path(path); 3370 3371 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3372 3373 if (root->orphan_block_rsv) 3374 btrfs_block_rsv_release(root, root->orphan_block_rsv, 3375 (u64)-1); 3376 3377 if (root->orphan_block_rsv || 3378 test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { 3379 trans = btrfs_join_transaction(root); 3380 if (!IS_ERR(trans)) 3381 btrfs_end_transaction(trans, root); 3382 } 3383 3384 if (nr_unlink) 3385 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); 3386 if (nr_truncate) 3387 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); 3388 3389 out: 3390 if (ret) 3391 btrfs_crit(root->fs_info, 3392 "could not do orphan cleanup %d", ret); 3393 btrfs_free_path(path); 3394 return ret; 3395 } 3396 3397 /* 3398 * very simple check to peek ahead in the leaf looking for xattrs. If we 3399 * don't find any xattrs, we know there can't be any acls. 
3400 * 3401 * slot is the slot the inode is in, objectid is the objectid of the inode 3402 */ 3403 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3404 int slot, u64 objectid, 3405 int *first_xattr_slot) 3406 { 3407 u32 nritems = btrfs_header_nritems(leaf); 3408 struct btrfs_key found_key; 3409 static u64 xattr_access = 0; 3410 static u64 xattr_default = 0; 3411 int scanned = 0; 3412 3413 if (!xattr_access) { 3414 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, 3415 strlen(POSIX_ACL_XATTR_ACCESS)); 3416 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, 3417 strlen(POSIX_ACL_XATTR_DEFAULT)); 3418 } 3419 3420 slot++; 3421 *first_xattr_slot = -1; 3422 while (slot < nritems) { 3423 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3424 3425 /* we found a different objectid, there must not be acls */ 3426 if (found_key.objectid != objectid) 3427 return 0; 3428 3429 /* we found an xattr, assume we've got an acl */ 3430 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3431 if (*first_xattr_slot == -1) 3432 *first_xattr_slot = slot; 3433 if (found_key.offset == xattr_access || 3434 found_key.offset == xattr_default) 3435 return 1; 3436 } 3437 3438 /* 3439 * we found a key greater than an xattr key, there can't 3440 * be any acls later on 3441 */ 3442 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3443 return 0; 3444 3445 slot++; 3446 scanned++; 3447 3448 /* 3449 * it goes inode, inode backrefs, xattrs, extents, 3450 * so if there are a ton of hard links to an inode there can 3451 * be a lot of backrefs. Don't waste time searching too hard, 3452 * this is just an optimization 3453 */ 3454 if (scanned >= 8) 3455 break; 3456 } 3457 /* we hit the end of the leaf before we found an xattr or 3458 * something larger than an xattr. 
We have to assume the inode 3459 * has acls 3460 */ 3461 if (*first_xattr_slot == -1) 3462 *first_xattr_slot = slot; 3463 return 1; 3464 } 3465 3466 /* 3467 * read an inode from the btree into the in-memory inode 3468 */ 3469 static void btrfs_read_locked_inode(struct inode *inode) 3470 { 3471 struct btrfs_path *path; 3472 struct extent_buffer *leaf; 3473 struct btrfs_inode_item *inode_item; 3474 struct btrfs_timespec *tspec; 3475 struct btrfs_root *root = BTRFS_I(inode)->root; 3476 struct btrfs_key location; 3477 unsigned long ptr; 3478 int maybe_acls; 3479 u32 rdev; 3480 int ret; 3481 bool filled = false; 3482 int first_xattr_slot; 3483 3484 ret = btrfs_fill_inode(inode, &rdev); 3485 if (!ret) 3486 filled = true; 3487 3488 path = btrfs_alloc_path(); 3489 if (!path) 3490 goto make_bad; 3491 3492 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3493 3494 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3495 if (ret) 3496 goto make_bad; 3497 3498 leaf = path->nodes[0]; 3499 3500 if (filled) 3501 goto cache_index; 3502 3503 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3504 struct btrfs_inode_item); 3505 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3506 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3507 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3508 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3509 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3510 3511 tspec = btrfs_inode_atime(inode_item); 3512 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3513 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3514 3515 tspec = btrfs_inode_mtime(inode_item); 3516 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3517 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3518 3519 tspec = btrfs_inode_ctime(inode_item); 3520 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3521 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3522 3523 
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3524 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3525 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3526 3527 /* 3528 * If we were modified in the current generation and evicted from memory 3529 * and then re-read we need to do a full sync since we don't have any 3530 * idea about which extents were modified before we were evicted from 3531 * cache. 3532 */ 3533 if (BTRFS_I(inode)->last_trans == root->fs_info->generation) 3534 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3535 &BTRFS_I(inode)->runtime_flags); 3536 3537 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 3538 inode->i_generation = BTRFS_I(inode)->generation; 3539 inode->i_rdev = 0; 3540 rdev = btrfs_inode_rdev(leaf, inode_item); 3541 3542 BTRFS_I(inode)->index_cnt = (u64)-1; 3543 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3544 3545 cache_index: 3546 path->slots[0]++; 3547 if (inode->i_nlink != 1 || 3548 path->slots[0] >= btrfs_header_nritems(leaf)) 3549 goto cache_acl; 3550 3551 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); 3552 if (location.objectid != btrfs_ino(inode)) 3553 goto cache_acl; 3554 3555 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 3556 if (location.type == BTRFS_INODE_REF_KEY) { 3557 struct btrfs_inode_ref *ref; 3558 3559 ref = (struct btrfs_inode_ref *)ptr; 3560 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); 3561 } else if (location.type == BTRFS_INODE_EXTREF_KEY) { 3562 struct btrfs_inode_extref *extref; 3563 3564 extref = (struct btrfs_inode_extref *)ptr; 3565 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, 3566 extref); 3567 } 3568 cache_acl: 3569 /* 3570 * try to precache a NULL acl entry for files that don't have 3571 * any xattrs or acls 3572 */ 3573 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3574 btrfs_ino(inode), &first_xattr_slot); 3575 if (first_xattr_slot != -1) { 3576 path->slots[0] = 
first_xattr_slot; 3577 ret = btrfs_load_inode_props(inode, path); 3578 if (ret) 3579 btrfs_err(root->fs_info, 3580 "error loading props for ino %llu (root %llu): %d", 3581 btrfs_ino(inode), 3582 root->root_key.objectid, ret); 3583 } 3584 btrfs_free_path(path); 3585 3586 if (!maybe_acls) 3587 cache_no_acl(inode); 3588 3589 switch (inode->i_mode & S_IFMT) { 3590 case S_IFREG: 3591 inode->i_mapping->a_ops = &btrfs_aops; 3592 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 3593 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3594 inode->i_fop = &btrfs_file_operations; 3595 inode->i_op = &btrfs_file_inode_operations; 3596 break; 3597 case S_IFDIR: 3598 inode->i_fop = &btrfs_dir_file_operations; 3599 if (root == root->fs_info->tree_root) 3600 inode->i_op = &btrfs_dir_ro_inode_operations; 3601 else 3602 inode->i_op = &btrfs_dir_inode_operations; 3603 break; 3604 case S_IFLNK: 3605 inode->i_op = &btrfs_symlink_inode_operations; 3606 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3607 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 3608 break; 3609 default: 3610 inode->i_op = &btrfs_special_inode_operations; 3611 init_special_inode(inode, inode->i_mode, rdev); 3612 break; 3613 } 3614 3615 btrfs_update_iflags(inode); 3616 return; 3617 3618 make_bad: 3619 btrfs_free_path(path); 3620 make_bad_inode(inode); 3621 } 3622 3623 /* 3624 * given a leaf and an inode, copy the inode fields into the leaf 3625 */ 3626 static void fill_inode_item(struct btrfs_trans_handle *trans, 3627 struct extent_buffer *leaf, 3628 struct btrfs_inode_item *item, 3629 struct inode *inode) 3630 { 3631 struct btrfs_map_token token; 3632 3633 btrfs_init_map_token(&token); 3634 3635 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3636 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3637 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, 3638 &token); 3639 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3640 
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3641 3642 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3643 inode->i_atime.tv_sec, &token); 3644 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3645 inode->i_atime.tv_nsec, &token); 3646 3647 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3648 inode->i_mtime.tv_sec, &token); 3649 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3650 inode->i_mtime.tv_nsec, &token); 3651 3652 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3653 inode->i_ctime.tv_sec, &token); 3654 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3655 inode->i_ctime.tv_nsec, &token); 3656 3657 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3658 &token); 3659 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3660 &token); 3661 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3662 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3663 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3664 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3665 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3666 } 3667 3668 /* 3669 * copy everything in the in-memory inode into the btree. 
3670 */ 3671 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3672 struct btrfs_root *root, struct inode *inode) 3673 { 3674 struct btrfs_inode_item *inode_item; 3675 struct btrfs_path *path; 3676 struct extent_buffer *leaf; 3677 int ret; 3678 3679 path = btrfs_alloc_path(); 3680 if (!path) 3681 return -ENOMEM; 3682 3683 path->leave_spinning = 1; 3684 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 3685 1); 3686 if (ret) { 3687 if (ret > 0) 3688 ret = -ENOENT; 3689 goto failed; 3690 } 3691 3692 leaf = path->nodes[0]; 3693 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3694 struct btrfs_inode_item); 3695 3696 fill_inode_item(trans, leaf, inode_item, inode); 3697 btrfs_mark_buffer_dirty(leaf); 3698 btrfs_set_inode_last_trans(trans, inode); 3699 ret = 0; 3700 failed: 3701 btrfs_free_path(path); 3702 return ret; 3703 } 3704 3705 /* 3706 * copy everything in the in-memory inode into the btree. 3707 */ 3708 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3709 struct btrfs_root *root, struct inode *inode) 3710 { 3711 int ret; 3712 3713 /* 3714 * If the inode is a free space inode, we can deadlock during commit 3715 * if we put it into the delayed code. 
3716 * 3717 * The data relocation inode should also be directly updated 3718 * without delay 3719 */ 3720 if (!btrfs_is_free_space_inode(inode) 3721 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 3722 && !root->fs_info->log_root_recovering) { 3723 btrfs_update_root_times(trans, root); 3724 3725 ret = btrfs_delayed_update_inode(trans, root, inode); 3726 if (!ret) 3727 btrfs_set_inode_last_trans(trans, inode); 3728 return ret; 3729 } 3730 3731 return btrfs_update_inode_item(trans, root, inode); 3732 } 3733 3734 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3735 struct btrfs_root *root, 3736 struct inode *inode) 3737 { 3738 int ret; 3739 3740 ret = btrfs_update_inode(trans, root, inode); 3741 if (ret == -ENOSPC) 3742 return btrfs_update_inode_item(trans, root, inode); 3743 return ret; 3744 } 3745 3746 /* 3747 * unlink helper that gets used here in inode.c and in the tree logging 3748 * recovery code. It remove a link in a directory with a given name, and 3749 * also drops the back refs in the inode to the directory 3750 */ 3751 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3752 struct btrfs_root *root, 3753 struct inode *dir, struct inode *inode, 3754 const char *name, int name_len) 3755 { 3756 struct btrfs_path *path; 3757 int ret = 0; 3758 struct extent_buffer *leaf; 3759 struct btrfs_dir_item *di; 3760 struct btrfs_key key; 3761 u64 index; 3762 u64 ino = btrfs_ino(inode); 3763 u64 dir_ino = btrfs_ino(dir); 3764 3765 path = btrfs_alloc_path(); 3766 if (!path) { 3767 ret = -ENOMEM; 3768 goto out; 3769 } 3770 3771 path->leave_spinning = 1; 3772 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3773 name, name_len, -1); 3774 if (IS_ERR(di)) { 3775 ret = PTR_ERR(di); 3776 goto err; 3777 } 3778 if (!di) { 3779 ret = -ENOENT; 3780 goto err; 3781 } 3782 leaf = path->nodes[0]; 3783 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3784 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3785 if (ret) 3786 goto 
err; 3787 btrfs_release_path(path); 3788 3789 /* 3790 * If we don't have dir index, we have to get it by looking up 3791 * the inode ref, since we get the inode ref, remove it directly, 3792 * it is unnecessary to do delayed deletion. 3793 * 3794 * But if we have dir index, needn't search inode ref to get it. 3795 * Since the inode ref is close to the inode item, it is better 3796 * that we delay to delete it, and just do this deletion when 3797 * we update the inode item. 3798 */ 3799 if (BTRFS_I(inode)->dir_index) { 3800 ret = btrfs_delayed_delete_inode_ref(inode); 3801 if (!ret) { 3802 index = BTRFS_I(inode)->dir_index; 3803 goto skip_backref; 3804 } 3805 } 3806 3807 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 3808 dir_ino, &index); 3809 if (ret) { 3810 btrfs_info(root->fs_info, 3811 "failed to delete reference to %.*s, inode %llu parent %llu", 3812 name_len, name, ino, dir_ino); 3813 btrfs_abort_transaction(trans, root, ret); 3814 goto err; 3815 } 3816 skip_backref: 3817 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3818 if (ret) { 3819 btrfs_abort_transaction(trans, root, ret); 3820 goto err; 3821 } 3822 3823 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 3824 inode, dir_ino); 3825 if (ret != 0 && ret != -ENOENT) { 3826 btrfs_abort_transaction(trans, root, ret); 3827 goto err; 3828 } 3829 3830 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 3831 dir, index); 3832 if (ret == -ENOENT) 3833 ret = 0; 3834 else if (ret) 3835 btrfs_abort_transaction(trans, root, ret); 3836 err: 3837 btrfs_free_path(path); 3838 if (ret) 3839 goto out; 3840 3841 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3842 inode_inc_iversion(inode); 3843 inode_inc_iversion(dir); 3844 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3845 ret = btrfs_update_inode(trans, root, dir); 3846 out: 3847 return ret; 3848 } 3849 3850 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3851 struct btrfs_root *root, 3852 
struct inode *dir, struct inode *inode, 3853 const char *name, int name_len) 3854 { 3855 int ret; 3856 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 3857 if (!ret) { 3858 drop_nlink(inode); 3859 ret = btrfs_update_inode(trans, root, inode); 3860 } 3861 return ret; 3862 } 3863 3864 /* 3865 * helper to start transaction for unlink and rmdir. 3866 * 3867 * unlink and rmdir are special in btrfs, they do not always free space, so 3868 * if we cannot make our reservations the normal way try and see if there is 3869 * plenty of slack room in the global reserve to migrate, otherwise we cannot 3870 * allow the unlink to occur. 3871 */ 3872 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 3873 { 3874 struct btrfs_trans_handle *trans; 3875 struct btrfs_root *root = BTRFS_I(dir)->root; 3876 int ret; 3877 3878 /* 3879 * 1 for the possible orphan item 3880 * 1 for the dir item 3881 * 1 for the dir index 3882 * 1 for the inode ref 3883 * 1 for the inode 3884 */ 3885 trans = btrfs_start_transaction(root, 5); 3886 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3887 return trans; 3888 3889 if (PTR_ERR(trans) == -ENOSPC) { 3890 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 3891 3892 trans = btrfs_start_transaction(root, 0); 3893 if (IS_ERR(trans)) 3894 return trans; 3895 ret = btrfs_cond_migrate_bytes(root->fs_info, 3896 &root->fs_info->trans_block_rsv, 3897 num_bytes, 5); 3898 if (ret) { 3899 btrfs_end_transaction(trans, root); 3900 return ERR_PTR(ret); 3901 } 3902 trans->block_rsv = &root->fs_info->trans_block_rsv; 3903 trans->bytes_reserved = num_bytes; 3904 } 3905 return trans; 3906 } 3907 3908 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3909 { 3910 struct btrfs_root *root = BTRFS_I(dir)->root; 3911 struct btrfs_trans_handle *trans; 3912 struct inode *inode = dentry->d_inode; 3913 int ret; 3914 3915 trans = __unlink_start_trans(dir); 3916 if (IS_ERR(trans)) 3917 return PTR_ERR(trans); 3918 3919 
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3920 3921 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3922 dentry->d_name.name, dentry->d_name.len); 3923 if (ret) 3924 goto out; 3925 3926 if (inode->i_nlink == 0) { 3927 ret = btrfs_orphan_add(trans, inode); 3928 if (ret) 3929 goto out; 3930 } 3931 3932 out: 3933 btrfs_end_transaction(trans, root); 3934 btrfs_btree_balance_dirty(root); 3935 return ret; 3936 } 3937 3938 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3939 struct btrfs_root *root, 3940 struct inode *dir, u64 objectid, 3941 const char *name, int name_len) 3942 { 3943 struct btrfs_path *path; 3944 struct extent_buffer *leaf; 3945 struct btrfs_dir_item *di; 3946 struct btrfs_key key; 3947 u64 index; 3948 int ret; 3949 u64 dir_ino = btrfs_ino(dir); 3950 3951 path = btrfs_alloc_path(); 3952 if (!path) 3953 return -ENOMEM; 3954 3955 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3956 name, name_len, -1); 3957 if (IS_ERR_OR_NULL(di)) { 3958 if (!di) 3959 ret = -ENOENT; 3960 else 3961 ret = PTR_ERR(di); 3962 goto out; 3963 } 3964 3965 leaf = path->nodes[0]; 3966 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3967 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3968 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3969 if (ret) { 3970 btrfs_abort_transaction(trans, root, ret); 3971 goto out; 3972 } 3973 btrfs_release_path(path); 3974 3975 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3976 objectid, root->root_key.objectid, 3977 dir_ino, &index, name, name_len); 3978 if (ret < 0) { 3979 if (ret != -ENOENT) { 3980 btrfs_abort_transaction(trans, root, ret); 3981 goto out; 3982 } 3983 di = btrfs_search_dir_index_item(root, path, dir_ino, 3984 name, name_len); 3985 if (IS_ERR_OR_NULL(di)) { 3986 if (!di) 3987 ret = -ENOENT; 3988 else 3989 ret = PTR_ERR(di); 3990 btrfs_abort_transaction(trans, root, ret); 3991 goto out; 3992 } 3993 3994 leaf = path->nodes[0]; 3995 
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3996 btrfs_release_path(path); 3997 index = key.offset; 3998 } 3999 btrfs_release_path(path); 4000 4001 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 4002 if (ret) { 4003 btrfs_abort_transaction(trans, root, ret); 4004 goto out; 4005 } 4006 4007 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 4008 inode_inc_iversion(dir); 4009 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 4010 ret = btrfs_update_inode_fallback(trans, root, dir); 4011 if (ret) 4012 btrfs_abort_transaction(trans, root, ret); 4013 out: 4014 btrfs_free_path(path); 4015 return ret; 4016 } 4017 4018 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4019 { 4020 struct inode *inode = dentry->d_inode; 4021 int err = 0; 4022 struct btrfs_root *root = BTRFS_I(dir)->root; 4023 struct btrfs_trans_handle *trans; 4024 4025 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4026 return -ENOTEMPTY; 4027 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 4028 return -EPERM; 4029 4030 trans = __unlink_start_trans(dir); 4031 if (IS_ERR(trans)) 4032 return PTR_ERR(trans); 4033 4034 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4035 err = btrfs_unlink_subvol(trans, root, dir, 4036 BTRFS_I(inode)->location.objectid, 4037 dentry->d_name.name, 4038 dentry->d_name.len); 4039 goto out; 4040 } 4041 4042 err = btrfs_orphan_add(trans, inode); 4043 if (err) 4044 goto out; 4045 4046 /* now the directory is empty */ 4047 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 4048 dentry->d_name.name, dentry->d_name.len); 4049 if (!err) 4050 btrfs_i_size_write(inode, 0); 4051 out: 4052 btrfs_end_transaction(trans, root); 4053 btrfs_btree_balance_dirty(root); 4054 4055 return err; 4056 } 4057 4058 /* 4059 * this can truncate away extent items, csum items and directory items. 
4060 * It starts at a high offset and removes keys until it can't find 4061 * any higher than new_size 4062 * 4063 * csum items that cross the new i_size are truncated to the new size 4064 * as well. 4065 * 4066 * min_type is the minimum key type to truncate down to. If set to 0, this 4067 * will kill all the items on this inode, including the INODE_ITEM_KEY. 4068 */ 4069 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4070 struct btrfs_root *root, 4071 struct inode *inode, 4072 u64 new_size, u32 min_type) 4073 { 4074 struct btrfs_path *path; 4075 struct extent_buffer *leaf; 4076 struct btrfs_file_extent_item *fi; 4077 struct btrfs_key key; 4078 struct btrfs_key found_key; 4079 u64 extent_start = 0; 4080 u64 extent_num_bytes = 0; 4081 u64 extent_offset = 0; 4082 u64 item_end = 0; 4083 u64 last_size = (u64)-1; 4084 u32 found_type = (u8)-1; 4085 int found_extent; 4086 int del_item; 4087 int pending_del_nr = 0; 4088 int pending_del_slot = 0; 4089 int extent_type = -1; 4090 int ret; 4091 int err = 0; 4092 u64 ino = btrfs_ino(inode); 4093 4094 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4095 4096 path = btrfs_alloc_path(); 4097 if (!path) 4098 return -ENOMEM; 4099 path->reada = -1; 4100 4101 /* 4102 * We want to drop from the next block forward in case this new size is 4103 * not block aligned since we will be keeping the last block of the 4104 * extent just the way it is. 4105 */ 4106 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4107 root == root->fs_info->tree_root) 4108 btrfs_drop_extent_cache(inode, ALIGN(new_size, 4109 root->sectorsize), (u64)-1, 0); 4110 4111 /* 4112 * This function is also used to drop the items in the log tree before 4113 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 4114 * it is used to drop the loged items. So we shouldn't kill the delayed 4115 * items. 
4116 */ 4117 if (min_type == 0 && root == BTRFS_I(inode)->root) 4118 btrfs_kill_delayed_inode_items(inode); 4119 4120 key.objectid = ino; 4121 key.offset = (u64)-1; 4122 key.type = (u8)-1; 4123 4124 search_again: 4125 path->leave_spinning = 1; 4126 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4127 if (ret < 0) { 4128 err = ret; 4129 goto out; 4130 } 4131 4132 if (ret > 0) { 4133 /* there are no items in the tree for us to truncate, we're 4134 * done 4135 */ 4136 if (path->slots[0] == 0) 4137 goto out; 4138 path->slots[0]--; 4139 } 4140 4141 while (1) { 4142 fi = NULL; 4143 leaf = path->nodes[0]; 4144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4145 found_type = found_key.type; 4146 4147 if (found_key.objectid != ino) 4148 break; 4149 4150 if (found_type < min_type) 4151 break; 4152 4153 item_end = found_key.offset; 4154 if (found_type == BTRFS_EXTENT_DATA_KEY) { 4155 fi = btrfs_item_ptr(leaf, path->slots[0], 4156 struct btrfs_file_extent_item); 4157 extent_type = btrfs_file_extent_type(leaf, fi); 4158 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4159 item_end += 4160 btrfs_file_extent_num_bytes(leaf, fi); 4161 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4162 item_end += btrfs_file_extent_inline_len(leaf, 4163 path->slots[0], fi); 4164 } 4165 item_end--; 4166 } 4167 if (found_type > min_type) { 4168 del_item = 1; 4169 } else { 4170 if (item_end < new_size) 4171 break; 4172 if (found_key.offset >= new_size) 4173 del_item = 1; 4174 else 4175 del_item = 0; 4176 } 4177 found_extent = 0; 4178 /* FIXME, shrink the extent if the ref count is only 1 */ 4179 if (found_type != BTRFS_EXTENT_DATA_KEY) 4180 goto delete; 4181 4182 if (del_item) 4183 last_size = found_key.offset; 4184 else 4185 last_size = new_size; 4186 4187 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4188 u64 num_dec; 4189 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 4190 if (!del_item) { 4191 u64 orig_num_bytes = 4192 btrfs_file_extent_num_bytes(leaf, fi); 4193 
extent_num_bytes = ALIGN(new_size - 4194 found_key.offset, 4195 root->sectorsize); 4196 btrfs_set_file_extent_num_bytes(leaf, fi, 4197 extent_num_bytes); 4198 num_dec = (orig_num_bytes - 4199 extent_num_bytes); 4200 if (test_bit(BTRFS_ROOT_REF_COWS, 4201 &root->state) && 4202 extent_start != 0) 4203 inode_sub_bytes(inode, num_dec); 4204 btrfs_mark_buffer_dirty(leaf); 4205 } else { 4206 extent_num_bytes = 4207 btrfs_file_extent_disk_num_bytes(leaf, 4208 fi); 4209 extent_offset = found_key.offset - 4210 btrfs_file_extent_offset(leaf, fi); 4211 4212 /* FIXME blocksize != 4096 */ 4213 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 4214 if (extent_start != 0) { 4215 found_extent = 1; 4216 if (test_bit(BTRFS_ROOT_REF_COWS, 4217 &root->state)) 4218 inode_sub_bytes(inode, num_dec); 4219 } 4220 } 4221 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 4222 /* 4223 * we can't truncate inline items that have had 4224 * special encodings 4225 */ 4226 if (!del_item && 4227 btrfs_file_extent_compression(leaf, fi) == 0 && 4228 btrfs_file_extent_encryption(leaf, fi) == 0 && 4229 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4230 u32 size = new_size - found_key.offset; 4231 4232 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4233 inode_sub_bytes(inode, item_end + 1 - 4234 new_size); 4235 4236 /* 4237 * update the ram bytes to properly reflect 4238 * the new size of our item 4239 */ 4240 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4241 size = 4242 btrfs_file_extent_calc_inline_size(size); 4243 btrfs_truncate_item(root, path, size, 1); 4244 } else if (test_bit(BTRFS_ROOT_REF_COWS, 4245 &root->state)) { 4246 inode_sub_bytes(inode, item_end + 1 - 4247 found_key.offset); 4248 } 4249 } 4250 delete: 4251 if (del_item) { 4252 if (!pending_del_nr) { 4253 /* no pending yet, add ourselves */ 4254 pending_del_slot = path->slots[0]; 4255 pending_del_nr = 1; 4256 } else if (pending_del_nr && 4257 path->slots[0] + 1 == pending_del_slot) { 4258 /* hop on the pending chunk */ 4259 
pending_del_nr++; 4260 pending_del_slot = path->slots[0]; 4261 } else { 4262 BUG(); 4263 } 4264 } else { 4265 break; 4266 } 4267 if (found_extent && 4268 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4269 root == root->fs_info->tree_root)) { 4270 btrfs_set_path_blocking(path); 4271 ret = btrfs_free_extent(trans, root, extent_start, 4272 extent_num_bytes, 0, 4273 btrfs_header_owner(leaf), 4274 ino, extent_offset, 0); 4275 BUG_ON(ret); 4276 } 4277 4278 if (found_type == BTRFS_INODE_ITEM_KEY) 4279 break; 4280 4281 if (path->slots[0] == 0 || 4282 path->slots[0] != pending_del_slot) { 4283 if (pending_del_nr) { 4284 ret = btrfs_del_items(trans, root, path, 4285 pending_del_slot, 4286 pending_del_nr); 4287 if (ret) { 4288 btrfs_abort_transaction(trans, 4289 root, ret); 4290 goto error; 4291 } 4292 pending_del_nr = 0; 4293 } 4294 btrfs_release_path(path); 4295 goto search_again; 4296 } else { 4297 path->slots[0]--; 4298 } 4299 } 4300 out: 4301 if (pending_del_nr) { 4302 ret = btrfs_del_items(trans, root, path, pending_del_slot, 4303 pending_del_nr); 4304 if (ret) 4305 btrfs_abort_transaction(trans, root, ret); 4306 } 4307 error: 4308 if (last_size != (u64)-1 && 4309 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4310 btrfs_ordered_update_i_size(inode, last_size, NULL); 4311 btrfs_free_path(path); 4312 return err; 4313 } 4314 4315 /* 4316 * btrfs_truncate_page - read, zero a chunk and write a page 4317 * @inode - inode that we're zeroing 4318 * @from - the offset to start zeroing 4319 * @len - the length to zero, 0 to zero the entire range respective to the 4320 * offset 4321 * @front - zero up to the offset instead of from the offset on 4322 * 4323 * This will find the page for the "from" offset and cow the page and zero the 4324 * part we want to zero. This is used with truncate and hole punching. 
*/
int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
			int front)
{
	struct address_space *mapping = inode->i_mapping;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	char *kaddr;
	u32 blocksize = root->sectorsize;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	int ret = 0;
	u64 page_start;
	u64 page_end;

	/*
	 * Nothing to do (return 0) when the in-page offset is sector aligned
	 * and the length is either 0 or sector aligned as well.
	 */
	if ((offset & (blocksize - 1)) == 0 &&
	    (!len || ((len & (blocksize - 1)) == 0)))
		goto out;
	/*
	 * Reserve delalloc space for one page.  It is handed back on the
	 * failure paths below (page allocation failure, and out_unlock when
	 * ret is non-zero).
	 */
	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret)
		goto out;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page) {
		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
		ret = -ENOMEM;
		goto out;
	}

	page_start = page_offset(page);
	page_end = page_start + PAGE_CACHE_SIZE - 1;

	if (!PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		/*
		 * Take the page lock again after the read and make sure the
		 * page still belongs to this mapping; if it does not, drop it
		 * and start over.
		 */
		lock_page(page);
		if (page->mapping != mapping) {
			unlock_page(page);
			page_cache_release(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto out_unlock;
		}
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
	set_page_extent_mapped(page);

	/*
	 * If an ordered extent is found at page_start, release the extent
	 * range, the page lock and the page reference, start the ordered
	 * extent and retry from the page lookup.
	 */
	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		unlock_page(page);
		page_cache_release(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/* Clear the old dirty/delalloc state bits before marking it again. */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			  EXTENT_DIRTY | EXTENT_DELALLOC |
			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			  0, 0, &cached_state, GFP_NOFS);

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
					&cached_state);
	if (ret) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state, GFP_NOFS);
		goto out_unlock;
	}

	/*
	 * NOTE(review): offset was computed as from & (PAGE_CACHE_SIZE-1)
	 * above, so this test looks like it is always true - confirm.
	 */
	if (offset != PAGE_CACHE_SIZE) {
		/* len == 0 means "up to the end of the page" */
		if (!len)
			len = PAGE_CACHE_SIZE - offset;
		kaddr = kmap(page);
		/* front: zero [0, offset); otherwise zero [offset, offset+len) */
		if (front)
			memset(kaddr, 0, offset);
		else
			memset(kaddr + offset, 0, len);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
			     GFP_NOFS);

out_unlock:
	/* Only a failure gives the reserved space back. */
	if (ret)
		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}

/*
 * maybe_insert_hole - record a hole for [offset, offset + len) of an inode
 * @root:   root the inode belongs to
 * @inode:  inode being expanded
 * @offset: start of the hole
 * @len:    length of the hole
 *
 * When the filesystem has the NO_HOLES incompat bit set, no item is inserted;
 * only the inode's last_trans, last_sub_trans and last_log_commit fields are
 * refreshed and 0 is returned.  Otherwise a transaction with 3 units is
 * started, existing extents in the range are dropped and a file extent item
 * is inserted with btrfs_insert_file_extent() (all disk location arguments
 * are 0 - presumably that is what encodes a hole; confirm against the
 * helper), followed by an inode update.
 *
 * Returns 0 on success or the negative errno from starting the transaction,
 * dropping the extents or inserting the item.  The return value of
 * btrfs_update_inode() is not checked here.
 */
static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
			     u64 offset, u64 len)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Still need to make sure the inode looks like it's been updated so
	 * that any holes get logged if we fsync.
	 */
	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
		BTRFS_I(inode)->last_trans = root->fs_info->generation;
		BTRFS_I(inode)->last_sub_trans = root->log_transid;
		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
		return 0;
	}

	/*
	 * 1 - for the one we're dropping
	 * 1 - for the one we're adding
	 * 1 - for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_end_transaction(trans, root);
		return ret;
	}

	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
				       0, 0, len, 0, len, 0, 0, 0);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
	else
		btrfs_update_inode(trans, root, inode);
	btrfs_end_transaction(trans, root);
	return ret;
}

/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 *
 * Returns 0 on success or a negative errno from btrfs_truncate_page(),
 * btrfs_get_extent(), maybe_insert_hole() or add_extent_mapping().
 */
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 hole_start = ALIGN(oldsize, root->sectorsize);
	u64 block_end = ALIGN(size, root->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int err = 0;

	/*
	 * If our size started in the middle of a page we need to zero out the
	 * rest of the page before we expand the i_size, otherwise we could
	 * expose stale data.
	 */
	err = btrfs_truncate_page(inode, oldsize, 0, 0);
	if (err)
		return err;

	/* The new size does not reach past the sector holding oldsize. */
	if (size <= hole_start)
		return 0;

	/*
	 * Lock [hole_start, block_end - 1].  While an ordered extent overlaps
	 * the range, unlock, start that ordered extent and try again.  The
	 * loop is left with the range still locked.
	 */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
				 &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, hole_start,
						     block_end - hole_start);
		if (!ordered)
			break;
		unlock_extent_cached(io_tree, hole_start, block_end - 1,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	/* Walk the extent maps covering the range, one map per iteration. */
	cur_offset = hole_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				block_end - cur_offset, 0);
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte , root->sectorsize);
		/* Maps flagged PREALLOC are left alone. */
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			struct extent_map *hole_em;
			hole_size = last_byte - cur_offset;

			err = maybe_insert_hole(root, inode, cur_offset,
						hole_size);
			if (err)
				break;
			btrfs_drop_extent_cache(inode, cur_offset,
						cur_offset + hole_size - 1, 0);
			hole_em = alloc_extent_map();
			if (!hole_em) {
				/*
				 * No in-memory hole mapping could be
				 * allocated: flag the inode with
				 * BTRFS_INODE_NEEDS_FULL_SYNC and carry on.
				 */
				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					&BTRFS_I(inode)->runtime_flags);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;

			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = root->fs_info->generation;

			/*
			 * Insert the mapping; as long as the insert reports
			 * -EEXIST, drop the cached range and retry.
			 */
			while (1) {
				write_lock(&em_tree->lock);
				err = add_extent_mapping(em_tree, hole_em, 1);
				write_unlock(&em_tree->lock);
				if (err != -EEXIST)
					break;
				btrfs_drop_extent_cache(inode, cur_offset,
							cur_offset +
							hole_size - 1, 0);
			}
			free_extent_map(hole_em);
		}
next:
		free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}
	/* em is NULL here unless maybe_insert_hole() failed above. */
	free_extent_map(em);
	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
			     GFP_NOFS);
	return err;
}

/*
 * btrfs_setsize - apply the size change requested by a setattr call
 * @inode: inode whose size changes
 * @attr:  attributes from the caller; ia_size is the new size and ia_valid
 *         is consulted for ATTR_CTIME / ATTR_MTIME
 *
 * Growing (newsize > oldsize): the page cache is truncated to newsize, the
 * gap is filled through btrfs_cont_expand() and the new i_size is written
 * inside a 1-unit transaction.
 *
 * Otherwise: an orphan item is added first, the in-memory size is set with
 * truncate_setsize(), outstanding direct IO is waited for and
 * btrfs_truncate() is called.  If that fails while the inode still has links,
 * i_size is reset to disk_i_size and the orphan item is deleted again.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
			inode->i_ctime = inode->i_mtime =
				current_fs_time(inode->i_sb);
	}

	if (newsize > oldsize) {
		truncate_pagecache(inode, newsize);
		ret = btrfs_cont_expand(inode, oldsize, newsize);
		if (ret)
			return ret;

		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		i_size_write(inode, newsize);
		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
		ret = btrfs_update_inode(trans, root, inode);
		btrfs_end_transaction(trans, root);
	} else {

		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure it gets into the ordered flush list so that
		 * any new writes get down to disk quickly.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		/*
		 * 1 for the orphan item we're going to add
		 * 1 for the orphan item deletion.
		 */
		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		/*
		 * We need to do this in case we fail at _any_ point during the
		 * actual truncate.  Once we do the truncate_setsize we could
		 * invalidate pages which forces any outstanding ordered io to
		 * be instantly completed which will give us extents that need
		 * to be truncated.  If we fail to get an orphan inode down we
		 * could have left over extents that were never meant to live,
		 * so we need to guarantee from this point on that everything
		 * will be consistent.
		 */
		ret = btrfs_orphan_add(trans, inode);
		btrfs_end_transaction(trans, root);
		if (ret)
			return ret;

		/* we don't support swapfiles, so vmtruncate shouldn't fail */
		truncate_setsize(inode, newsize);

		/* Disable nonlocked read DIO to avoid the endless truncate */
		btrfs_inode_block_unlocked_dio(inode);
		inode_dio_wait(inode);
		btrfs_inode_resume_unlocked_dio(inode);

		ret = btrfs_truncate(inode);
		if (ret && inode->i_nlink) {
			int err;

			/*
			 * failed to truncate, disk_i_size is only adjusted down
			 * as we remove extents, so it should represent the true
			 * size of the inode, so reset the in memory size and
			 * delete our orphan entry.
			 */
			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans)) {
				/*
				 * No transaction available: still call
				 * btrfs_orphan_del() with a NULL handle and
				 * report the truncate error.
				 */
				btrfs_orphan_del(NULL, inode);
				return ret;
			}
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
			err = btrfs_orphan_del(trans, inode);
			if (err)
				btrfs_abort_transaction(trans, root, err);
			btrfs_end_transaction(trans, root);
		}
	}

	return ret;
}

/*
 * btrfs_setattr - change inode attributes
 * @dentry: dentry whose inode is modified
 * @attr:   requested attribute changes
 *
 * Fails with -EROFS when the inode's root is read-only and with whatever
 * inode_change_ok() reports.  A size change (ATTR_SIZE) on a regular file is
 * handled by btrfs_setsize() first.  If any attribute bit is set, the values
 * are copied into the inode, i_version is bumped and the inode is dirtied;
 * a mode change additionally calls posix_acl_chmod().
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int err;

	if (btrfs_root_readonly(root))
		return -EROFS;

	err = inode_change_ok(inode, attr);
	if (err)
		return err;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		err = btrfs_setsize(inode, attr);
		if (err)
			return err;
	}

	if (attr->ia_valid) {
		setattr_copy(inode, attr);
		inode_inc_iversion(inode);
		err = btrfs_dirty_inode(inode);

		if (!err && attr->ia_valid & ATTR_MODE)
			err = posix_acl_chmod(inode, inode->i_mode);
	}

	return err;
}

/*
 * While truncating the inode pages during eviction, we get the VFS calling
 * btrfs_invalidatepage() against each page of the inode. This is slow because
 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
 * extent_state structures over and over, wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
 * those expensive operations on a per page basis and do only the ordered io
 * finishing, while we release here the extent_map and extent_state structures,
 * without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct rb_node *node;

	/* Only valid for an inode that is being freed. */
	ASSERT(inode->i_state & I_FREEING);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * Remove and free every extent map, always taking the first node of
	 * the tree.  The PINNED and LOGGING flags are cleared before removal.
	 * The lock is dropped around cond_resched() when a reschedule is due.
	 */
	write_lock(&map_tree->lock);
	while (!RB_EMPTY_ROOT(&map_tree->map)) {
		struct extent_map *em;

		node = rb_first(&map_tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(map_tree, em);
		free_extent_map(em);
		if (need_resched()) {
			write_unlock(&map_tree->lock);
			cond_resched();
			write_lock(&map_tree->lock);
		}
	}
	write_unlock(&map_tree->lock);

	/*
	 * Same idea for the extent states: take the first state, bump its
	 * reference count so it can be used after io_tree->lock is dropped,
	 * then lock its range and clear the listed bits for it.  The matching
	 * free_extent_state() presumably drops the reference taken here.
	 */
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;

		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		atomic_inc(&state->refs);
		spin_unlock(&io_tree->lock);

		lock_extent_bits(io_tree, state->start, state->end,
				 0, &cached_state);
		clear_extent_bit(io_tree, state->start, state->end,
				 EXTENT_LOCKED | EXTENT_DIRTY |
				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
				 EXTENT_DEFRAG, 1, 1,
				 &cached_state, GFP_NOFS);
		free_extent_state(state);

		cond_resched();
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}

/*
 * btrfs_evict_inode - tear down an inode that is leaving the inode cache
 * @inode: inode being evicted
 *
 * The cached pages, extent maps and extent states are always released.  The
 * on-disk items are only truncated when the inode has no links left and none
 * of the early "no_delete" conditions below applies.  Every path ends by
 * removing the delayed node and calling clear_inode().
 */
void btrfs_evict_inode(struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv, *global_rsv;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
	int ret;

	trace_btrfs_inode_evict(inode);

	evict_inode_truncate_pages(inode);

	/*
	 * Still linked, and either the root is referenced and is not the
	 * root tree, or this is a free space inode: keep the items.
	 */
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
	     btrfs_is_free_space_inode(inode)))
		goto no_delete;

	if (is_bad_inode(inode)) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}
	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
	btrfs_wait_ordered_range(inode, 0, (u64)-1);

	btrfs_free_io_failure_record(inode, 0, (u64)-1);

	if (root->fs_info->log_root_recovering) {
		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
				 &BTRFS_I(inode)->runtime_flags));
		goto no_delete;
	}

	if (inode->i_nlink > 0) {
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
		goto no_delete;
	}

	ret = btrfs_commit_inode_delayed_inode(inode);
	if (ret) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}

	/* Temporary reservation used while truncating the items. */
	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv) {
		btrfs_orphan_del(NULL, inode);
		goto no_delete;
	}
	rsv->size = min_size;
	rsv->failfast = 1;
	global_rsv = &root->fs_info->global_block_rsv;

	btrfs_i_size_write(inode, 0);

	/*
	 * This is a bit simpler than btrfs_truncate since we've already
	 * reserved our space for our orphan item in the unlink, so we just
	 * need to reserve some slack space in case we add bytes and update
	 * inode item when doing the truncate.
	 */
	while (1) {
		ret = btrfs_block_rsv_refill(root, rsv, min_size,
					     BTRFS_RESERVE_FLUSH_LIMIT);

		/*
		 * Try and steal from the global reserve since we will
		 * likely not use this space anyway, we want to try as
		 * hard as possible to get this to work.
		 */
		if (ret)
			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);

		if (ret) {
			btrfs_warn(root->fs_info,
				"Could not get space for a delete, will truncate on mount %d",
				ret);
			btrfs_orphan_del(NULL, inode);
			btrfs_free_block_rsv(root, rsv);
			goto no_delete;
		}

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			btrfs_orphan_del(NULL, inode);
			btrfs_free_block_rsv(root, rsv);
			goto no_delete;
		}

		trans->block_rsv = rsv;

		/*
		 * Anything other than -ENOSPC (success included) leaves the
		 * loop with the transaction handle still open.
		 */
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		if (ret != -ENOSPC)
			break;

		/* -ENOSPC: end this transaction and go around again. */
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		btrfs_end_transaction(trans, root);
		trans = NULL;
		btrfs_btree_balance_dirty(root);
	}

	btrfs_free_block_rsv(root, rsv);

	/*
	 * Errors here aren't a big deal, it just means we leave orphan items
	 * in the tree.  They will be cleaned up on the next mount.
	 */
	if (ret == 0) {
		trans->block_rsv = root->orphan_block_rsv;
		btrfs_orphan_del(trans, inode);
	} else {
		btrfs_orphan_del(NULL, inode);
	}

	trans->block_rsv = &root->fs_info->trans_block_rsv;
	/* The inode number is not returned for the tree root or reloc trees. */
	if (!(root == root->fs_info->tree_root ||
	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
		btrfs_return_ino(root, btrfs_ino(inode));

	btrfs_end_transaction(trans, root);
	btrfs_btree_balance_dirty(root);
no_delete:
	btrfs_remove_delayed_node(inode);
	clear_inode(inode);
	return;
}

/*
 * this returns the key found in the dir entry in the location pointer.
 * If no dir entries were found, location->objectid is 0.
4913 */ 4914 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 4915 struct btrfs_key *location) 4916 { 4917 const char *name = dentry->d_name.name; 4918 int namelen = dentry->d_name.len; 4919 struct btrfs_dir_item *di; 4920 struct btrfs_path *path; 4921 struct btrfs_root *root = BTRFS_I(dir)->root; 4922 int ret = 0; 4923 4924 path = btrfs_alloc_path(); 4925 if (!path) 4926 return -ENOMEM; 4927 4928 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 4929 namelen, 0); 4930 if (IS_ERR(di)) 4931 ret = PTR_ERR(di); 4932 4933 if (IS_ERR_OR_NULL(di)) 4934 goto out_err; 4935 4936 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 4937 out: 4938 btrfs_free_path(path); 4939 return ret; 4940 out_err: 4941 location->objectid = 0; 4942 goto out; 4943 } 4944 4945 /* 4946 * when we hit a tree root in a directory, the btrfs part of the inode 4947 * needs to be changed to reflect the root directory of the tree root. This 4948 * is kind of like crossing a mount point. 
4949 */ 4950 static int fixup_tree_root_location(struct btrfs_root *root, 4951 struct inode *dir, 4952 struct dentry *dentry, 4953 struct btrfs_key *location, 4954 struct btrfs_root **sub_root) 4955 { 4956 struct btrfs_path *path; 4957 struct btrfs_root *new_root; 4958 struct btrfs_root_ref *ref; 4959 struct extent_buffer *leaf; 4960 int ret; 4961 int err = 0; 4962 4963 path = btrfs_alloc_path(); 4964 if (!path) { 4965 err = -ENOMEM; 4966 goto out; 4967 } 4968 4969 err = -ENOENT; 4970 ret = btrfs_find_item(root->fs_info->tree_root, path, 4971 BTRFS_I(dir)->root->root_key.objectid, 4972 location->objectid, BTRFS_ROOT_REF_KEY, NULL); 4973 if (ret) { 4974 if (ret < 0) 4975 err = ret; 4976 goto out; 4977 } 4978 4979 leaf = path->nodes[0]; 4980 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 4981 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 4982 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 4983 goto out; 4984 4985 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 4986 (unsigned long)(ref + 1), 4987 dentry->d_name.len); 4988 if (ret) 4989 goto out; 4990 4991 btrfs_release_path(path); 4992 4993 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 4994 if (IS_ERR(new_root)) { 4995 err = PTR_ERR(new_root); 4996 goto out; 4997 } 4998 4999 *sub_root = new_root; 5000 location->objectid = btrfs_root_dirid(&new_root->root_item); 5001 location->type = BTRFS_INODE_ITEM_KEY; 5002 location->offset = 0; 5003 err = 0; 5004 out: 5005 btrfs_free_path(path); 5006 return err; 5007 } 5008 5009 static void inode_tree_add(struct inode *inode) 5010 { 5011 struct btrfs_root *root = BTRFS_I(inode)->root; 5012 struct btrfs_inode *entry; 5013 struct rb_node **p; 5014 struct rb_node *parent; 5015 struct rb_node *new = &BTRFS_I(inode)->rb_node; 5016 u64 ino = btrfs_ino(inode); 5017 5018 if (inode_unhashed(inode)) 5019 return; 5020 parent = NULL; 5021 spin_lock(&root->inode_lock); 5022 p = &root->inode_tree.rb_node; 5023 while (*p) { 5024 
parent = *p; 5025 entry = rb_entry(parent, struct btrfs_inode, rb_node); 5026 5027 if (ino < btrfs_ino(&entry->vfs_inode)) 5028 p = &parent->rb_left; 5029 else if (ino > btrfs_ino(&entry->vfs_inode)) 5030 p = &parent->rb_right; 5031 else { 5032 WARN_ON(!(entry->vfs_inode.i_state & 5033 (I_WILL_FREE | I_FREEING))); 5034 rb_replace_node(parent, new, &root->inode_tree); 5035 RB_CLEAR_NODE(parent); 5036 spin_unlock(&root->inode_lock); 5037 return; 5038 } 5039 } 5040 rb_link_node(new, parent, p); 5041 rb_insert_color(new, &root->inode_tree); 5042 spin_unlock(&root->inode_lock); 5043 } 5044 5045 static void inode_tree_del(struct inode *inode) 5046 { 5047 struct btrfs_root *root = BTRFS_I(inode)->root; 5048 int empty = 0; 5049 5050 spin_lock(&root->inode_lock); 5051 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 5052 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 5053 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 5054 empty = RB_EMPTY_ROOT(&root->inode_tree); 5055 } 5056 spin_unlock(&root->inode_lock); 5057 5058 if (empty && btrfs_root_refs(&root->root_item) == 0) { 5059 synchronize_srcu(&root->fs_info->subvol_srcu); 5060 spin_lock(&root->inode_lock); 5061 empty = RB_EMPTY_ROOT(&root->inode_tree); 5062 spin_unlock(&root->inode_lock); 5063 if (empty) 5064 btrfs_add_dead_root(root); 5065 } 5066 } 5067 5068 void btrfs_invalidate_inodes(struct btrfs_root *root) 5069 { 5070 struct rb_node *node; 5071 struct rb_node *prev; 5072 struct btrfs_inode *entry; 5073 struct inode *inode; 5074 u64 objectid = 0; 5075 5076 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 5077 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 5078 5079 spin_lock(&root->inode_lock); 5080 again: 5081 node = root->inode_tree.rb_node; 5082 prev = NULL; 5083 while (node) { 5084 prev = node; 5085 entry = rb_entry(node, struct btrfs_inode, rb_node); 5086 5087 if (objectid < btrfs_ino(&entry->vfs_inode)) 5088 node = node->rb_left; 5089 else if (objectid > btrfs_ino(&entry->vfs_inode)) 5090 node = 
node->rb_right; 5091 else 5092 break; 5093 } 5094 if (!node) { 5095 while (prev) { 5096 entry = rb_entry(prev, struct btrfs_inode, rb_node); 5097 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 5098 node = prev; 5099 break; 5100 } 5101 prev = rb_next(prev); 5102 } 5103 } 5104 while (node) { 5105 entry = rb_entry(node, struct btrfs_inode, rb_node); 5106 objectid = btrfs_ino(&entry->vfs_inode) + 1; 5107 inode = igrab(&entry->vfs_inode); 5108 if (inode) { 5109 spin_unlock(&root->inode_lock); 5110 if (atomic_read(&inode->i_count) > 1) 5111 d_prune_aliases(inode); 5112 /* 5113 * btrfs_drop_inode will have it removed from 5114 * the inode cache when its usage count 5115 * hits zero. 5116 */ 5117 iput(inode); 5118 cond_resched(); 5119 spin_lock(&root->inode_lock); 5120 goto again; 5121 } 5122 5123 if (cond_resched_lock(&root->inode_lock)) 5124 goto again; 5125 5126 node = rb_next(node); 5127 } 5128 spin_unlock(&root->inode_lock); 5129 } 5130 5131 static int btrfs_init_locked_inode(struct inode *inode, void *p) 5132 { 5133 struct btrfs_iget_args *args = p; 5134 inode->i_ino = args->location->objectid; 5135 memcpy(&BTRFS_I(inode)->location, args->location, 5136 sizeof(*args->location)); 5137 BTRFS_I(inode)->root = args->root; 5138 return 0; 5139 } 5140 5141 static int btrfs_find_actor(struct inode *inode, void *opaque) 5142 { 5143 struct btrfs_iget_args *args = opaque; 5144 return args->location->objectid == BTRFS_I(inode)->location.objectid && 5145 args->root == BTRFS_I(inode)->root; 5146 } 5147 5148 static struct inode *btrfs_iget_locked(struct super_block *s, 5149 struct btrfs_key *location, 5150 struct btrfs_root *root) 5151 { 5152 struct inode *inode; 5153 struct btrfs_iget_args args; 5154 unsigned long hashval = btrfs_inode_hash(location->objectid, root); 5155 5156 args.location = location; 5157 args.root = root; 5158 5159 inode = iget5_locked(s, hashval, btrfs_find_actor, 5160 btrfs_init_locked_inode, 5161 (void *)&args); 5162 return inode; 5163 } 5164 5165 /* Get an 
inode object given its location and corresponding root.
 * Sets *new to 1 (when @new is not NULL) if the inode was read from disk.
 * Returns the inode, ERR_PTR(-ENOMEM) if none could be obtained, or
 * ERR_PTR(-ESTALE) if the freshly read inode turned out to be bad.
 */
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
			 struct btrfs_root *root, int *new)
{
	struct inode *inode;

	inode = btrfs_iget_locked(s, location, root);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	/* I_NEW: this caller has to fill the inode in. */
	if (inode->i_state & I_NEW) {
		btrfs_read_locked_inode(inode);
		if (!is_bad_inode(inode)) {
			inode_tree_add(inode);
			unlock_new_inode(inode);
			if (new)
				*new = 1;
		} else {
			unlock_new_inode(inode);
			iput(inode);
			inode = ERR_PTR(-ESTALE);
		}
	}

	return inode;
}

/*
 * Build an in-memory-only directory inode (flagged BTRFS_INODE_DUMMY) with
 * inode number BTRFS_EMPTY_SUBVOL_DIR_OBJECTID.  It remembers @key and @root
 * and uses the read-only btrfs dir inode operations together with
 * simple_dir_operations.  Returns the inode or ERR_PTR(-ENOMEM).
 */
static struct inode *new_simple_dir(struct super_block *s,
				    struct btrfs_key *key,
				    struct btrfs_root *root)
{
	struct inode *inode = new_inode(s);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	BTRFS_I(inode)->root = root;
	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);

	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
	inode->i_op = &btrfs_dir_ro_inode_operations;
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;

	return inode;
}

/*
 * btrfs_lookup_dentry - resolve a name in a directory to an inode
 * @dir:    directory to search
 * @dentry: dentry carrying the name
 *
 * Returns the inode, or an ERR_PTR: -ENAMETOOLONG for names longer than
 * BTRFS_NAME_LEN, -ENOENT when no dir entry was found, or the error from the
 * lookup helpers.  A dir entry pointing at a root item is followed into that
 * root; when the root ref cannot be matched (-ENOENT from
 * fixup_tree_root_location()) a dummy directory inode is returned instead.
 */
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *sub_root = root;
	struct btrfs_key location;
	int index;
	int ret = 0;

	if (dentry->d_name.len > BTRFS_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	ret = btrfs_inode_by_name(dir, dentry, &location);
	if (ret < 0)
		return ERR_PTR(ret);

	/* btrfs_inode_by_name() zeroes objectid when nothing was found. */
	if (location.objectid == 0)
		return ERR_PTR(-ENOENT);

	/* Plain inode in the same root. */
	if (location.type == BTRFS_INODE_ITEM_KEY) {
		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
		return inode;
	}

	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);

	index = srcu_read_lock(&root->fs_info->subvol_srcu);
	ret = fixup_tree_root_location(root, dir, dentry,
				       &location, &sub_root);
	if (ret < 0) {
		if (ret != -ENOENT)
			inode = ERR_PTR(ret);
		else
			inode = new_simple_dir(dir->i_sb, &location, sub_root);
	} else {
		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
	}
	srcu_read_unlock(&root->fs_info->subvol_srcu, index);

	/*
	 * We crossed into a different root: run orphan cleanup on it unless
	 * the superblock is mounted read-only.
	 */
	if (!IS_ERR(inode) && root != sub_root) {
		down_read(&root->fs_info->cleanup_work_sem);
		if (!(inode->i_sb->s_flags & MS_RDONLY))
			ret = btrfs_orphan_cleanup(sub_root);
		up_read(&root->fs_info->cleanup_work_sem);
		if (ret) {
			iput(inode);
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}

/*
 * Returns 1 when the dentry's inode lives in a root with zero root refs or
 * is the dummy BTRFS_EMPTY_SUBVOL_DIR_OBJECTID directory, 0 otherwise.  For
 * a dentry without an inode that is not a root dentry, the parent's inode is
 * examined instead.
 */
static int btrfs_dentry_delete(const struct dentry *dentry)
{
	struct btrfs_root *root;
	struct inode *inode = dentry->d_inode;

	if (!inode && !IS_ROOT(dentry))
		inode = dentry->d_parent->d_inode;

	if (inode) {
		root = BTRFS_I(inode)->root;
		if (btrfs_root_refs(&root->root_item) == 0)
			return 1;

		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
			return 1;
	}
	return 0;
}

/* Free the filesystem private data attached to the dentry. */
static void btrfs_dentry_release(struct dentry *dentry)
{
	kfree(dentry->d_fsdata);
}

/*
 * Lookup entry point: a -ENOENT result from btrfs_lookup_dentry() becomes a
 * NULL inode (negative dentry); any other error is returned to the caller.
 */
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
{
	struct inode *inode;

	inode = btrfs_lookup_dentry(dir, dentry);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) == -ENOENT)
			inode = NULL;
		else
			return ERR_CAST(inode);
	}

	return d_materialise_unique(dentry, inode);
}

/* Maps the type stored in a btrfs dir item (used as index) to a DT_* value. */
unsigned char btrfs_filetype_table[] = {
	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * btrfs_real_readdir - emit directory entries starting at ctx->pos
 * @file: open directory
 * @ctx:  readdir context; ctx->pos is used as the key offset to resume from
 *
 * Dir index items are walked for normal roots, dir items for the tree root.
 * For index keys the delayed insertion/deletion lists are consulted as well.
 * Returns 0 on success (including when the caller's buffer filled up) or a
 * negative errno.
 */
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_item *item;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct list_head ins_list;
	struct list_head del_list;
	int ret;
	struct extent_buffer *leaf;
	int slot;
	unsigned char d_type;
	int over = 0;
	u32 di_cur;
	u32 di_total;
	u32 di_len;
	int key_type = BTRFS_DIR_INDEX_KEY;
	char tmp_name[32];
	char *name_ptr;
	int name_len;
	int is_curr = 0;	/* ctx->pos points to the current index? */

	/* FIXME, use a real flag for deciding about the key type */
	if (root->fs_info->tree_root == root)
		key_type = BTRFS_DIR_ITEM_KEY;

	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;

	/* The delayed item lists are only set up (and put) for index keys. */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		INIT_LIST_HEAD(&ins_list);
		INIT_LIST_HEAD(&del_list);
		btrfs_get_delayed_items(inode, &ins_list, &del_list);
	}

	key.type = key_type;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(inode);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		/* Ran off the end of this leaf: move on to the next one. */
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			else if (ret > 0)
				break;
			continue;
		}

		item = btrfs_item_nr(slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* Stop at the first key of another inode or another type. */
		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != key_type)
			break;
		if (found_key.offset < ctx->pos)
			goto next;
		/* Skip entries that have a pending delayed deletion. */
		if (key_type == BTRFS_DIR_INDEX_KEY &&
		    btrfs_should_delete_dir_index(&del_list,
						  found_key.offset))
			goto next;

		ctx->pos = found_key.offset;
		is_curr = 1;

		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
		di_cur = 0;
		di_total = btrfs_item_size(leaf, item);

		/* One item may hold several dir entries back to back. */
		while (di_cur < di_total) {
			struct btrfs_key location;

			if (verify_dir_item(root, leaf, di))
				break;

			/* Short names use the stack buffer, long ones kmalloc. */
			name_len = btrfs_dir_name_len(leaf, di);
			if (name_len <= sizeof(tmp_name)) {
				name_ptr = tmp_name;
			} else {
				name_ptr = kmalloc(name_len, GFP_NOFS);
				if (!name_ptr) {
					ret = -ENOMEM;
					goto err;
				}
			}
			/* The name bytes follow the dir item header. */
			read_extent_buffer(leaf, name_ptr,
					   (unsigned long)(di + 1), name_len);

			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
			btrfs_dir_item_key_to_cpu(leaf, di, &location);


			/* is this a reference to our own snapshot? If so
			 * skip it.
			 *
			 * In contrast to old kernels, we insert the snapshot's
			 * dir item and dir index after it has been created, so
			 * we won't find a reference to our own snapshot. We
			 * still keep the following code for backward
			 * compatibility.
			 */
			if (location.type == BTRFS_ROOT_ITEM_KEY &&
			    location.objectid == root->root_key.objectid) {
				over = 0;
				goto skip;
			}
			over = !dir_emit(ctx, name_ptr, name_len,
				       location.objectid, d_type);

skip:
			if (name_ptr != tmp_name)
				kfree(name_ptr);

			/* dir_emit() refused the entry: stop, ctx->pos stays. */
			if (over)
				goto nopos;
			di_len = btrfs_dir_name_len(leaf, di) +
				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
			di_cur += di_len;
			di = (struct btrfs_dir_item *)((char *)di + di_len);
		}
next:
		path->slots[0]++;
	}

	/* Now emit the entries that only exist as delayed insertions. */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		if (is_curr)
			ctx->pos++;
		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
		if (ret)
			goto nopos;
	}

	/* Reached end of directory/root. Bump pos past the last item. */
	ctx->pos++;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (key_type == BTRFS_DIR_INDEX_KEY) {
		if (ctx->pos >= INT_MAX)
			ctx->pos = LLONG_MAX;
		else
			ctx->pos = INT_MAX;
	}
nopos:
	ret = 0;
err:
	if (key_type == BTRFS_DIR_INDEX_KEY)
		btrfs_put_delayed_items(&ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}

/*
 * Write-inode hook: nothing to do for dummy inodes.  For WB_SYNC_ALL
 * writeback the current transaction is joined and committed; the nolock join
 * variant is used for a free space inode while the filesystem is closing.
 * Returns 0 or a negative errno.
 */
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	int ret = 0;
	bool nolock = false;

	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
		return 0;

	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
		nolock = true;

	if (wbc->sync_mode == WB_SYNC_ALL) {
		if (nolock)
			trans = btrfs_join_transaction_nolock(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		ret = btrfs_commit_transaction(trans, root);
	}
	return ret;
}

/*
 * This is somewhat expensive, updating the tree every time the
 * inode changes.  But, it is most likely to find the inode in cache.
5524 * FIXME, needs more benchmarking...there are no reasons other than performance 5525 * to keep or drop this code. 5526 */ 5527 static int btrfs_dirty_inode(struct inode *inode) 5528 { 5529 struct btrfs_root *root = BTRFS_I(inode)->root; 5530 struct btrfs_trans_handle *trans; 5531 int ret; 5532 5533 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5534 return 0; 5535 5536 trans = btrfs_join_transaction(root); 5537 if (IS_ERR(trans)) 5538 return PTR_ERR(trans); 5539 5540 ret = btrfs_update_inode(trans, root, inode); 5541 if (ret && ret == -ENOSPC) { 5542 /* whoops, lets try again with the full transaction */ 5543 btrfs_end_transaction(trans, root); 5544 trans = btrfs_start_transaction(root, 1); 5545 if (IS_ERR(trans)) 5546 return PTR_ERR(trans); 5547 5548 ret = btrfs_update_inode(trans, root, inode); 5549 } 5550 btrfs_end_transaction(trans, root); 5551 if (BTRFS_I(inode)->delayed_node) 5552 btrfs_balance_delayed_items(root); 5553 5554 return ret; 5555 } 5556 5557 /* 5558 * This is a copy of file_update_time. We need this so we can return error on 5559 * ENOSPC for updating the inode in the case of file write and mmap writes. 
5560 */ 5561 static int btrfs_update_time(struct inode *inode, struct timespec *now, 5562 int flags) 5563 { 5564 struct btrfs_root *root = BTRFS_I(inode)->root; 5565 5566 if (btrfs_root_readonly(root)) 5567 return -EROFS; 5568 5569 if (flags & S_VERSION) 5570 inode_inc_iversion(inode); 5571 if (flags & S_CTIME) 5572 inode->i_ctime = *now; 5573 if (flags & S_MTIME) 5574 inode->i_mtime = *now; 5575 if (flags & S_ATIME) 5576 inode->i_atime = *now; 5577 return btrfs_dirty_inode(inode); 5578 } 5579 5580 /* 5581 * find the highest existing sequence number in a directory 5582 * and then set the in-memory index_cnt variable to reflect 5583 * free sequence numbers 5584 */ 5585 static int btrfs_set_inode_index_count(struct inode *inode) 5586 { 5587 struct btrfs_root *root = BTRFS_I(inode)->root; 5588 struct btrfs_key key, found_key; 5589 struct btrfs_path *path; 5590 struct extent_buffer *leaf; 5591 int ret; 5592 5593 key.objectid = btrfs_ino(inode); 5594 key.type = BTRFS_DIR_INDEX_KEY; 5595 key.offset = (u64)-1; 5596 5597 path = btrfs_alloc_path(); 5598 if (!path) 5599 return -ENOMEM; 5600 5601 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5602 if (ret < 0) 5603 goto out; 5604 /* FIXME: we should be able to handle this */ 5605 if (ret == 0) 5606 goto out; 5607 ret = 0; 5608 5609 /* 5610 * MAGIC NUMBER EXPLANATION: 5611 * since we search a directory based on f_pos we have to start at 2 5612 * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody 5613 * else has to start at 2 5614 */ 5615 if (path->slots[0] == 0) { 5616 BTRFS_I(inode)->index_cnt = 2; 5617 goto out; 5618 } 5619 5620 path->slots[0]--; 5621 5622 leaf = path->nodes[0]; 5623 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5624 5625 if (found_key.objectid != btrfs_ino(inode) || 5626 found_key.type != BTRFS_DIR_INDEX_KEY) { 5627 BTRFS_I(inode)->index_cnt = 2; 5628 goto out; 5629 } 5630 5631 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 5632 out: 5633 btrfs_free_path(path); 5634 return ret; 5635 } 5636 5637 /* 5638 * helper to find a free sequence number in a given directory. This current 5639 * code is very simple, later versions will do smarter things in the btree 5640 */ 5641 int btrfs_set_inode_index(struct inode *dir, u64 *index) 5642 { 5643 int ret = 0; 5644 5645 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 5646 ret = btrfs_inode_delayed_dir_index_count(dir); 5647 if (ret) { 5648 ret = btrfs_set_inode_index_count(dir); 5649 if (ret) 5650 return ret; 5651 } 5652 } 5653 5654 *index = BTRFS_I(dir)->index_cnt; 5655 BTRFS_I(dir)->index_cnt++; 5656 5657 return ret; 5658 } 5659 5660 static int btrfs_insert_inode_locked(struct inode *inode) 5661 { 5662 struct btrfs_iget_args args; 5663 args.location = &BTRFS_I(inode)->location; 5664 args.root = BTRFS_I(inode)->root; 5665 5666 return insert_inode_locked4(inode, 5667 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), 5668 btrfs_find_actor, &args); 5669 } 5670 5671 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 5672 struct btrfs_root *root, 5673 struct inode *dir, 5674 const char *name, int name_len, 5675 u64 ref_objectid, u64 objectid, 5676 umode_t mode, u64 *index) 5677 { 5678 struct inode *inode; 5679 struct btrfs_inode_item *inode_item; 5680 struct btrfs_key *location; 5681 struct btrfs_path *path; 5682 struct btrfs_inode_ref *ref; 5683 struct btrfs_key key[2]; 5684 u32 sizes[2]; 5685 int nitems = name ? 
2 : 1; 5686 unsigned long ptr; 5687 int ret; 5688 5689 path = btrfs_alloc_path(); 5690 if (!path) 5691 return ERR_PTR(-ENOMEM); 5692 5693 inode = new_inode(root->fs_info->sb); 5694 if (!inode) { 5695 btrfs_free_path(path); 5696 return ERR_PTR(-ENOMEM); 5697 } 5698 5699 /* 5700 * O_TMPFILE, set link count to 0, so that after this point, 5701 * we fill in an inode item with the correct link count. 5702 */ 5703 if (!name) 5704 set_nlink(inode, 0); 5705 5706 /* 5707 * we have to initialize this early, so we can reclaim the inode 5708 * number if we fail afterwards in this function. 5709 */ 5710 inode->i_ino = objectid; 5711 5712 if (dir && name) { 5713 trace_btrfs_inode_request(dir); 5714 5715 ret = btrfs_set_inode_index(dir, index); 5716 if (ret) { 5717 btrfs_free_path(path); 5718 iput(inode); 5719 return ERR_PTR(ret); 5720 } 5721 } else if (dir) { 5722 *index = 0; 5723 } 5724 /* 5725 * index_cnt is ignored for everything but a dir, 5726 * btrfs_get_inode_index_count has an explanation for the magic 5727 * number 5728 */ 5729 BTRFS_I(inode)->index_cnt = 2; 5730 BTRFS_I(inode)->dir_index = *index; 5731 BTRFS_I(inode)->root = root; 5732 BTRFS_I(inode)->generation = trans->transid; 5733 inode->i_generation = BTRFS_I(inode)->generation; 5734 5735 /* 5736 * We could have gotten an inode number from somebody who was fsynced 5737 * and then removed in this same transaction, so let's just set full 5738 * sync since it will be a full sync anyway and this will blow away the 5739 * old info in the log. 5740 */ 5741 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5742 5743 key[0].objectid = objectid; 5744 key[0].type = BTRFS_INODE_ITEM_KEY; 5745 key[0].offset = 0; 5746 5747 sizes[0] = sizeof(struct btrfs_inode_item); 5748 5749 if (name) { 5750 /* 5751 * Start new inodes with an inode_ref. This is slightly more 5752 * efficient for small numbers of hard links since they will 5753 * be packed into one item. 
Extended refs will kick in if we 5754 * add more hard links than can fit in the ref item. 5755 */ 5756 key[1].objectid = objectid; 5757 key[1].type = BTRFS_INODE_REF_KEY; 5758 key[1].offset = ref_objectid; 5759 5760 sizes[1] = name_len + sizeof(*ref); 5761 } 5762 5763 location = &BTRFS_I(inode)->location; 5764 location->objectid = objectid; 5765 location->offset = 0; 5766 location->type = BTRFS_INODE_ITEM_KEY; 5767 5768 ret = btrfs_insert_inode_locked(inode); 5769 if (ret < 0) 5770 goto fail; 5771 5772 path->leave_spinning = 1; 5773 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); 5774 if (ret != 0) 5775 goto fail_unlock; 5776 5777 inode_init_owner(inode, dir, mode); 5778 inode_set_bytes(inode, 0); 5779 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5780 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5781 struct btrfs_inode_item); 5782 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 5783 sizeof(*inode_item)); 5784 fill_inode_item(trans, path->nodes[0], inode_item, inode); 5785 5786 if (name) { 5787 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 5788 struct btrfs_inode_ref); 5789 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 5790 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 5791 ptr = (unsigned long)(ref + 1); 5792 write_extent_buffer(path->nodes[0], name, ptr, name_len); 5793 } 5794 5795 btrfs_mark_buffer_dirty(path->nodes[0]); 5796 btrfs_free_path(path); 5797 5798 btrfs_inherit_iflags(inode, dir); 5799 5800 if (S_ISREG(mode)) { 5801 if (btrfs_test_opt(root, NODATASUM)) 5802 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 5803 if (btrfs_test_opt(root, NODATACOW)) 5804 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 5805 BTRFS_INODE_NODATASUM; 5806 } 5807 5808 inode_tree_add(inode); 5809 5810 trace_btrfs_inode_new(inode); 5811 btrfs_set_inode_last_trans(trans, inode); 5812 5813 btrfs_update_root_times(trans, root); 5814 5815 ret = 
btrfs_inode_inherit_props(trans, inode, dir); 5816 if (ret) 5817 btrfs_err(root->fs_info, 5818 "error inheriting props for ino %llu (root %llu): %d", 5819 btrfs_ino(inode), root->root_key.objectid, ret); 5820 5821 return inode; 5822 5823 fail_unlock: 5824 unlock_new_inode(inode); 5825 fail: 5826 if (dir && name) 5827 BTRFS_I(dir)->index_cnt--; 5828 btrfs_free_path(path); 5829 iput(inode); 5830 return ERR_PTR(ret); 5831 } 5832 5833 static inline u8 btrfs_inode_type(struct inode *inode) 5834 { 5835 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 5836 } 5837 5838 /* 5839 * utility function to add 'inode' into 'parent_inode' with 5840 * a give name and a given sequence number. 5841 * if 'add_backref' is true, also insert a backref from the 5842 * inode to the parent directory. 5843 */ 5844 int btrfs_add_link(struct btrfs_trans_handle *trans, 5845 struct inode *parent_inode, struct inode *inode, 5846 const char *name, int name_len, int add_backref, u64 index) 5847 { 5848 int ret = 0; 5849 struct btrfs_key key; 5850 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 5851 u64 ino = btrfs_ino(inode); 5852 u64 parent_ino = btrfs_ino(parent_inode); 5853 5854 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5855 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 5856 } else { 5857 key.objectid = ino; 5858 key.type = BTRFS_INODE_ITEM_KEY; 5859 key.offset = 0; 5860 } 5861 5862 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5863 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 5864 key.objectid, root->root_key.objectid, 5865 parent_ino, index, name, name_len); 5866 } else if (add_backref) { 5867 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 5868 parent_ino, index); 5869 } 5870 5871 /* Nothing to clean up yet */ 5872 if (ret) 5873 return ret; 5874 5875 ret = btrfs_insert_dir_item(trans, root, name, name_len, 5876 parent_inode, &key, 5877 btrfs_inode_type(inode), index); 5878 if (ret == -EEXIST || ret == -EOVERFLOW) 5879 
goto fail_dir_item; 5880 else if (ret) { 5881 btrfs_abort_transaction(trans, root, ret); 5882 return ret; 5883 } 5884 5885 btrfs_i_size_write(parent_inode, parent_inode->i_size + 5886 name_len * 2); 5887 inode_inc_iversion(parent_inode); 5888 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 5889 ret = btrfs_update_inode(trans, root, parent_inode); 5890 if (ret) 5891 btrfs_abort_transaction(trans, root, ret); 5892 return ret; 5893 5894 fail_dir_item: 5895 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5896 u64 local_index; 5897 int err; 5898 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 5899 key.objectid, root->root_key.objectid, 5900 parent_ino, &local_index, name, name_len); 5901 5902 } else if (add_backref) { 5903 u64 local_index; 5904 int err; 5905 5906 err = btrfs_del_inode_ref(trans, root, name, name_len, 5907 ino, parent_ino, &local_index); 5908 } 5909 return ret; 5910 } 5911 5912 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 5913 struct inode *dir, struct dentry *dentry, 5914 struct inode *inode, int backref, u64 index) 5915 { 5916 int err = btrfs_add_link(trans, dir, inode, 5917 dentry->d_name.name, dentry->d_name.len, 5918 backref, index); 5919 if (err > 0) 5920 err = -EEXIST; 5921 return err; 5922 } 5923 5924 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 5925 umode_t mode, dev_t rdev) 5926 { 5927 struct btrfs_trans_handle *trans; 5928 struct btrfs_root *root = BTRFS_I(dir)->root; 5929 struct inode *inode = NULL; 5930 int err; 5931 int drop_inode = 0; 5932 u64 objectid; 5933 u64 index = 0; 5934 5935 if (!new_valid_dev(rdev)) 5936 return -EINVAL; 5937 5938 /* 5939 * 2 for inode item and ref 5940 * 2 for dir items 5941 * 1 for xattr if selinux is on 5942 */ 5943 trans = btrfs_start_transaction(root, 5); 5944 if (IS_ERR(trans)) 5945 return PTR_ERR(trans); 5946 5947 err = btrfs_find_free_ino(root, &objectid); 5948 if (err) 5949 goto out_unlock; 5950 5951 inode = btrfs_new_inode(trans, root, dir, 
dentry->d_name.name, 5952 dentry->d_name.len, btrfs_ino(dir), objectid, 5953 mode, &index); 5954 if (IS_ERR(inode)) { 5955 err = PTR_ERR(inode); 5956 goto out_unlock; 5957 } 5958 5959 /* 5960 * If the active LSM wants to access the inode during 5961 * d_instantiate it needs these. Smack checks to see 5962 * if the filesystem supports xattrs by looking at the 5963 * ops vector. 5964 */ 5965 inode->i_op = &btrfs_special_inode_operations; 5966 init_special_inode(inode, inode->i_mode, rdev); 5967 5968 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5969 if (err) 5970 goto out_unlock_inode; 5971 5972 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5973 if (err) { 5974 goto out_unlock_inode; 5975 } else { 5976 btrfs_update_inode(trans, root, inode); 5977 unlock_new_inode(inode); 5978 d_instantiate(dentry, inode); 5979 } 5980 5981 out_unlock: 5982 btrfs_end_transaction(trans, root); 5983 btrfs_balance_delayed_items(root); 5984 btrfs_btree_balance_dirty(root); 5985 if (drop_inode) { 5986 inode_dec_link_count(inode); 5987 iput(inode); 5988 } 5989 return err; 5990 5991 out_unlock_inode: 5992 drop_inode = 1; 5993 unlock_new_inode(inode); 5994 goto out_unlock; 5995 5996 } 5997 5998 static int btrfs_create(struct inode *dir, struct dentry *dentry, 5999 umode_t mode, bool excl) 6000 { 6001 struct btrfs_trans_handle *trans; 6002 struct btrfs_root *root = BTRFS_I(dir)->root; 6003 struct inode *inode = NULL; 6004 int drop_inode_on_err = 0; 6005 int err; 6006 u64 objectid; 6007 u64 index = 0; 6008 6009 /* 6010 * 2 for inode item and ref 6011 * 2 for dir items 6012 * 1 for xattr if selinux is on 6013 */ 6014 trans = btrfs_start_transaction(root, 5); 6015 if (IS_ERR(trans)) 6016 return PTR_ERR(trans); 6017 6018 err = btrfs_find_free_ino(root, &objectid); 6019 if (err) 6020 goto out_unlock; 6021 6022 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6023 dentry->d_name.len, btrfs_ino(dir), objectid, 6024 mode, &index); 6025 if 
(IS_ERR(inode)) { 6026 err = PTR_ERR(inode); 6027 goto out_unlock; 6028 } 6029 drop_inode_on_err = 1; 6030 /* 6031 * If the active LSM wants to access the inode during 6032 * d_instantiate it needs these. Smack checks to see 6033 * if the filesystem supports xattrs by looking at the 6034 * ops vector. 6035 */ 6036 inode->i_fop = &btrfs_file_operations; 6037 inode->i_op = &btrfs_file_inode_operations; 6038 inode->i_mapping->a_ops = &btrfs_aops; 6039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 6040 6041 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6042 if (err) 6043 goto out_unlock_inode; 6044 6045 err = btrfs_update_inode(trans, root, inode); 6046 if (err) 6047 goto out_unlock_inode; 6048 6049 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 6050 if (err) 6051 goto out_unlock_inode; 6052 6053 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 6054 unlock_new_inode(inode); 6055 d_instantiate(dentry, inode); 6056 6057 out_unlock: 6058 btrfs_end_transaction(trans, root); 6059 if (err && drop_inode_on_err) { 6060 inode_dec_link_count(inode); 6061 iput(inode); 6062 } 6063 btrfs_balance_delayed_items(root); 6064 btrfs_btree_balance_dirty(root); 6065 return err; 6066 6067 out_unlock_inode: 6068 unlock_new_inode(inode); 6069 goto out_unlock; 6070 6071 } 6072 6073 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6074 struct dentry *dentry) 6075 { 6076 struct btrfs_trans_handle *trans; 6077 struct btrfs_root *root = BTRFS_I(dir)->root; 6078 struct inode *inode = old_dentry->d_inode; 6079 u64 index; 6080 int err; 6081 int drop_inode = 0; 6082 6083 /* do not allow sys_link's with other subvols of the same device */ 6084 if (root->objectid != BTRFS_I(inode)->root->objectid) 6085 return -EXDEV; 6086 6087 if (inode->i_nlink >= BTRFS_LINK_MAX) 6088 return -EMLINK; 6089 6090 err = btrfs_set_inode_index(dir, &index); 6091 if (err) 6092 goto fail; 6093 6094 /* 6095 * 2 items for inode and inode ref 6096 * 2 items 
for dir items 6097 * 1 item for parent inode 6098 */ 6099 trans = btrfs_start_transaction(root, 5); 6100 if (IS_ERR(trans)) { 6101 err = PTR_ERR(trans); 6102 goto fail; 6103 } 6104 6105 /* There are several dir indexes for this inode, clear the cache. */ 6106 BTRFS_I(inode)->dir_index = 0ULL; 6107 inc_nlink(inode); 6108 inode_inc_iversion(inode); 6109 inode->i_ctime = CURRENT_TIME; 6110 ihold(inode); 6111 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6112 6113 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 6114 6115 if (err) { 6116 drop_inode = 1; 6117 } else { 6118 struct dentry *parent = dentry->d_parent; 6119 err = btrfs_update_inode(trans, root, inode); 6120 if (err) 6121 goto fail; 6122 if (inode->i_nlink == 1) { 6123 /* 6124 * If new hard link count is 1, it's a file created 6125 * with open(2) O_TMPFILE flag. 6126 */ 6127 err = btrfs_orphan_del(trans, inode); 6128 if (err) 6129 goto fail; 6130 } 6131 d_instantiate(dentry, inode); 6132 btrfs_log_new_name(trans, inode, NULL, parent); 6133 } 6134 6135 btrfs_end_transaction(trans, root); 6136 btrfs_balance_delayed_items(root); 6137 fail: 6138 if (drop_inode) { 6139 inode_dec_link_count(inode); 6140 iput(inode); 6141 } 6142 btrfs_btree_balance_dirty(root); 6143 return err; 6144 } 6145 6146 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 6147 { 6148 struct inode *inode = NULL; 6149 struct btrfs_trans_handle *trans; 6150 struct btrfs_root *root = BTRFS_I(dir)->root; 6151 int err = 0; 6152 int drop_on_err = 0; 6153 u64 objectid = 0; 6154 u64 index = 0; 6155 6156 /* 6157 * 2 items for inode and ref 6158 * 2 items for dir items 6159 * 1 for xattr if selinux is on 6160 */ 6161 trans = btrfs_start_transaction(root, 5); 6162 if (IS_ERR(trans)) 6163 return PTR_ERR(trans); 6164 6165 err = btrfs_find_free_ino(root, &objectid); 6166 if (err) 6167 goto out_fail; 6168 6169 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6170 dentry->d_name.len, 
btrfs_ino(dir), objectid, 6171 S_IFDIR | mode, &index); 6172 if (IS_ERR(inode)) { 6173 err = PTR_ERR(inode); 6174 goto out_fail; 6175 } 6176 6177 drop_on_err = 1; 6178 /* these must be set before we unlock the inode */ 6179 inode->i_op = &btrfs_dir_inode_operations; 6180 inode->i_fop = &btrfs_dir_file_operations; 6181 6182 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6183 if (err) 6184 goto out_fail_inode; 6185 6186 btrfs_i_size_write(inode, 0); 6187 err = btrfs_update_inode(trans, root, inode); 6188 if (err) 6189 goto out_fail_inode; 6190 6191 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 6192 dentry->d_name.len, 0, index); 6193 if (err) 6194 goto out_fail_inode; 6195 6196 d_instantiate(dentry, inode); 6197 /* 6198 * mkdir is special. We're unlocking after we call d_instantiate 6199 * to avoid a race with nfsd calling d_instantiate. 6200 */ 6201 unlock_new_inode(inode); 6202 drop_on_err = 0; 6203 6204 out_fail: 6205 btrfs_end_transaction(trans, root); 6206 if (drop_on_err) 6207 iput(inode); 6208 btrfs_balance_delayed_items(root); 6209 btrfs_btree_balance_dirty(root); 6210 return err; 6211 6212 out_fail_inode: 6213 unlock_new_inode(inode); 6214 goto out_fail; 6215 } 6216 6217 /* Find next extent map of a given extent map, caller needs to ensure locks */ 6218 static struct extent_map *next_extent_map(struct extent_map *em) 6219 { 6220 struct rb_node *next; 6221 6222 next = rb_next(&em->rb_node); 6223 if (!next) 6224 return NULL; 6225 return container_of(next, struct extent_map, rb_node); 6226 } 6227 6228 static struct extent_map *prev_extent_map(struct extent_map *em) 6229 { 6230 struct rb_node *prev; 6231 6232 prev = rb_prev(&em->rb_node); 6233 if (!prev) 6234 return NULL; 6235 return container_of(prev, struct extent_map, rb_node); 6236 } 6237 6238 /* helper for btfs_get_extent. 
Given an existing extent in the tree, 6239 * the existing extent is the nearest extent to map_start, 6240 * and an extent that you want to insert, deal with overlap and insert 6241 * the best fitted new extent into the tree. 6242 */ 6243 static int merge_extent_mapping(struct extent_map_tree *em_tree, 6244 struct extent_map *existing, 6245 struct extent_map *em, 6246 u64 map_start) 6247 { 6248 struct extent_map *prev; 6249 struct extent_map *next; 6250 u64 start; 6251 u64 end; 6252 u64 start_diff; 6253 6254 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 6255 6256 if (existing->start > map_start) { 6257 next = existing; 6258 prev = prev_extent_map(next); 6259 } else { 6260 prev = existing; 6261 next = next_extent_map(prev); 6262 } 6263 6264 start = prev ? extent_map_end(prev) : em->start; 6265 start = max_t(u64, start, em->start); 6266 end = next ? next->start : extent_map_end(em); 6267 end = min_t(u64, end, extent_map_end(em)); 6268 start_diff = start - em->start; 6269 em->start = start; 6270 em->len = end - start; 6271 if (em->block_start < EXTENT_MAP_LAST_BYTE && 6272 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 6273 em->block_start += start_diff; 6274 em->block_len -= start_diff; 6275 } 6276 return add_extent_mapping(em_tree, em, 0); 6277 } 6278 6279 static noinline int uncompress_inline(struct btrfs_path *path, 6280 struct inode *inode, struct page *page, 6281 size_t pg_offset, u64 extent_offset, 6282 struct btrfs_file_extent_item *item) 6283 { 6284 int ret; 6285 struct extent_buffer *leaf = path->nodes[0]; 6286 char *tmp; 6287 size_t max_size; 6288 unsigned long inline_size; 6289 unsigned long ptr; 6290 int compress_type; 6291 6292 WARN_ON(pg_offset != 0); 6293 compress_type = btrfs_file_extent_compression(leaf, item); 6294 max_size = btrfs_file_extent_ram_bytes(leaf, item); 6295 inline_size = btrfs_file_extent_inline_item_len(leaf, 6296 btrfs_item_nr(path->slots[0])); 6297 tmp = kmalloc(inline_size, GFP_NOFS); 6298 if (!tmp) 6299 return 
-ENOMEM; 6300 ptr = btrfs_file_extent_inline_start(item); 6301 6302 read_extent_buffer(leaf, tmp, ptr, inline_size); 6303 6304 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 6305 ret = btrfs_decompress(compress_type, tmp, page, 6306 extent_offset, inline_size, max_size); 6307 kfree(tmp); 6308 return ret; 6309 } 6310 6311 /* 6312 * a bit scary, this does extent mapping from logical file offset to the disk. 6313 * the ugly parts come from merging extents from the disk with the in-ram 6314 * representation. This gets more complex because of the data=ordered code, 6315 * where the in-ram extents might be locked pending data=ordered completion. 6316 * 6317 * This also copies inline extents directly into the page. 6318 */ 6319 6320 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 6321 size_t pg_offset, u64 start, u64 len, 6322 int create) 6323 { 6324 int ret; 6325 int err = 0; 6326 u64 extent_start = 0; 6327 u64 extent_end = 0; 6328 u64 objectid = btrfs_ino(inode); 6329 u32 found_type; 6330 struct btrfs_path *path = NULL; 6331 struct btrfs_root *root = BTRFS_I(inode)->root; 6332 struct btrfs_file_extent_item *item; 6333 struct extent_buffer *leaf; 6334 struct btrfs_key found_key; 6335 struct extent_map *em = NULL; 6336 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 6337 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6338 struct btrfs_trans_handle *trans = NULL; 6339 const bool new_inline = !page || create; 6340 6341 again: 6342 read_lock(&em_tree->lock); 6343 em = lookup_extent_mapping(em_tree, start, len); 6344 if (em) 6345 em->bdev = root->fs_info->fs_devices->latest_bdev; 6346 read_unlock(&em_tree->lock); 6347 6348 if (em) { 6349 if (em->start > start || em->start + em->len <= start) 6350 free_extent_map(em); 6351 else if (em->block_start == EXTENT_MAP_INLINE && page) 6352 free_extent_map(em); 6353 else 6354 goto out; 6355 } 6356 em = alloc_extent_map(); 6357 if (!em) { 6358 err = -ENOMEM; 6359 goto 
out; 6360 } 6361 em->bdev = root->fs_info->fs_devices->latest_bdev; 6362 em->start = EXTENT_MAP_HOLE; 6363 em->orig_start = EXTENT_MAP_HOLE; 6364 em->len = (u64)-1; 6365 em->block_len = (u64)-1; 6366 6367 if (!path) { 6368 path = btrfs_alloc_path(); 6369 if (!path) { 6370 err = -ENOMEM; 6371 goto out; 6372 } 6373 /* 6374 * Chances are we'll be called again, so go ahead and do 6375 * readahead 6376 */ 6377 path->reada = 1; 6378 } 6379 6380 ret = btrfs_lookup_file_extent(trans, root, path, 6381 objectid, start, trans != NULL); 6382 if (ret < 0) { 6383 err = ret; 6384 goto out; 6385 } 6386 6387 if (ret != 0) { 6388 if (path->slots[0] == 0) 6389 goto not_found; 6390 path->slots[0]--; 6391 } 6392 6393 leaf = path->nodes[0]; 6394 item = btrfs_item_ptr(leaf, path->slots[0], 6395 struct btrfs_file_extent_item); 6396 /* are we inside the extent that was found? */ 6397 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6398 found_type = found_key.type; 6399 if (found_key.objectid != objectid || 6400 found_type != BTRFS_EXTENT_DATA_KEY) { 6401 /* 6402 * If we backup past the first extent we want to move forward 6403 * and see if there is an extent in front of us, otherwise we'll 6404 * say there is a hole for our whole search range which can 6405 * cause problems. 
6406 */ 6407 extent_end = start; 6408 goto next; 6409 } 6410 6411 found_type = btrfs_file_extent_type(leaf, item); 6412 extent_start = found_key.offset; 6413 if (found_type == BTRFS_FILE_EXTENT_REG || 6414 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6415 extent_end = extent_start + 6416 btrfs_file_extent_num_bytes(leaf, item); 6417 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6418 size_t size; 6419 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6420 extent_end = ALIGN(extent_start + size, root->sectorsize); 6421 } 6422 next: 6423 if (start >= extent_end) { 6424 path->slots[0]++; 6425 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 6426 ret = btrfs_next_leaf(root, path); 6427 if (ret < 0) { 6428 err = ret; 6429 goto out; 6430 } 6431 if (ret > 0) 6432 goto not_found; 6433 leaf = path->nodes[0]; 6434 } 6435 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6436 if (found_key.objectid != objectid || 6437 found_key.type != BTRFS_EXTENT_DATA_KEY) 6438 goto not_found; 6439 if (start + len <= found_key.offset) 6440 goto not_found; 6441 if (start > found_key.offset) 6442 goto next; 6443 em->start = start; 6444 em->orig_start = start; 6445 em->len = found_key.offset - start; 6446 goto not_found_em; 6447 } 6448 6449 btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); 6450 6451 if (found_type == BTRFS_FILE_EXTENT_REG || 6452 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6453 goto insert; 6454 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6455 unsigned long ptr; 6456 char *map; 6457 size_t size; 6458 size_t extent_offset; 6459 size_t copy_size; 6460 6461 if (new_inline) 6462 goto out; 6463 6464 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); 6465 extent_offset = page_offset(page) + pg_offset - extent_start; 6466 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6467 size - extent_offset); 6468 em->start = extent_start + extent_offset; 6469 em->len = ALIGN(copy_size, root->sectorsize); 6470 
em->orig_block_len = em->len; 6471 em->orig_start = em->start; 6472 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6473 if (create == 0 && !PageUptodate(page)) { 6474 if (btrfs_file_extent_compression(leaf, item) != 6475 BTRFS_COMPRESS_NONE) { 6476 ret = uncompress_inline(path, inode, page, 6477 pg_offset, 6478 extent_offset, item); 6479 if (ret) { 6480 err = ret; 6481 goto out; 6482 } 6483 } else { 6484 map = kmap(page); 6485 read_extent_buffer(leaf, map + pg_offset, ptr, 6486 copy_size); 6487 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 6488 memset(map + pg_offset + copy_size, 0, 6489 PAGE_CACHE_SIZE - pg_offset - 6490 copy_size); 6491 } 6492 kunmap(page); 6493 } 6494 flush_dcache_page(page); 6495 } else if (create && PageUptodate(page)) { 6496 BUG(); 6497 if (!trans) { 6498 kunmap(page); 6499 free_extent_map(em); 6500 em = NULL; 6501 6502 btrfs_release_path(path); 6503 trans = btrfs_join_transaction(root); 6504 6505 if (IS_ERR(trans)) 6506 return ERR_CAST(trans); 6507 goto again; 6508 } 6509 map = kmap(page); 6510 write_extent_buffer(leaf, map + pg_offset, ptr, 6511 copy_size); 6512 kunmap(page); 6513 btrfs_mark_buffer_dirty(leaf); 6514 } 6515 set_extent_uptodate(io_tree, em->start, 6516 extent_map_end(em) - 1, NULL, GFP_NOFS); 6517 goto insert; 6518 } 6519 not_found: 6520 em->start = start; 6521 em->orig_start = start; 6522 em->len = len; 6523 not_found_em: 6524 em->block_start = EXTENT_MAP_HOLE; 6525 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 6526 insert: 6527 btrfs_release_path(path); 6528 if (em->start > start || extent_map_end(em) <= start) { 6529 btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", 6530 em->start, em->len, start, len); 6531 err = -EIO; 6532 goto out; 6533 } 6534 6535 err = 0; 6536 write_lock(&em_tree->lock); 6537 ret = add_extent_mapping(em_tree, em, 0); 6538 /* it is possible that someone inserted the extent into the tree 6539 * while we had the lock dropped. 
/*
 * Variant of btrfs_get_extent() that also reports delalloc ranges.
 *
 * The mapping returned by btrfs_get_extent() is handed back unchanged when it
 * is neither a hole nor a preallocated extent.  Otherwise the inode's io_tree
 * is searched for EXTENT_DELALLOC bits inside [start, start + len) and a newly
 * allocated extent map is returned that describes either
 *   - the part of the hole/prealloc mapping that precedes the delalloc range,
 *     or
 *   - the delalloc range itself (block_start == EXTENT_MAP_DELALLOC).
 *
 * Returns an extent map (the reference is owned by the caller), an ERR_PTR()
 * on failure, or NULL when btrfs_get_extent() returned NULL and no delalloc
 * bytes were found.
 */
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
					   size_t pg_offset, u64 start, u64 len,
					   int create)
{
	struct extent_map *em;
	struct extent_map *hole_em = NULL;
	u64 range_start = start;
	u64 end;
	u64 found;
	u64 found_end;
	int err = 0;

	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
	if (IS_ERR(em))
		return em;
	if (em) {
		/*
		 * if our em maps to
		 * - a hole or
		 * - a pre-alloc extent,
		 * there might actually be delalloc bytes behind it.
		 */
		if (em->block_start != EXTENT_MAP_HOLE &&
		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			return em;
		else
			hole_em = em;
	}

	/*
	 * Compute the inclusive end of the range; clamp to the largest u64
	 * if start + len wrapped (len == -1 or similar).
	 */
	end = start + len;
	if (end < start)
		end = (u64)-1;
	else
		end -= 1;

	em = NULL;

	/*
	 * No regular extent was found, so look for delalloc.  range_start is
	 * passed by reference and is read back below as the start of what was
	 * found.
	 */
	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
				 end, len, EXTENT_DELALLOC, 1);
	found_end = range_start + found;
	/* clamp on u64 overflow */
	if (found_end < range_start)
		found_end = (u64)-1;

	/*
	 * we didn't find anything useful, return
	 * the original results from get_extent()
	 */
	if (range_start > end || found_end <= start) {
		em = hole_em;
		/* ownership moved to em; keep the cleanup at out: from dropping it */
		hole_em = NULL;
		goto out;
	}

	/*
	 * adjust the range_start to make sure it doesn't
	 * go backwards from the start they passed in
	 */
	range_start = max(start, range_start);
	found = found_end - range_start;

	if (found > 0) {
		u64 hole_start = start;
		u64 hole_len = len;

		em = alloc_extent_map();
		if (!em) {
			err = -ENOMEM;
			goto out;
		}
		/*
		 * when btrfs_get_extent can't find anything it
		 * returns one huge hole
		 *
		 * make sure what it found really fits our range, and
		 * adjust to make sure it is based on the start from
		 * the caller
		 */
		if (hole_em) {
			u64 calc_end = extent_map_end(hole_em);

			if (calc_end <= start || (hole_em->start > end)) {
				/* the hole does not intersect the request at all */
				free_extent_map(hole_em);
				hole_em = NULL;
			} else {
				hole_start = max(hole_em->start, start);
				hole_len = calc_end - hole_start;
			}
		}
		em->bdev = NULL;
		if (hole_em && range_start > hole_start) {
			/*
			 * our hole starts before our delalloc, so we
			 * have to return just the parts of the hole
			 * that go until the delalloc starts
			 */
			em->len = min(hole_len,
				      range_start - hole_start);
			em->start = hole_start;
			em->orig_start = hole_start;
			/*
			 * block_start is copied unmodified from the original
			 * mapping (EXTENT_MAP_HOLE for a hole; for a prealloc
			 * mapping it is whatever hole_em carried).
			 */
			em->block_start = hole_em->block_start;
			/* NOTE(review): block_len is the full hole_len, not the trimmed em->len */
			em->block_len = hole_len;
			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		} else {
			/* report the delalloc range itself */
			em->start = range_start;
			em->len = found;
			em->orig_start = range_start;
			em->block_start = EXTENT_MAP_DELALLOC;
			em->block_len = found;
		}
	} else if (hole_em) {
		/* nothing left after trimming: return the original mapping */
		return hole_em;
	}
out:

	/* hole_em is NULL here whenever it was handed to the caller via em */
	free_extent_map(hole_em);
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	return em;
}
btrfs_trans_handle *trans; 6760 struct btrfs_path *path; 6761 int ret; 6762 struct extent_buffer *leaf; 6763 struct btrfs_root *root = BTRFS_I(inode)->root; 6764 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6765 struct btrfs_file_extent_item *fi; 6766 struct btrfs_key key; 6767 u64 disk_bytenr; 6768 u64 backref_offset; 6769 u64 extent_end; 6770 u64 num_bytes; 6771 int slot; 6772 int found_type; 6773 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 6774 6775 path = btrfs_alloc_path(); 6776 if (!path) 6777 return -ENOMEM; 6778 6779 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 6780 offset, 0); 6781 if (ret < 0) 6782 goto out; 6783 6784 slot = path->slots[0]; 6785 if (ret == 1) { 6786 if (slot == 0) { 6787 /* can't find the item, must cow */ 6788 ret = 0; 6789 goto out; 6790 } 6791 slot--; 6792 } 6793 ret = 0; 6794 leaf = path->nodes[0]; 6795 btrfs_item_key_to_cpu(leaf, &key, slot); 6796 if (key.objectid != btrfs_ino(inode) || 6797 key.type != BTRFS_EXTENT_DATA_KEY) { 6798 /* not our file or wrong item type, must cow */ 6799 goto out; 6800 } 6801 6802 if (key.offset > offset) { 6803 /* Wrong offset, must cow */ 6804 goto out; 6805 } 6806 6807 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 6808 found_type = btrfs_file_extent_type(leaf, fi); 6809 if (found_type != BTRFS_FILE_EXTENT_REG && 6810 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 6811 /* not a regular extent, must cow */ 6812 goto out; 6813 } 6814 6815 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 6816 goto out; 6817 6818 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6819 if (extent_end <= offset) 6820 goto out; 6821 6822 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6823 if (disk_bytenr == 0) 6824 goto out; 6825 6826 if (btrfs_file_extent_compression(leaf, fi) || 6827 btrfs_file_extent_encryption(leaf, fi) || 6828 btrfs_file_extent_other_encoding(leaf, fi)) 6829 goto out; 6830 6831 backref_offset = 
btrfs_file_extent_offset(leaf, fi); 6832 6833 if (orig_start) { 6834 *orig_start = key.offset - backref_offset; 6835 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6836 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6837 } 6838 6839 if (btrfs_extent_readonly(root, disk_bytenr)) 6840 goto out; 6841 6842 num_bytes = min(offset + *len, extent_end) - offset; 6843 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6844 u64 range_end; 6845 6846 range_end = round_up(offset + num_bytes, root->sectorsize) - 1; 6847 ret = test_range_bit(io_tree, offset, range_end, 6848 EXTENT_DELALLOC, 0, NULL); 6849 if (ret) { 6850 ret = -EAGAIN; 6851 goto out; 6852 } 6853 } 6854 6855 btrfs_release_path(path); 6856 6857 /* 6858 * look for other files referencing this extent, if we 6859 * find any we must cow 6860 */ 6861 trans = btrfs_join_transaction(root); 6862 if (IS_ERR(trans)) { 6863 ret = 0; 6864 goto out; 6865 } 6866 6867 ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 6868 key.offset - backref_offset, disk_bytenr); 6869 btrfs_end_transaction(trans, root); 6870 if (ret) { 6871 ret = 0; 6872 goto out; 6873 } 6874 6875 /* 6876 * adjust disk_bytenr and num_bytes to cover just the bytes 6877 * in this extent we are about to write. 
If there 6878 * are any csums in that range we have to cow in order 6879 * to keep the csums correct 6880 */ 6881 disk_bytenr += backref_offset; 6882 disk_bytenr += offset - key.offset; 6883 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6884 goto out; 6885 /* 6886 * all of the above have passed, it is safe to overwrite this extent 6887 * without cow 6888 */ 6889 *len = num_bytes; 6890 ret = 1; 6891 out: 6892 btrfs_free_path(path); 6893 return ret; 6894 } 6895 6896 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) 6897 { 6898 struct radix_tree_root *root = &inode->i_mapping->page_tree; 6899 int found = false; 6900 void **pagep = NULL; 6901 struct page *page = NULL; 6902 int start_idx; 6903 int end_idx; 6904 6905 start_idx = start >> PAGE_CACHE_SHIFT; 6906 6907 /* 6908 * end is the last byte in the last page. end == start is legal 6909 */ 6910 end_idx = end >> PAGE_CACHE_SHIFT; 6911 6912 rcu_read_lock(); 6913 6914 /* Most of the code in this while loop is lifted from 6915 * find_get_page. It's been modified to begin searching from a 6916 * page and return just the first page found in that range. If the 6917 * found idx is less than or equal to the end idx then we know that 6918 * a page exists. If no pages are found or if those pages are 6919 * outside of the range then we're fine (yay!) */ 6920 while (page == NULL && 6921 radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { 6922 page = radix_tree_deref_slot(pagep); 6923 if (unlikely(!page)) 6924 break; 6925 6926 if (radix_tree_exception(page)) { 6927 if (radix_tree_deref_retry(page)) { 6928 page = NULL; 6929 continue; 6930 } 6931 /* 6932 * Otherwise, shmem/tmpfs must be storing a swap entry 6933 * here as an exceptional entry: so return it without 6934 * attempting to raise page count. 6935 */ 6936 page = NULL; 6937 break; /* TODO: Is this relevant for this use case? 
*/ 6938 } 6939 6940 if (!page_cache_get_speculative(page)) { 6941 page = NULL; 6942 continue; 6943 } 6944 6945 /* 6946 * Has the page moved? 6947 * This is part of the lockless pagecache protocol. See 6948 * include/linux/pagemap.h for details. 6949 */ 6950 if (unlikely(page != *pagep)) { 6951 page_cache_release(page); 6952 page = NULL; 6953 } 6954 } 6955 6956 if (page) { 6957 if (page->index <= end_idx) 6958 found = true; 6959 page_cache_release(page); 6960 } 6961 6962 rcu_read_unlock(); 6963 return found; 6964 } 6965 6966 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 6967 struct extent_state **cached_state, int writing) 6968 { 6969 struct btrfs_ordered_extent *ordered; 6970 int ret = 0; 6971 6972 while (1) { 6973 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6974 0, cached_state); 6975 /* 6976 * We're concerned with the entire range that we're going to be 6977 * doing DIO to, so we need to make sure theres no ordered 6978 * extents in this range. 6979 */ 6980 ordered = btrfs_lookup_ordered_range(inode, lockstart, 6981 lockend - lockstart + 1); 6982 6983 /* 6984 * We need to make sure there are no buffered pages in this 6985 * range either, we could have raced between the invalidate in 6986 * generic_file_direct_write and locking the extent. The 6987 * invalidate needs to happen so that reads after a write do not 6988 * get stale data. 
6989 */ 6990 if (!ordered && 6991 (!writing || 6992 !btrfs_page_exists_in_range(inode, lockstart, lockend))) 6993 break; 6994 6995 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6996 cached_state, GFP_NOFS); 6997 6998 if (ordered) { 6999 btrfs_start_ordered_extent(inode, ordered, 1); 7000 btrfs_put_ordered_extent(ordered); 7001 } else { 7002 /* Screw you mmap */ 7003 ret = filemap_write_and_wait_range(inode->i_mapping, 7004 lockstart, 7005 lockend); 7006 if (ret) 7007 break; 7008 7009 /* 7010 * If we found a page that couldn't be invalidated just 7011 * fall back to buffered. 7012 */ 7013 ret = invalidate_inode_pages2_range(inode->i_mapping, 7014 lockstart >> PAGE_CACHE_SHIFT, 7015 lockend >> PAGE_CACHE_SHIFT); 7016 if (ret) 7017 break; 7018 } 7019 7020 cond_resched(); 7021 } 7022 7023 return ret; 7024 } 7025 7026 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 7027 u64 len, u64 orig_start, 7028 u64 block_start, u64 block_len, 7029 u64 orig_block_len, u64 ram_bytes, 7030 int type) 7031 { 7032 struct extent_map_tree *em_tree; 7033 struct extent_map *em; 7034 struct btrfs_root *root = BTRFS_I(inode)->root; 7035 int ret; 7036 7037 em_tree = &BTRFS_I(inode)->extent_tree; 7038 em = alloc_extent_map(); 7039 if (!em) 7040 return ERR_PTR(-ENOMEM); 7041 7042 em->start = start; 7043 em->orig_start = orig_start; 7044 em->mod_start = start; 7045 em->mod_len = len; 7046 em->len = len; 7047 em->block_len = block_len; 7048 em->block_start = block_start; 7049 em->bdev = root->fs_info->fs_devices->latest_bdev; 7050 em->orig_block_len = orig_block_len; 7051 em->ram_bytes = ram_bytes; 7052 em->generation = -1; 7053 set_bit(EXTENT_FLAG_PINNED, &em->flags); 7054 if (type == BTRFS_ORDERED_PREALLOC) 7055 set_bit(EXTENT_FLAG_FILLING, &em->flags); 7056 7057 do { 7058 btrfs_drop_extent_cache(inode, em->start, 7059 em->start + em->len - 1, 0); 7060 write_lock(&em_tree->lock); 7061 ret = add_extent_mapping(em_tree, em, 1); 7062 
write_unlock(&em_tree->lock); 7063 } while (ret == -EEXIST); 7064 7065 if (ret) { 7066 free_extent_map(em); 7067 return ERR_PTR(ret); 7068 } 7069 7070 return em; 7071 } 7072 7073 7074 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7075 struct buffer_head *bh_result, int create) 7076 { 7077 struct extent_map *em; 7078 struct btrfs_root *root = BTRFS_I(inode)->root; 7079 struct extent_state *cached_state = NULL; 7080 u64 start = iblock << inode->i_blkbits; 7081 u64 lockstart, lockend; 7082 u64 len = bh_result->b_size; 7083 int unlock_bits = EXTENT_LOCKED; 7084 int ret = 0; 7085 7086 if (create) 7087 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 7088 else 7089 len = min_t(u64, len, root->sectorsize); 7090 7091 lockstart = start; 7092 lockend = start + len - 1; 7093 7094 /* 7095 * If this errors out it's because we couldn't invalidate pagecache for 7096 * this range and we need to fallback to buffered. 7097 */ 7098 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 7099 return -ENOTBLK; 7100 7101 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 7102 if (IS_ERR(em)) { 7103 ret = PTR_ERR(em); 7104 goto unlock_err; 7105 } 7106 7107 /* 7108 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 7109 * io. INLINE is special, and we could probably kludge it in here, but 7110 * it's still buffered so for safety lets just fall back to the generic 7111 * buffered path. 7112 * 7113 * For COMPRESSED we _have_ to read the entire extent in so we can 7114 * decompress it, so there will be buffering required no matter what we 7115 * do, so go ahead and fallback to buffered. 7116 * 7117 * We return -ENOTBLK because thats what makes DIO go ahead and go back 7118 * to buffered IO. Don't blame me, this is the price we pay for using 7119 * the generic code. 
7120 */ 7121 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 7122 em->block_start == EXTENT_MAP_INLINE) { 7123 free_extent_map(em); 7124 ret = -ENOTBLK; 7125 goto unlock_err; 7126 } 7127 7128 /* Just a good old fashioned hole, return */ 7129 if (!create && (em->block_start == EXTENT_MAP_HOLE || 7130 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 7131 free_extent_map(em); 7132 goto unlock_err; 7133 } 7134 7135 /* 7136 * We don't allocate a new extent in the following cases 7137 * 7138 * 1) The inode is marked as NODATACOW. In this case we'll just use the 7139 * existing extent. 7140 * 2) The extent is marked as PREALLOC. We're good to go here and can 7141 * just use the extent. 7142 * 7143 */ 7144 if (!create) { 7145 len = min(len, em->len - (start - em->start)); 7146 lockstart = start + len; 7147 goto unlock; 7148 } 7149 7150 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 7151 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7152 em->block_start != EXTENT_MAP_HOLE)) { 7153 int type; 7154 int ret; 7155 u64 block_start, orig_start, orig_block_len, ram_bytes; 7156 7157 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7158 type = BTRFS_ORDERED_PREALLOC; 7159 else 7160 type = BTRFS_ORDERED_NOCOW; 7161 len = min(len, em->len - (start - em->start)); 7162 block_start = em->block_start + (start - em->start); 7163 7164 if (can_nocow_extent(inode, start, &len, &orig_start, 7165 &orig_block_len, &ram_bytes) == 1) { 7166 if (type == BTRFS_ORDERED_PREALLOC) { 7167 free_extent_map(em); 7168 em = create_pinned_em(inode, start, len, 7169 orig_start, 7170 block_start, len, 7171 orig_block_len, 7172 ram_bytes, type); 7173 if (IS_ERR(em)) { 7174 ret = PTR_ERR(em); 7175 goto unlock_err; 7176 } 7177 } 7178 7179 ret = btrfs_add_ordered_extent_dio(inode, start, 7180 block_start, len, len, type); 7181 if (ret) { 7182 free_extent_map(em); 7183 goto unlock_err; 7184 } 7185 goto unlock; 7186 } 7187 } 7188 7189 /* 7190 * this will cow the extent, reset the len in case we changed 7191 
* it above 7192 */ 7193 len = bh_result->b_size; 7194 free_extent_map(em); 7195 em = btrfs_new_extent_direct(inode, start, len); 7196 if (IS_ERR(em)) { 7197 ret = PTR_ERR(em); 7198 goto unlock_err; 7199 } 7200 len = min(len, em->len - (start - em->start)); 7201 unlock: 7202 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 7203 inode->i_blkbits; 7204 bh_result->b_size = len; 7205 bh_result->b_bdev = em->bdev; 7206 set_buffer_mapped(bh_result); 7207 if (create) { 7208 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 7209 set_buffer_new(bh_result); 7210 7211 /* 7212 * Need to update the i_size under the extent lock so buffered 7213 * readers will get the updated i_size when we unlock. 7214 */ 7215 if (start + len > i_size_read(inode)) 7216 i_size_write(inode, start + len); 7217 7218 spin_lock(&BTRFS_I(inode)->lock); 7219 BTRFS_I(inode)->outstanding_extents++; 7220 spin_unlock(&BTRFS_I(inode)->lock); 7221 7222 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7223 lockstart + len - 1, EXTENT_DELALLOC, NULL, 7224 &cached_state, GFP_NOFS); 7225 BUG_ON(ret); 7226 } 7227 7228 /* 7229 * In the case of write we need to clear and unlock the entire range, 7230 * in the case of read we need to unlock only the end area that we 7231 * aren't using if there is any left over space. 
7232 */ 7233 if (lockstart < lockend) { 7234 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7235 lockend, unlock_bits, 1, 0, 7236 &cached_state, GFP_NOFS); 7237 } else { 7238 free_extent_state(cached_state); 7239 } 7240 7241 free_extent_map(em); 7242 7243 return 0; 7244 7245 unlock_err: 7246 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7247 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 7248 return ret; 7249 } 7250 7251 static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7252 int rw, int mirror_num) 7253 { 7254 struct btrfs_root *root = BTRFS_I(inode)->root; 7255 int ret; 7256 7257 BUG_ON(rw & REQ_WRITE); 7258 7259 bio_get(bio); 7260 7261 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 7262 BTRFS_WQ_ENDIO_DIO_REPAIR); 7263 if (ret) 7264 goto err; 7265 7266 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); 7267 err: 7268 bio_put(bio); 7269 return ret; 7270 } 7271 7272 static int btrfs_check_dio_repairable(struct inode *inode, 7273 struct bio *failed_bio, 7274 struct io_failure_record *failrec, 7275 int failed_mirror) 7276 { 7277 int num_copies; 7278 7279 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 7280 failrec->logical, failrec->len); 7281 if (num_copies == 1) { 7282 /* 7283 * we only have a single copy of the data, so don't bother with 7284 * all the retry and error correction code that follows. no 7285 * matter what the error is, it is very likely to persist. 
7286 */ 7287 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", 7288 num_copies, failrec->this_mirror, failed_mirror); 7289 return 0; 7290 } 7291 7292 failrec->failed_mirror = failed_mirror; 7293 failrec->this_mirror++; 7294 if (failrec->this_mirror == failed_mirror) 7295 failrec->this_mirror++; 7296 7297 if (failrec->this_mirror > num_copies) { 7298 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", 7299 num_copies, failrec->this_mirror, failed_mirror); 7300 return 0; 7301 } 7302 7303 return 1; 7304 } 7305 7306 static int dio_read_error(struct inode *inode, struct bio *failed_bio, 7307 struct page *page, u64 start, u64 end, 7308 int failed_mirror, bio_end_io_t *repair_endio, 7309 void *repair_arg) 7310 { 7311 struct io_failure_record *failrec; 7312 struct bio *bio; 7313 int isector; 7314 int read_mode; 7315 int ret; 7316 7317 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 7318 7319 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 7320 if (ret) 7321 return ret; 7322 7323 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, 7324 failed_mirror); 7325 if (!ret) { 7326 free_io_failure(inode, failrec); 7327 return -EIO; 7328 } 7329 7330 if (failed_bio->bi_vcnt > 1) 7331 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 7332 else 7333 read_mode = READ_SYNC; 7334 7335 isector = start - btrfs_io_bio(failed_bio)->logical; 7336 isector >>= inode->i_sb->s_blocksize_bits; 7337 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 7338 0, isector, repair_endio, repair_arg); 7339 if (!bio) { 7340 free_io_failure(inode, failrec); 7341 return -EIO; 7342 } 7343 7344 btrfs_debug(BTRFS_I(inode)->root->fs_info, 7345 "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", 7346 read_mode, failrec->this_mirror, failrec->in_validation); 7347 7348 ret = submit_dio_repair_bio(inode, bio, read_mode, 7349 failrec->this_mirror); 7350 if (ret) { 7351 
free_io_failure(inode, failrec); 7352 bio_put(bio); 7353 } 7354 7355 return ret; 7356 } 7357 7358 struct btrfs_retry_complete { 7359 struct completion done; 7360 struct inode *inode; 7361 u64 start; 7362 int uptodate; 7363 }; 7364 7365 static void btrfs_retry_endio_nocsum(struct bio *bio, int err) 7366 { 7367 struct btrfs_retry_complete *done = bio->bi_private; 7368 struct bio_vec *bvec; 7369 int i; 7370 7371 if (err) 7372 goto end; 7373 7374 done->uptodate = 1; 7375 bio_for_each_segment_all(bvec, bio, i) 7376 clean_io_failure(done->inode, done->start, bvec->bv_page, 0); 7377 end: 7378 complete(&done->done); 7379 bio_put(bio); 7380 } 7381 7382 static int __btrfs_correct_data_nocsum(struct inode *inode, 7383 struct btrfs_io_bio *io_bio) 7384 { 7385 struct bio_vec *bvec; 7386 struct btrfs_retry_complete done; 7387 u64 start; 7388 int i; 7389 int ret; 7390 7391 start = io_bio->logical; 7392 done.inode = inode; 7393 7394 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7395 try_again: 7396 done.uptodate = 0; 7397 done.start = start; 7398 init_completion(&done.done); 7399 7400 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7401 start + bvec->bv_len - 1, 7402 io_bio->mirror_num, 7403 btrfs_retry_endio_nocsum, &done); 7404 if (ret) 7405 return ret; 7406 7407 wait_for_completion(&done.done); 7408 7409 if (!done.uptodate) { 7410 /* We might have another mirror, so try again */ 7411 goto try_again; 7412 } 7413 7414 start += bvec->bv_len; 7415 } 7416 7417 return 0; 7418 } 7419 7420 static void btrfs_retry_endio(struct bio *bio, int err) 7421 { 7422 struct btrfs_retry_complete *done = bio->bi_private; 7423 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7424 struct bio_vec *bvec; 7425 int uptodate; 7426 int ret; 7427 int i; 7428 7429 if (err) 7430 goto end; 7431 7432 uptodate = 1; 7433 bio_for_each_segment_all(bvec, bio, i) { 7434 ret = __readpage_endio_check(done->inode, io_bio, i, 7435 bvec->bv_page, 0, 7436 done->start, bvec->bv_len); 7437 if (!ret) 7438 
clean_io_failure(done->inode, done->start, 7439 bvec->bv_page, 0); 7440 else 7441 uptodate = 0; 7442 } 7443 7444 done->uptodate = uptodate; 7445 end: 7446 complete(&done->done); 7447 bio_put(bio); 7448 } 7449 7450 static int __btrfs_subio_endio_read(struct inode *inode, 7451 struct btrfs_io_bio *io_bio, int err) 7452 { 7453 struct bio_vec *bvec; 7454 struct btrfs_retry_complete done; 7455 u64 start; 7456 u64 offset = 0; 7457 int i; 7458 int ret; 7459 7460 err = 0; 7461 start = io_bio->logical; 7462 done.inode = inode; 7463 7464 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 7465 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 7466 0, start, bvec->bv_len); 7467 if (likely(!ret)) 7468 goto next; 7469 try_again: 7470 done.uptodate = 0; 7471 done.start = start; 7472 init_completion(&done.done); 7473 7474 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, 7475 start + bvec->bv_len - 1, 7476 io_bio->mirror_num, 7477 btrfs_retry_endio, &done); 7478 if (ret) { 7479 err = ret; 7480 goto next; 7481 } 7482 7483 wait_for_completion(&done.done); 7484 7485 if (!done.uptodate) { 7486 /* We might have another mirror, so try again */ 7487 goto try_again; 7488 } 7489 next: 7490 offset += bvec->bv_len; 7491 start += bvec->bv_len; 7492 } 7493 7494 return err; 7495 } 7496 7497 static int btrfs_subio_endio_read(struct inode *inode, 7498 struct btrfs_io_bio *io_bio, int err) 7499 { 7500 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7501 7502 if (skip_csum) { 7503 if (unlikely(err)) 7504 return __btrfs_correct_data_nocsum(inode, io_bio); 7505 else 7506 return 0; 7507 } else { 7508 return __btrfs_subio_endio_read(inode, io_bio, err); 7509 } 7510 } 7511 7512 static void btrfs_endio_direct_read(struct bio *bio, int err) 7513 { 7514 struct btrfs_dio_private *dip = bio->bi_private; 7515 struct inode *inode = dip->inode; 7516 struct bio *dio_bio; 7517 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7518 7519 if (dip->flags & 
BTRFS_DIO_ORIG_BIO_SUBMITTED) 7520 err = btrfs_subio_endio_read(inode, io_bio, err); 7521 7522 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 7523 dip->logical_offset + dip->bytes - 1); 7524 dio_bio = dip->dio_bio; 7525 7526 kfree(dip); 7527 7528 /* If we had a csum failure make sure to clear the uptodate flag */ 7529 if (err) 7530 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7531 dio_end_io(dio_bio, err); 7532 7533 if (io_bio->end_io) 7534 io_bio->end_io(io_bio, err); 7535 bio_put(bio); 7536 } 7537 7538 static void btrfs_endio_direct_write(struct bio *bio, int err) 7539 { 7540 struct btrfs_dio_private *dip = bio->bi_private; 7541 struct inode *inode = dip->inode; 7542 struct btrfs_root *root = BTRFS_I(inode)->root; 7543 struct btrfs_ordered_extent *ordered = NULL; 7544 u64 ordered_offset = dip->logical_offset; 7545 u64 ordered_bytes = dip->bytes; 7546 struct bio *dio_bio; 7547 int ret; 7548 7549 if (err) 7550 goto out_done; 7551 again: 7552 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 7553 &ordered_offset, 7554 ordered_bytes, !err); 7555 if (!ret) 7556 goto out_test; 7557 7558 btrfs_init_work(&ordered->work, btrfs_endio_write_helper, 7559 finish_ordered_fn, NULL, NULL); 7560 btrfs_queue_work(root->fs_info->endio_write_workers, 7561 &ordered->work); 7562 out_test: 7563 /* 7564 * our bio might span multiple ordered extents. 
If we haven't 7565 * completed the accounting for the whole dio, go back and try again 7566 */ 7567 if (ordered_offset < dip->logical_offset + dip->bytes) { 7568 ordered_bytes = dip->logical_offset + dip->bytes - 7569 ordered_offset; 7570 ordered = NULL; 7571 goto again; 7572 } 7573 out_done: 7574 dio_bio = dip->dio_bio; 7575 7576 kfree(dip); 7577 7578 /* If we had an error make sure to clear the uptodate flag */ 7579 if (err) 7580 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7581 dio_end_io(dio_bio, err); 7582 bio_put(bio); 7583 } 7584 7585 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 7586 struct bio *bio, int mirror_num, 7587 unsigned long bio_flags, u64 offset) 7588 { 7589 int ret; 7590 struct btrfs_root *root = BTRFS_I(inode)->root; 7591 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 7592 BUG_ON(ret); /* -ENOMEM */ 7593 return 0; 7594 } 7595 7596 static void btrfs_end_dio_bio(struct bio *bio, int err) 7597 { 7598 struct btrfs_dio_private *dip = bio->bi_private; 7599 7600 if (err) 7601 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, 7602 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", 7603 btrfs_ino(dip->inode), bio->bi_rw, 7604 (unsigned long long)bio->bi_iter.bi_sector, 7605 bio->bi_iter.bi_size, err); 7606 7607 if (dip->subio_endio) 7608 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); 7609 7610 if (err) { 7611 dip->errors = 1; 7612 7613 /* 7614 * before atomic variable goto zero, we must make sure 7615 * dip->errors is perceived to be set. 
7616 */ 7617 smp_mb__before_atomic(); 7618 } 7619 7620 /* if there are more bios still pending for this dio, just exit */ 7621 if (!atomic_dec_and_test(&dip->pending_bios)) 7622 goto out; 7623 7624 if (dip->errors) { 7625 bio_io_error(dip->orig_bio); 7626 } else { 7627 set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); 7628 bio_endio(dip->orig_bio, 0); 7629 } 7630 out: 7631 bio_put(bio); 7632 } 7633 7634 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 7635 u64 first_sector, gfp_t gfp_flags) 7636 { 7637 int nr_vecs = bio_get_nr_vecs(bdev); 7638 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7639 } 7640 7641 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, 7642 struct inode *inode, 7643 struct btrfs_dio_private *dip, 7644 struct bio *bio, 7645 u64 file_offset) 7646 { 7647 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 7648 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); 7649 int ret; 7650 7651 /* 7652 * We load all the csum data we need when we submit 7653 * the first bio to reduce the csum tree search and 7654 * contention. 
7655 */ 7656 if (dip->logical_offset == file_offset) { 7657 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, 7658 file_offset); 7659 if (ret) 7660 return ret; 7661 } 7662 7663 if (bio == dip->orig_bio) 7664 return 0; 7665 7666 file_offset -= dip->logical_offset; 7667 file_offset >>= inode->i_sb->s_blocksize_bits; 7668 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); 7669 7670 return 0; 7671 } 7672 7673 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7674 int rw, u64 file_offset, int skip_sum, 7675 int async_submit) 7676 { 7677 struct btrfs_dio_private *dip = bio->bi_private; 7678 int write = rw & REQ_WRITE; 7679 struct btrfs_root *root = BTRFS_I(inode)->root; 7680 int ret; 7681 7682 if (async_submit) 7683 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 7684 7685 bio_get(bio); 7686 7687 if (!write) { 7688 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 7689 BTRFS_WQ_ENDIO_DATA); 7690 if (ret) 7691 goto err; 7692 } 7693 7694 if (skip_sum) 7695 goto map; 7696 7697 if (write && async_submit) { 7698 ret = btrfs_wq_submit_bio(root->fs_info, 7699 inode, rw, bio, 0, 0, 7700 file_offset, 7701 __btrfs_submit_bio_start_direct_io, 7702 __btrfs_submit_bio_done); 7703 goto err; 7704 } else if (write) { 7705 /* 7706 * If we aren't doing async submit, calculate the csum of the 7707 * bio now. 
7708 */ 7709 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 7710 if (ret) 7711 goto err; 7712 } else { 7713 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, 7714 file_offset); 7715 if (ret) 7716 goto err; 7717 } 7718 map: 7719 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 7720 err: 7721 bio_put(bio); 7722 return ret; 7723 } 7724 7725 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 7726 int skip_sum) 7727 { 7728 struct inode *inode = dip->inode; 7729 struct btrfs_root *root = BTRFS_I(inode)->root; 7730 struct bio *bio; 7731 struct bio *orig_bio = dip->orig_bio; 7732 struct bio_vec *bvec = orig_bio->bi_io_vec; 7733 u64 start_sector = orig_bio->bi_iter.bi_sector; 7734 u64 file_offset = dip->logical_offset; 7735 u64 submit_len = 0; 7736 u64 map_length; 7737 int nr_pages = 0; 7738 int ret; 7739 int async_submit = 0; 7740 7741 map_length = orig_bio->bi_iter.bi_size; 7742 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 7743 &map_length, NULL, 0); 7744 if (ret) 7745 return -EIO; 7746 7747 if (map_length >= orig_bio->bi_iter.bi_size) { 7748 bio = orig_bio; 7749 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 7750 goto submit; 7751 } 7752 7753 /* async crcs make it difficult to collect full stripe writes. 
*/ 7754 if (btrfs_get_alloc_profile(root, 1) & 7755 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) 7756 async_submit = 0; 7757 else 7758 async_submit = 1; 7759 7760 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7761 if (!bio) 7762 return -ENOMEM; 7763 7764 bio->bi_private = dip; 7765 bio->bi_end_io = btrfs_end_dio_bio; 7766 btrfs_io_bio(bio)->logical = file_offset; 7767 atomic_inc(&dip->pending_bios); 7768 7769 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 7770 if (map_length < submit_len + bvec->bv_len || 7771 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 7772 bvec->bv_offset) < bvec->bv_len) { 7773 /* 7774 * inc the count before we submit the bio so 7775 * we know the end IO handler won't happen before 7776 * we inc the count. Otherwise, the dip might get freed 7777 * before we're done setting it up 7778 */ 7779 atomic_inc(&dip->pending_bios); 7780 ret = __btrfs_submit_dio_bio(bio, inode, rw, 7781 file_offset, skip_sum, 7782 async_submit); 7783 if (ret) { 7784 bio_put(bio); 7785 atomic_dec(&dip->pending_bios); 7786 goto out_err; 7787 } 7788 7789 start_sector += submit_len >> 9; 7790 file_offset += submit_len; 7791 7792 submit_len = 0; 7793 nr_pages = 0; 7794 7795 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 7796 start_sector, GFP_NOFS); 7797 if (!bio) 7798 goto out_err; 7799 bio->bi_private = dip; 7800 bio->bi_end_io = btrfs_end_dio_bio; 7801 btrfs_io_bio(bio)->logical = file_offset; 7802 7803 map_length = orig_bio->bi_iter.bi_size; 7804 ret = btrfs_map_block(root->fs_info, rw, 7805 start_sector << 9, 7806 &map_length, NULL, 0); 7807 if (ret) { 7808 bio_put(bio); 7809 goto out_err; 7810 } 7811 } else { 7812 submit_len += bvec->bv_len; 7813 nr_pages++; 7814 bvec++; 7815 } 7816 } 7817 7818 submit: 7819 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 7820 async_submit); 7821 if (!ret) 7822 return 0; 7823 7824 bio_put(bio); 7825 out_err: 7826 dip->errors = 1; 7827 /* 7828 * before atomic variable 
goto zero, we must 7829 * make sure dip->errors is perceived to be set. 7830 */ 7831 smp_mb__before_atomic(); 7832 if (atomic_dec_and_test(&dip->pending_bios)) 7833 bio_io_error(dip->orig_bio); 7834 7835 /* bio_end_io() will handle error, so we needn't return it */ 7836 return 0; 7837 } 7838 7839 static void btrfs_submit_direct(int rw, struct bio *dio_bio, 7840 struct inode *inode, loff_t file_offset) 7841 { 7842 struct btrfs_root *root = BTRFS_I(inode)->root; 7843 struct btrfs_dio_private *dip; 7844 struct bio *io_bio; 7845 struct btrfs_io_bio *btrfs_bio; 7846 int skip_sum; 7847 int write = rw & REQ_WRITE; 7848 int ret = 0; 7849 7850 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7851 7852 io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); 7853 if (!io_bio) { 7854 ret = -ENOMEM; 7855 goto free_ordered; 7856 } 7857 7858 dip = kzalloc(sizeof(*dip), GFP_NOFS); 7859 if (!dip) { 7860 ret = -ENOMEM; 7861 goto free_io_bio; 7862 } 7863 7864 dip->private = dio_bio->bi_private; 7865 dip->inode = inode; 7866 dip->logical_offset = file_offset; 7867 dip->bytes = dio_bio->bi_iter.bi_size; 7868 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 7869 io_bio->bi_private = dip; 7870 dip->orig_bio = io_bio; 7871 dip->dio_bio = dio_bio; 7872 atomic_set(&dip->pending_bios, 0); 7873 btrfs_bio = btrfs_io_bio(io_bio); 7874 btrfs_bio->logical = file_offset; 7875 7876 if (write) { 7877 io_bio->bi_end_io = btrfs_endio_direct_write; 7878 } else { 7879 io_bio->bi_end_io = btrfs_endio_direct_read; 7880 dip->subio_endio = btrfs_subio_endio_read; 7881 } 7882 7883 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 7884 if (!ret) 7885 return; 7886 7887 if (btrfs_bio->end_io) 7888 btrfs_bio->end_io(btrfs_bio, ret); 7889 free_io_bio: 7890 bio_put(io_bio); 7891 7892 free_ordered: 7893 /* 7894 * If this is a write, we need to clean up the reserved space and kill 7895 * the ordered extent. 
7896 */ 7897 if (write) { 7898 struct btrfs_ordered_extent *ordered; 7899 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 7900 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 7901 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 7902 btrfs_free_reserved_extent(root, ordered->start, 7903 ordered->disk_len, 1); 7904 btrfs_put_ordered_extent(ordered); 7905 btrfs_put_ordered_extent(ordered); 7906 } 7907 bio_endio(dio_bio, ret); 7908 } 7909 7910 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 7911 const struct iov_iter *iter, loff_t offset) 7912 { 7913 int seg; 7914 int i; 7915 unsigned blocksize_mask = root->sectorsize - 1; 7916 ssize_t retval = -EINVAL; 7917 7918 if (offset & blocksize_mask) 7919 goto out; 7920 7921 if (iov_iter_alignment(iter) & blocksize_mask) 7922 goto out; 7923 7924 /* If this is a write we don't need to check anymore */ 7925 if (rw & WRITE) 7926 return 0; 7927 /* 7928 * Check to make sure we don't have duplicate iov_base's in this 7929 * iovec, if so return EINVAL, otherwise we'll get csum errors 7930 * when reading back. 
7931 */ 7932 for (seg = 0; seg < iter->nr_segs; seg++) { 7933 for (i = seg + 1; i < iter->nr_segs; i++) { 7934 if (iter->iov[seg].iov_base == iter->iov[i].iov_base) 7935 goto out; 7936 } 7937 } 7938 retval = 0; 7939 out: 7940 return retval; 7941 } 7942 7943 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 7944 struct iov_iter *iter, loff_t offset) 7945 { 7946 struct file *file = iocb->ki_filp; 7947 struct inode *inode = file->f_mapping->host; 7948 size_t count = 0; 7949 int flags = 0; 7950 bool wakeup = true; 7951 bool relock = false; 7952 ssize_t ret; 7953 7954 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) 7955 return 0; 7956 7957 atomic_inc(&inode->i_dio_count); 7958 smp_mb__after_atomic(); 7959 7960 /* 7961 * The generic stuff only does filemap_write_and_wait_range, which 7962 * isn't enough if we've written compressed pages to this area, so 7963 * we need to flush the dirty pages again to make absolutely sure 7964 * that any outstanding dirty pages are on disk. 7965 */ 7966 count = iov_iter_count(iter); 7967 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7968 &BTRFS_I(inode)->runtime_flags)) 7969 filemap_fdatawrite_range(inode->i_mapping, offset, 7970 offset + count - 1); 7971 7972 if (rw & WRITE) { 7973 /* 7974 * If the write DIO is beyond the EOF, we need update 7975 * the isize, but it is protected by i_mutex. So we can 7976 * not unlock the i_mutex at this case. 
7977 */ 7978 if (offset + count <= inode->i_size) { 7979 mutex_unlock(&inode->i_mutex); 7980 relock = true; 7981 } 7982 ret = btrfs_delalloc_reserve_space(inode, count); 7983 if (ret) 7984 goto out; 7985 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 7986 &BTRFS_I(inode)->runtime_flags)) { 7987 inode_dio_done(inode); 7988 flags = DIO_LOCKING | DIO_SKIP_HOLES; 7989 wakeup = false; 7990 } 7991 7992 ret = __blockdev_direct_IO(rw, iocb, inode, 7993 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7994 iter, offset, btrfs_get_blocks_direct, NULL, 7995 btrfs_submit_direct, flags); 7996 if (rw & WRITE) { 7997 if (ret < 0 && ret != -EIOCBQUEUED) 7998 btrfs_delalloc_release_space(inode, count); 7999 else if (ret >= 0 && (size_t)ret < count) 8000 btrfs_delalloc_release_space(inode, 8001 count - (size_t)ret); 8002 else 8003 btrfs_delalloc_release_metadata(inode, 0); 8004 } 8005 out: 8006 if (wakeup) 8007 inode_dio_done(inode); 8008 if (relock) 8009 mutex_lock(&inode->i_mutex); 8010 8011 return ret; 8012 } 8013 8014 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 8015 8016 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8017 __u64 start, __u64 len) 8018 { 8019 int ret; 8020 8021 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); 8022 if (ret) 8023 return ret; 8024 8025 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 8026 } 8027 8028 int btrfs_readpage(struct file *file, struct page *page) 8029 { 8030 struct extent_io_tree *tree; 8031 tree = &BTRFS_I(page->mapping->host)->io_tree; 8032 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 8033 } 8034 8035 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 8036 { 8037 struct extent_io_tree *tree; 8038 8039 8040 if (current->flags & PF_MEMALLOC) { 8041 redirty_page_for_writepage(wbc, page); 8042 unlock_page(page); 8043 return 0; 8044 } 8045 tree = &BTRFS_I(page->mapping->host)->io_tree; 8046 return 
extent_write_full_page(tree, page, btrfs_get_extent, wbc); 8047 } 8048 8049 static int btrfs_writepages(struct address_space *mapping, 8050 struct writeback_control *wbc) 8051 { 8052 struct extent_io_tree *tree; 8053 8054 tree = &BTRFS_I(mapping->host)->io_tree; 8055 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 8056 } 8057 8058 static int 8059 btrfs_readpages(struct file *file, struct address_space *mapping, 8060 struct list_head *pages, unsigned nr_pages) 8061 { 8062 struct extent_io_tree *tree; 8063 tree = &BTRFS_I(mapping->host)->io_tree; 8064 return extent_readpages(tree, mapping, pages, nr_pages, 8065 btrfs_get_extent); 8066 } 8067 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8068 { 8069 struct extent_io_tree *tree; 8070 struct extent_map_tree *map; 8071 int ret; 8072 8073 tree = &BTRFS_I(page->mapping->host)->io_tree; 8074 map = &BTRFS_I(page->mapping->host)->extent_tree; 8075 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 8076 if (ret == 1) { 8077 ClearPagePrivate(page); 8078 set_page_private(page, 0); 8079 page_cache_release(page); 8080 } 8081 return ret; 8082 } 8083 8084 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 8085 { 8086 if (PageWriteback(page) || PageDirty(page)) 8087 return 0; 8088 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 8089 } 8090 8091 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 8092 unsigned int length) 8093 { 8094 struct inode *inode = page->mapping->host; 8095 struct extent_io_tree *tree; 8096 struct btrfs_ordered_extent *ordered; 8097 struct extent_state *cached_state = NULL; 8098 u64 page_start = page_offset(page); 8099 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 8100 int inode_evicting = inode->i_state & I_FREEING; 8101 8102 /* 8103 * we have the page locked, so new writeback can't start, 8104 * and the dirty bit won't be cleared while we are here. 
8105 * 8106 * Wait for IO on this page so that we can safely clear 8107 * the PagePrivate2 bit and do ordered accounting 8108 */ 8109 wait_on_page_writeback(page); 8110 8111 tree = &BTRFS_I(inode)->io_tree; 8112 if (offset) { 8113 btrfs_releasepage(page, GFP_NOFS); 8114 return; 8115 } 8116 8117 if (!inode_evicting) 8118 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 8119 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8120 if (ordered) { 8121 /* 8122 * IO on this page will never be started, so we need 8123 * to account for any ordered extents now 8124 */ 8125 if (!inode_evicting) 8126 clear_extent_bit(tree, page_start, page_end, 8127 EXTENT_DIRTY | EXTENT_DELALLOC | 8128 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8129 EXTENT_DEFRAG, 1, 0, &cached_state, 8130 GFP_NOFS); 8131 /* 8132 * whoever cleared the private bit is responsible 8133 * for the finish_ordered_io 8134 */ 8135 if (TestClearPagePrivate2(page)) { 8136 struct btrfs_ordered_inode_tree *tree; 8137 u64 new_len; 8138 8139 tree = &BTRFS_I(inode)->ordered_tree; 8140 8141 spin_lock_irq(&tree->lock); 8142 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8143 new_len = page_start - ordered->file_offset; 8144 if (new_len < ordered->truncated_len) 8145 ordered->truncated_len = new_len; 8146 spin_unlock_irq(&tree->lock); 8147 8148 if (btrfs_dec_test_ordered_pending(inode, &ordered, 8149 page_start, 8150 PAGE_CACHE_SIZE, 1)) 8151 btrfs_finish_ordered_io(ordered); 8152 } 8153 btrfs_put_ordered_extent(ordered); 8154 if (!inode_evicting) { 8155 cached_state = NULL; 8156 lock_extent_bits(tree, page_start, page_end, 0, 8157 &cached_state); 8158 } 8159 } 8160 8161 if (!inode_evicting) { 8162 clear_extent_bit(tree, page_start, page_end, 8163 EXTENT_LOCKED | EXTENT_DIRTY | 8164 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 8165 EXTENT_DEFRAG, 1, 1, 8166 &cached_state, GFP_NOFS); 8167 8168 __btrfs_releasepage(page, GFP_NOFS); 8169 } 8170 8171 ClearPageChecked(page); 8172 if (PagePrivate(page)) { 8173 
ClearPagePrivate(page); 8174 set_page_private(page, 0); 8175 page_cache_release(page); 8176 } 8177 } 8178 8179 /* 8180 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 8181 * called from a page fault handler when a page is first dirtied. Hence we must 8182 * be careful to check for EOF conditions here. We set the page up correctly 8183 * for a written page which means we get ENOSPC checking when writing into 8184 * holes and correct delalloc and unwritten extent mapping on filesystems that 8185 * support these features. 8186 * 8187 * We are not allowed to take the i_mutex here so we have to play games to 8188 * protect against truncate races as the page could now be beyond EOF. Because 8189 * vmtruncate() writes the inode size before removing pages, once we have the 8190 * page lock we can determine safely if the page is beyond EOF. If it is not 8191 * beyond EOF, then the page is guaranteed safe against truncation until we 8192 * unlock the page. 8193 */ 8194 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 8195 { 8196 struct page *page = vmf->page; 8197 struct inode *inode = file_inode(vma->vm_file); 8198 struct btrfs_root *root = BTRFS_I(inode)->root; 8199 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 8200 struct btrfs_ordered_extent *ordered; 8201 struct extent_state *cached_state = NULL; 8202 char *kaddr; 8203 unsigned long zero_start; 8204 loff_t size; 8205 int ret; 8206 int reserved = 0; 8207 u64 page_start; 8208 u64 page_end; 8209 8210 sb_start_pagefault(inode->i_sb); 8211 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 8212 if (!ret) { 8213 ret = file_update_time(vma->vm_file); 8214 reserved = 1; 8215 } 8216 if (ret) { 8217 if (ret == -ENOMEM) 8218 ret = VM_FAULT_OOM; 8219 else /* -ENOSPC, -EIO, etc */ 8220 ret = VM_FAULT_SIGBUS; 8221 if (reserved) 8222 goto out; 8223 goto out_noreserve; 8224 } 8225 8226 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 8227 again: 8228 
lock_page(page); 8229 size = i_size_read(inode); 8230 page_start = page_offset(page); 8231 page_end = page_start + PAGE_CACHE_SIZE - 1; 8232 8233 if ((page->mapping != inode->i_mapping) || 8234 (page_start >= size)) { 8235 /* page got truncated out from underneath us */ 8236 goto out_unlock; 8237 } 8238 wait_on_page_writeback(page); 8239 8240 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 8241 set_page_extent_mapped(page); 8242 8243 /* 8244 * we can't set the delalloc bits if there are pending ordered 8245 * extents. Drop our locks and wait for them to finish 8246 */ 8247 ordered = btrfs_lookup_ordered_extent(inode, page_start); 8248 if (ordered) { 8249 unlock_extent_cached(io_tree, page_start, page_end, 8250 &cached_state, GFP_NOFS); 8251 unlock_page(page); 8252 btrfs_start_ordered_extent(inode, ordered, 1); 8253 btrfs_put_ordered_extent(ordered); 8254 goto again; 8255 } 8256 8257 /* 8258 * XXX - page_mkwrite gets called every time the page is dirtied, even 8259 * if it was already dirty, so for space accounting reasons we need to 8260 * clear any delalloc bits for the range we are fixing to save. There 8261 * is probably a better way to do this, but for now keep consistent with 8262 * prepare_pages in the normal write path. 
/*
 * Drop the extent items of @inode that lie beyond inode->i_size.
 *
 * Waits for ordered extents from the sector-aligned i_size onwards, then
 * repeatedly calls btrfs_truncate_inode_items(), restarting the transaction
 * each time it reports -ENOSPC.  Afterwards the orphan item is deleted (when
 * the inode still has links) and the inode item is updated.
 *
 * Returns 0 on success or the first error recorded along the way.
 */
static int btrfs_truncate(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv;
	int ret = 0;
	int err = 0;
	struct btrfs_trans_handle *trans;
	u64 mask = root->sectorsize - 1;
	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
				       (u64)-1);
	if (ret)
		return ret;

	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
	 * 3 things going on here
	 *
	 * 1) We need to reserve space for our orphan item and the space to
	 * delete our orphan item.  Lord knows we don't want to have a dangling
	 * orphan item because we didn't reserve space to remove it.
	 *
	 * 2) We need to reserve space to update our inode.
	 *
	 * 3) We need to have something to cache all the space that is going to
	 * be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to all be separate.  The fact is we can use a lot
	 * of space doing the truncate, and we have no earthly idea how much
	 * space we will use, so we need the truncate reservation to be
	 * separate so it doesn't end up using space reserved for updating the
	 * inode or removing the orphan item.  We also need to be able to stop
	 * the transaction and start a new one, which means we need to be able
	 * to update the inode several times, and we have no way of knowing how
	 * many times that will be, so we can't just reserve 1 item for the
	 * entirety of the operation, so that has to be done separately as
	 * well.  Then there is the orphan item, which does indeed need to be
	 * held on to for the whole operation, and we need nobody to touch this
	 * reserved space except the orphan code.
	 *
	 * So that leaves us with
	 *
	 * 1) root->orphan_block_rsv - for the orphan deletion.
	 * 2) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;
	rsv->failfast = 1;

	/*
	 * 1 for the truncate slack space
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
				      min_size);
	BUG_ON(ret);

	/*
	 * So if we truncate and then write and fsync we normally would just
	 * write the extents that changed, which is a problem if we need to
	 * first truncate that entire inode.  So set this flag so we write out
	 * all of the extents in the inode to the sync log so we're completely
	 * safe.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	trans->block_rsv = rsv;

	while (1) {
		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		/* Anything but -ENOSPC (including success) ends the loop. */
		if (ret != -ENOSPC) {
			err = ret;
			break;
		}

		/* Use the transaction reserve for the inode update. */
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}

		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = err = PTR_ERR(trans);
			/* No handle left; the blocks below test for this. */
			trans = NULL;
			break;
		}

		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
					      rsv, min_size);
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;
	}

	/* ret == 0 implies trans is still a valid handle here. */
	if (ret == 0 && inode->i_nlink > 0) {
		trans->block_rsv = root->orphan_block_rsv;
		ret = btrfs_orphan_del(trans, inode);
		if (ret)
			err = ret;
	}

	if (trans) {
		trans->block_rsv = &root->fs_info->trans_block_rsv;
		ret = btrfs_update_inode(trans, root, inode);
		if (ret && !err)
			err = ret;

		ret = btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(root);
	}

out:
	btrfs_free_block_rsv(root, rsv);

	/* Keep the first error; a later one only counts if none was seen. */
	if (ret && !err)
		err = ret;

	return err;
}
ei->outstanding_extents = 0; 8517 ei->reserved_extents = 0; 8518 8519 ei->runtime_flags = 0; 8520 ei->force_compress = BTRFS_COMPRESS_NONE; 8521 8522 ei->delayed_node = NULL; 8523 8524 inode = &ei->vfs_inode; 8525 extent_map_tree_init(&ei->extent_tree); 8526 extent_io_tree_init(&ei->io_tree, &inode->i_data); 8527 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 8528 ei->io_tree.track_uptodate = 1; 8529 ei->io_failure_tree.track_uptodate = 1; 8530 atomic_set(&ei->sync_writers, 0); 8531 mutex_init(&ei->log_mutex); 8532 mutex_init(&ei->delalloc_mutex); 8533 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8534 INIT_LIST_HEAD(&ei->delalloc_inodes); 8535 RB_CLEAR_NODE(&ei->rb_node); 8536 8537 return inode; 8538 } 8539 8540 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8541 void btrfs_test_destroy_inode(struct inode *inode) 8542 { 8543 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 8544 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8545 } 8546 #endif 8547 8548 static void btrfs_i_callback(struct rcu_head *head) 8549 { 8550 struct inode *inode = container_of(head, struct inode, i_rcu); 8551 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 8552 } 8553 8554 void btrfs_destroy_inode(struct inode *inode) 8555 { 8556 struct btrfs_ordered_extent *ordered; 8557 struct btrfs_root *root = BTRFS_I(inode)->root; 8558 8559 WARN_ON(!hlist_empty(&inode->i_dentry)); 8560 WARN_ON(inode->i_data.nrpages); 8561 WARN_ON(BTRFS_I(inode)->outstanding_extents); 8562 WARN_ON(BTRFS_I(inode)->reserved_extents); 8563 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8564 WARN_ON(BTRFS_I(inode)->csum_bytes); 8565 WARN_ON(BTRFS_I(inode)->defrag_bytes); 8566 8567 /* 8568 * This can happen where we create an inode, but somebody else also 8569 * created the same inode and we need to destroy the one we already 8570 * created. 
8571 */ 8572 if (!root) 8573 goto free; 8574 8575 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8576 &BTRFS_I(inode)->runtime_flags)) { 8577 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8578 btrfs_ino(inode)); 8579 atomic_dec(&root->orphan_inodes); 8580 } 8581 8582 while (1) { 8583 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 8584 if (!ordered) 8585 break; 8586 else { 8587 btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", 8588 ordered->file_offset, ordered->len); 8589 btrfs_remove_ordered_extent(inode, ordered); 8590 btrfs_put_ordered_extent(ordered); 8591 btrfs_put_ordered_extent(ordered); 8592 } 8593 } 8594 inode_tree_del(inode); 8595 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 8596 free: 8597 call_rcu(&inode->i_rcu, btrfs_i_callback); 8598 } 8599 8600 int btrfs_drop_inode(struct inode *inode) 8601 { 8602 struct btrfs_root *root = BTRFS_I(inode)->root; 8603 8604 if (root == NULL) 8605 return 1; 8606 8607 /* the snap/subvol tree is on deleting */ 8608 if (btrfs_root_refs(&root->root_item) == 0) 8609 return 1; 8610 else 8611 return generic_drop_inode(inode); 8612 } 8613 8614 static void init_once(void *foo) 8615 { 8616 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 8617 8618 inode_init_once(&ei->vfs_inode); 8619 } 8620 8621 void btrfs_destroy_cachep(void) 8622 { 8623 /* 8624 * Make sure all delayed rcu free inodes are flushed before we 8625 * destroy cache. 
8626 */ 8627 rcu_barrier(); 8628 if (btrfs_inode_cachep) 8629 kmem_cache_destroy(btrfs_inode_cachep); 8630 if (btrfs_trans_handle_cachep) 8631 kmem_cache_destroy(btrfs_trans_handle_cachep); 8632 if (btrfs_transaction_cachep) 8633 kmem_cache_destroy(btrfs_transaction_cachep); 8634 if (btrfs_path_cachep) 8635 kmem_cache_destroy(btrfs_path_cachep); 8636 if (btrfs_free_space_cachep) 8637 kmem_cache_destroy(btrfs_free_space_cachep); 8638 if (btrfs_delalloc_work_cachep) 8639 kmem_cache_destroy(btrfs_delalloc_work_cachep); 8640 } 8641 8642 int btrfs_init_cachep(void) 8643 { 8644 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 8645 sizeof(struct btrfs_inode), 0, 8646 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 8647 if (!btrfs_inode_cachep) 8648 goto fail; 8649 8650 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 8651 sizeof(struct btrfs_trans_handle), 0, 8652 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 8653 if (!btrfs_trans_handle_cachep) 8654 goto fail; 8655 8656 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", 8657 sizeof(struct btrfs_transaction), 0, 8658 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 8659 if (!btrfs_transaction_cachep) 8660 goto fail; 8661 8662 btrfs_path_cachep = kmem_cache_create("btrfs_path", 8663 sizeof(struct btrfs_path), 0, 8664 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 8665 if (!btrfs_path_cachep) 8666 goto fail; 8667 8668 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 8669 sizeof(struct btrfs_free_space), 0, 8670 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 8671 if (!btrfs_free_space_cachep) 8672 goto fail; 8673 8674 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", 8675 sizeof(struct btrfs_delalloc_work), 0, 8676 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 8677 NULL); 8678 if (!btrfs_delalloc_work_cachep) 8679 goto fail; 8680 8681 return 0; 8682 fail: 8683 btrfs_destroy_cachep(); 8684 return -ENOMEM; 8685 } 8686 8687 static int btrfs_getattr(struct 
vfsmount *mnt, 8688 struct dentry *dentry, struct kstat *stat) 8689 { 8690 u64 delalloc_bytes; 8691 struct inode *inode = dentry->d_inode; 8692 u32 blocksize = inode->i_sb->s_blocksize; 8693 8694 generic_fillattr(inode, stat); 8695 stat->dev = BTRFS_I(inode)->root->anon_dev; 8696 stat->blksize = PAGE_CACHE_SIZE; 8697 8698 spin_lock(&BTRFS_I(inode)->lock); 8699 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 8700 spin_unlock(&BTRFS_I(inode)->lock); 8701 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 8702 ALIGN(delalloc_bytes, blocksize)) >> 9; 8703 return 0; 8704 } 8705 8706 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 8707 struct inode *new_dir, struct dentry *new_dentry) 8708 { 8709 struct btrfs_trans_handle *trans; 8710 struct btrfs_root *root = BTRFS_I(old_dir)->root; 8711 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 8712 struct inode *new_inode = new_dentry->d_inode; 8713 struct inode *old_inode = old_dentry->d_inode; 8714 struct timespec ctime = CURRENT_TIME; 8715 u64 index = 0; 8716 u64 root_objectid; 8717 int ret; 8718 u64 old_ino = btrfs_ino(old_inode); 8719 8720 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 8721 return -EPERM; 8722 8723 /* we only allow rename subvolume link between subvolumes */ 8724 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 8725 return -EXDEV; 8726 8727 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 8728 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) 8729 return -ENOTEMPTY; 8730 8731 if (S_ISDIR(old_inode->i_mode) && new_inode && 8732 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 8733 return -ENOTEMPTY; 8734 8735 8736 /* check for collisions, even if the name isn't there */ 8737 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, 8738 new_dentry->d_name.name, 8739 new_dentry->d_name.len); 8740 8741 if (ret) { 8742 if (ret == -EEXIST) { 8743 /* we shouldn't get 8744 * eexist without a new_inode */ 8745 if (WARN_ON(!new_inode)) { 8746 
return ret; 8747 } 8748 } else { 8749 /* maybe -EOVERFLOW */ 8750 return ret; 8751 } 8752 } 8753 ret = 0; 8754 8755 /* 8756 * we're using rename to replace one file with another. Start IO on it 8757 * now so we don't add too much work to the end of the transaction 8758 */ 8759 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 8760 filemap_flush(old_inode->i_mapping); 8761 8762 /* close the racy window with snapshot create/destroy ioctl */ 8763 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8764 down_read(&root->fs_info->subvol_sem); 8765 /* 8766 * We want to reserve the absolute worst case amount of items. So if 8767 * both inodes are subvols and we need to unlink them then that would 8768 * require 4 item modifications, but if they are both normal inodes it 8769 * would require 5 item modifications, so we'll assume their normal 8770 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 8771 * should cover the worst case number of items we'll modify. 8772 */ 8773 trans = btrfs_start_transaction(root, 11); 8774 if (IS_ERR(trans)) { 8775 ret = PTR_ERR(trans); 8776 goto out_notrans; 8777 } 8778 8779 if (dest != root) 8780 btrfs_record_root_in_trans(trans, dest); 8781 8782 ret = btrfs_set_inode_index(new_dir, &index); 8783 if (ret) 8784 goto out_fail; 8785 8786 BTRFS_I(old_inode)->dir_index = 0ULL; 8787 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8788 /* force full log commit if subvolume involved. */ 8789 btrfs_set_log_full_commit(root->fs_info, trans); 8790 } else { 8791 ret = btrfs_insert_inode_ref(trans, dest, 8792 new_dentry->d_name.name, 8793 new_dentry->d_name.len, 8794 old_ino, 8795 btrfs_ino(new_dir), index); 8796 if (ret) 8797 goto out_fail; 8798 /* 8799 * this is an ugly little race, but the rename is required 8800 * to make sure that if we crash, the inode is either at the 8801 * old name or the new one. 
pinning the log transaction lets 8802 * us make sure we don't allow a log commit to come in after 8803 * we unlink the name but before we add the new name back in. 8804 */ 8805 btrfs_pin_log_trans(root); 8806 } 8807 8808 inode_inc_iversion(old_dir); 8809 inode_inc_iversion(new_dir); 8810 inode_inc_iversion(old_inode); 8811 old_dir->i_ctime = old_dir->i_mtime = ctime; 8812 new_dir->i_ctime = new_dir->i_mtime = ctime; 8813 old_inode->i_ctime = ctime; 8814 8815 if (old_dentry->d_parent != new_dentry->d_parent) 8816 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 8817 8818 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8819 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 8820 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 8821 old_dentry->d_name.name, 8822 old_dentry->d_name.len); 8823 } else { 8824 ret = __btrfs_unlink_inode(trans, root, old_dir, 8825 old_dentry->d_inode, 8826 old_dentry->d_name.name, 8827 old_dentry->d_name.len); 8828 if (!ret) 8829 ret = btrfs_update_inode(trans, root, old_inode); 8830 } 8831 if (ret) { 8832 btrfs_abort_transaction(trans, root, ret); 8833 goto out_fail; 8834 } 8835 8836 if (new_inode) { 8837 inode_inc_iversion(new_inode); 8838 new_inode->i_ctime = CURRENT_TIME; 8839 if (unlikely(btrfs_ino(new_inode) == 8840 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 8841 root_objectid = BTRFS_I(new_inode)->location.objectid; 8842 ret = btrfs_unlink_subvol(trans, dest, new_dir, 8843 root_objectid, 8844 new_dentry->d_name.name, 8845 new_dentry->d_name.len); 8846 BUG_ON(new_inode->i_nlink == 0); 8847 } else { 8848 ret = btrfs_unlink_inode(trans, dest, new_dir, 8849 new_dentry->d_inode, 8850 new_dentry->d_name.name, 8851 new_dentry->d_name.len); 8852 } 8853 if (!ret && new_inode->i_nlink == 0) 8854 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 8855 if (ret) { 8856 btrfs_abort_transaction(trans, root, ret); 8857 goto out_fail; 8858 } 8859 } 8860 8861 ret = btrfs_add_link(trans, new_dir, old_inode, 8862 
new_dentry->d_name.name, 8863 new_dentry->d_name.len, 0, index); 8864 if (ret) { 8865 btrfs_abort_transaction(trans, root, ret); 8866 goto out_fail; 8867 } 8868 8869 if (old_inode->i_nlink == 1) 8870 BTRFS_I(old_inode)->dir_index = index; 8871 8872 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 8873 struct dentry *parent = new_dentry->d_parent; 8874 btrfs_log_new_name(trans, old_inode, old_dir, parent); 8875 btrfs_end_log_trans(root); 8876 } 8877 out_fail: 8878 btrfs_end_transaction(trans, root); 8879 out_notrans: 8880 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8881 up_read(&root->fs_info->subvol_sem); 8882 8883 return ret; 8884 } 8885 8886 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, 8887 struct inode *new_dir, struct dentry *new_dentry, 8888 unsigned int flags) 8889 { 8890 if (flags & ~RENAME_NOREPLACE) 8891 return -EINVAL; 8892 8893 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); 8894 } 8895 8896 static void btrfs_run_delalloc_work(struct btrfs_work *work) 8897 { 8898 struct btrfs_delalloc_work *delalloc_work; 8899 struct inode *inode; 8900 8901 delalloc_work = container_of(work, struct btrfs_delalloc_work, 8902 work); 8903 inode = delalloc_work->inode; 8904 if (delalloc_work->wait) { 8905 btrfs_wait_ordered_range(inode, 0, (u64)-1); 8906 } else { 8907 filemap_flush(inode->i_mapping); 8908 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 8909 &BTRFS_I(inode)->runtime_flags)) 8910 filemap_flush(inode->i_mapping); 8911 } 8912 8913 if (delalloc_work->delay_iput) 8914 btrfs_add_delayed_iput(inode); 8915 else 8916 iput(inode); 8917 complete(&delalloc_work->completion); 8918 } 8919 8920 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 8921 int wait, int delay_iput) 8922 { 8923 struct btrfs_delalloc_work *work; 8924 8925 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); 8926 if (!work) 8927 return NULL; 8928 8929 init_completion(&work->completion); 8930 INIT_LIST_HEAD(&work->list); 8931 
work->inode = inode; 8932 work->wait = wait; 8933 work->delay_iput = delay_iput; 8934 WARN_ON_ONCE(!inode); 8935 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, 8936 btrfs_run_delalloc_work, NULL, NULL); 8937 8938 return work; 8939 } 8940 8941 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) 8942 { 8943 wait_for_completion(&work->completion); 8944 kmem_cache_free(btrfs_delalloc_work_cachep, work); 8945 } 8946 8947 /* 8948 * some fairly slow code that needs optimization. This walks the list 8949 * of all the inodes with pending delalloc and forces them to disk. 8950 */ 8951 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, 8952 int nr) 8953 { 8954 struct btrfs_inode *binode; 8955 struct inode *inode; 8956 struct btrfs_delalloc_work *work, *next; 8957 struct list_head works; 8958 struct list_head splice; 8959 int ret = 0; 8960 8961 INIT_LIST_HEAD(&works); 8962 INIT_LIST_HEAD(&splice); 8963 8964 mutex_lock(&root->delalloc_mutex); 8965 spin_lock(&root->delalloc_lock); 8966 list_splice_init(&root->delalloc_inodes, &splice); 8967 while (!list_empty(&splice)) { 8968 binode = list_entry(splice.next, struct btrfs_inode, 8969 delalloc_inodes); 8970 8971 list_move_tail(&binode->delalloc_inodes, 8972 &root->delalloc_inodes); 8973 inode = igrab(&binode->vfs_inode); 8974 if (!inode) { 8975 cond_resched_lock(&root->delalloc_lock); 8976 continue; 8977 } 8978 spin_unlock(&root->delalloc_lock); 8979 8980 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8981 if (!work) { 8982 if (delay_iput) 8983 btrfs_add_delayed_iput(inode); 8984 else 8985 iput(inode); 8986 ret = -ENOMEM; 8987 goto out; 8988 } 8989 list_add_tail(&work->list, &works); 8990 btrfs_queue_work(root->fs_info->flush_workers, 8991 &work->work); 8992 ret++; 8993 if (nr != -1 && ret >= nr) 8994 goto out; 8995 cond_resched(); 8996 spin_lock(&root->delalloc_lock); 8997 } 8998 spin_unlock(&root->delalloc_lock); 8999 9000 out: 9001 list_for_each_entry_safe(work, 
next, &works, list) { 9002 list_del_init(&work->list); 9003 btrfs_wait_and_free_delalloc_work(work); 9004 } 9005 9006 if (!list_empty_careful(&splice)) { 9007 spin_lock(&root->delalloc_lock); 9008 list_splice_tail(&splice, &root->delalloc_inodes); 9009 spin_unlock(&root->delalloc_lock); 9010 } 9011 mutex_unlock(&root->delalloc_mutex); 9012 return ret; 9013 } 9014 9015 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 9016 { 9017 int ret; 9018 9019 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 9020 return -EROFS; 9021 9022 ret = __start_delalloc_inodes(root, delay_iput, -1); 9023 if (ret > 0) 9024 ret = 0; 9025 /* 9026 * the filemap_flush will queue IO into the worker threads, but 9027 * we have to make sure the IO is actually started and that 9028 * ordered extents get created before we return 9029 */ 9030 atomic_inc(&root->fs_info->async_submit_draining); 9031 while (atomic_read(&root->fs_info->nr_async_submits) || 9032 atomic_read(&root->fs_info->async_delalloc_pages)) { 9033 wait_event(root->fs_info->async_submit_wait, 9034 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 9035 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 9036 } 9037 atomic_dec(&root->fs_info->async_submit_draining); 9038 return ret; 9039 } 9040 9041 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, 9042 int nr) 9043 { 9044 struct btrfs_root *root; 9045 struct list_head splice; 9046 int ret; 9047 9048 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 9049 return -EROFS; 9050 9051 INIT_LIST_HEAD(&splice); 9052 9053 mutex_lock(&fs_info->delalloc_root_mutex); 9054 spin_lock(&fs_info->delalloc_root_lock); 9055 list_splice_init(&fs_info->delalloc_roots, &splice); 9056 while (!list_empty(&splice) && nr) { 9057 root = list_first_entry(&splice, struct btrfs_root, 9058 delalloc_root); 9059 root = btrfs_grab_fs_root(root); 9060 BUG_ON(!root); 9061 list_move_tail(&root->delalloc_root, 9062 &fs_info->delalloc_roots); 
9063 spin_unlock(&fs_info->delalloc_root_lock); 9064 9065 ret = __start_delalloc_inodes(root, delay_iput, nr); 9066 btrfs_put_fs_root(root); 9067 if (ret < 0) 9068 goto out; 9069 9070 if (nr != -1) { 9071 nr -= ret; 9072 WARN_ON(nr < 0); 9073 } 9074 spin_lock(&fs_info->delalloc_root_lock); 9075 } 9076 spin_unlock(&fs_info->delalloc_root_lock); 9077 9078 ret = 0; 9079 atomic_inc(&fs_info->async_submit_draining); 9080 while (atomic_read(&fs_info->nr_async_submits) || 9081 atomic_read(&fs_info->async_delalloc_pages)) { 9082 wait_event(fs_info->async_submit_wait, 9083 (atomic_read(&fs_info->nr_async_submits) == 0 && 9084 atomic_read(&fs_info->async_delalloc_pages) == 0)); 9085 } 9086 atomic_dec(&fs_info->async_submit_draining); 9087 out: 9088 if (!list_empty_careful(&splice)) { 9089 spin_lock(&fs_info->delalloc_root_lock); 9090 list_splice_tail(&splice, &fs_info->delalloc_roots); 9091 spin_unlock(&fs_info->delalloc_root_lock); 9092 } 9093 mutex_unlock(&fs_info->delalloc_root_mutex); 9094 return ret; 9095 } 9096 9097 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 9098 const char *symname) 9099 { 9100 struct btrfs_trans_handle *trans; 9101 struct btrfs_root *root = BTRFS_I(dir)->root; 9102 struct btrfs_path *path; 9103 struct btrfs_key key; 9104 struct inode *inode = NULL; 9105 int err; 9106 int drop_inode = 0; 9107 u64 objectid; 9108 u64 index = 0; 9109 int name_len; 9110 int datasize; 9111 unsigned long ptr; 9112 struct btrfs_file_extent_item *ei; 9113 struct extent_buffer *leaf; 9114 9115 name_len = strlen(symname); 9116 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 9117 return -ENAMETOOLONG; 9118 9119 /* 9120 * 2 items for inode item and ref 9121 * 2 items for dir items 9122 * 1 item for xattr if selinux is on 9123 */ 9124 trans = btrfs_start_transaction(root, 5); 9125 if (IS_ERR(trans)) 9126 return PTR_ERR(trans); 9127 9128 err = btrfs_find_free_ino(root, &objectid); 9129 if (err) 9130 goto out_unlock; 9131 9132 inode = btrfs_new_inode(trans, 
root, dir, dentry->d_name.name, 9133 dentry->d_name.len, btrfs_ino(dir), objectid, 9134 S_IFLNK|S_IRWXUGO, &index); 9135 if (IS_ERR(inode)) { 9136 err = PTR_ERR(inode); 9137 goto out_unlock; 9138 } 9139 9140 /* 9141 * If the active LSM wants to access the inode during 9142 * d_instantiate it needs these. Smack checks to see 9143 * if the filesystem supports xattrs by looking at the 9144 * ops vector. 9145 */ 9146 inode->i_fop = &btrfs_file_operations; 9147 inode->i_op = &btrfs_file_inode_operations; 9148 inode->i_mapping->a_ops = &btrfs_aops; 9149 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9150 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9151 9152 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9153 if (err) 9154 goto out_unlock_inode; 9155 9156 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 9157 if (err) 9158 goto out_unlock_inode; 9159 9160 path = btrfs_alloc_path(); 9161 if (!path) { 9162 err = -ENOMEM; 9163 goto out_unlock_inode; 9164 } 9165 key.objectid = btrfs_ino(inode); 9166 key.offset = 0; 9167 key.type = BTRFS_EXTENT_DATA_KEY; 9168 datasize = btrfs_file_extent_calc_inline_size(name_len); 9169 err = btrfs_insert_empty_item(trans, root, path, &key, 9170 datasize); 9171 if (err) { 9172 btrfs_free_path(path); 9173 goto out_unlock_inode; 9174 } 9175 leaf = path->nodes[0]; 9176 ei = btrfs_item_ptr(leaf, path->slots[0], 9177 struct btrfs_file_extent_item); 9178 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 9179 btrfs_set_file_extent_type(leaf, ei, 9180 BTRFS_FILE_EXTENT_INLINE); 9181 btrfs_set_file_extent_encryption(leaf, ei, 0); 9182 btrfs_set_file_extent_compression(leaf, ei, 0); 9183 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 9184 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 9185 9186 ptr = btrfs_file_extent_inline_start(ei); 9187 write_extent_buffer(leaf, symname, ptr, name_len); 9188 btrfs_mark_buffer_dirty(leaf); 9189 btrfs_free_path(path); 9190 9191 inode->i_op = 
&btrfs_symlink_inode_operations; 9192 inode->i_mapping->a_ops = &btrfs_symlink_aops; 9193 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9194 inode_set_bytes(inode, name_len); 9195 btrfs_i_size_write(inode, name_len); 9196 err = btrfs_update_inode(trans, root, inode); 9197 if (err) { 9198 drop_inode = 1; 9199 goto out_unlock_inode; 9200 } 9201 9202 unlock_new_inode(inode); 9203 d_instantiate(dentry, inode); 9204 9205 out_unlock: 9206 btrfs_end_transaction(trans, root); 9207 if (drop_inode) { 9208 inode_dec_link_count(inode); 9209 iput(inode); 9210 } 9211 btrfs_btree_balance_dirty(root); 9212 return err; 9213 9214 out_unlock_inode: 9215 drop_inode = 1; 9216 unlock_new_inode(inode); 9217 goto out_unlock; 9218 } 9219 9220 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9221 u64 start, u64 num_bytes, u64 min_size, 9222 loff_t actual_len, u64 *alloc_hint, 9223 struct btrfs_trans_handle *trans) 9224 { 9225 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 9226 struct extent_map *em; 9227 struct btrfs_root *root = BTRFS_I(inode)->root; 9228 struct btrfs_key ins; 9229 u64 cur_offset = start; 9230 u64 i_size; 9231 u64 cur_bytes; 9232 int ret = 0; 9233 bool own_trans = true; 9234 9235 if (trans) 9236 own_trans = false; 9237 while (num_bytes > 0) { 9238 if (own_trans) { 9239 trans = btrfs_start_transaction(root, 3); 9240 if (IS_ERR(trans)) { 9241 ret = PTR_ERR(trans); 9242 break; 9243 } 9244 } 9245 9246 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); 9247 cur_bytes = max(cur_bytes, min_size); 9248 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 9249 *alloc_hint, &ins, 1, 0); 9250 if (ret) { 9251 if (own_trans) 9252 btrfs_end_transaction(trans, root); 9253 break; 9254 } 9255 9256 ret = insert_reserved_file_extent(trans, inode, 9257 cur_offset, ins.objectid, 9258 ins.offset, ins.offset, 9259 ins.offset, 0, 0, 0, 9260 BTRFS_FILE_EXTENT_PREALLOC); 9261 if (ret) { 9262 btrfs_free_reserved_extent(root, ins.objectid, 9263 
ins.offset, 0); 9264 btrfs_abort_transaction(trans, root, ret); 9265 if (own_trans) 9266 btrfs_end_transaction(trans, root); 9267 break; 9268 } 9269 btrfs_drop_extent_cache(inode, cur_offset, 9270 cur_offset + ins.offset -1, 0); 9271 9272 em = alloc_extent_map(); 9273 if (!em) { 9274 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 9275 &BTRFS_I(inode)->runtime_flags); 9276 goto next; 9277 } 9278 9279 em->start = cur_offset; 9280 em->orig_start = cur_offset; 9281 em->len = ins.offset; 9282 em->block_start = ins.objectid; 9283 em->block_len = ins.offset; 9284 em->orig_block_len = ins.offset; 9285 em->ram_bytes = ins.offset; 9286 em->bdev = root->fs_info->fs_devices->latest_bdev; 9287 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 9288 em->generation = trans->transid; 9289 9290 while (1) { 9291 write_lock(&em_tree->lock); 9292 ret = add_extent_mapping(em_tree, em, 1); 9293 write_unlock(&em_tree->lock); 9294 if (ret != -EEXIST) 9295 break; 9296 btrfs_drop_extent_cache(inode, cur_offset, 9297 cur_offset + ins.offset - 1, 9298 0); 9299 } 9300 free_extent_map(em); 9301 next: 9302 num_bytes -= ins.offset; 9303 cur_offset += ins.offset; 9304 *alloc_hint = ins.objectid + ins.offset; 9305 9306 inode_inc_iversion(inode); 9307 inode->i_ctime = CURRENT_TIME; 9308 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 9309 if (!(mode & FALLOC_FL_KEEP_SIZE) && 9310 (actual_len > inode->i_size) && 9311 (cur_offset > inode->i_size)) { 9312 if (cur_offset > actual_len) 9313 i_size = actual_len; 9314 else 9315 i_size = cur_offset; 9316 i_size_write(inode, i_size); 9317 btrfs_ordered_update_i_size(inode, i_size, NULL); 9318 } 9319 9320 ret = btrfs_update_inode(trans, root, inode); 9321 9322 if (ret) { 9323 btrfs_abort_transaction(trans, root, ret); 9324 if (own_trans) 9325 btrfs_end_transaction(trans, root); 9326 break; 9327 } 9328 9329 if (own_trans) 9330 btrfs_end_transaction(trans, root); 9331 } 9332 return ret; 9333 } 9334 9335 int btrfs_prealloc_file_range(struct inode *inode, int mode, 9336 u64 start, 
u64 num_bytes, u64 min_size, 9337 loff_t actual_len, u64 *alloc_hint) 9338 { 9339 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9340 min_size, actual_len, alloc_hint, 9341 NULL); 9342 } 9343 9344 int btrfs_prealloc_file_range_trans(struct inode *inode, 9345 struct btrfs_trans_handle *trans, int mode, 9346 u64 start, u64 num_bytes, u64 min_size, 9347 loff_t actual_len, u64 *alloc_hint) 9348 { 9349 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 9350 min_size, actual_len, alloc_hint, trans); 9351 } 9352 9353 static int btrfs_set_page_dirty(struct page *page) 9354 { 9355 return __set_page_dirty_nobuffers(page); 9356 } 9357 9358 static int btrfs_permission(struct inode *inode, int mask) 9359 { 9360 struct btrfs_root *root = BTRFS_I(inode)->root; 9361 umode_t mode = inode->i_mode; 9362 9363 if (mask & MAY_WRITE && 9364 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 9365 if (btrfs_root_readonly(root)) 9366 return -EROFS; 9367 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 9368 return -EACCES; 9369 } 9370 return generic_permission(inode, mask); 9371 } 9372 9373 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 9374 { 9375 struct btrfs_trans_handle *trans; 9376 struct btrfs_root *root = BTRFS_I(dir)->root; 9377 struct inode *inode = NULL; 9378 u64 objectid; 9379 u64 index; 9380 int ret = 0; 9381 9382 /* 9383 * 5 units required for adding orphan entry 9384 */ 9385 trans = btrfs_start_transaction(root, 5); 9386 if (IS_ERR(trans)) 9387 return PTR_ERR(trans); 9388 9389 ret = btrfs_find_free_ino(root, &objectid); 9390 if (ret) 9391 goto out; 9392 9393 inode = btrfs_new_inode(trans, root, dir, NULL, 0, 9394 btrfs_ino(dir), objectid, mode, &index); 9395 if (IS_ERR(inode)) { 9396 ret = PTR_ERR(inode); 9397 inode = NULL; 9398 goto out; 9399 } 9400 9401 inode->i_fop = &btrfs_file_operations; 9402 inode->i_op = &btrfs_file_inode_operations; 9403 9404 inode->i_mapping->a_ops = &btrfs_aops; 9405 
inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9406 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9407 9408 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9409 if (ret) 9410 goto out_inode; 9411 9412 ret = btrfs_update_inode(trans, root, inode); 9413 if (ret) 9414 goto out_inode; 9415 ret = btrfs_orphan_add(trans, inode); 9416 if (ret) 9417 goto out_inode; 9418 9419 /* 9420 * We set number of links to 0 in btrfs_new_inode(), and here we set 9421 * it to 1 because d_tmpfile() will issue a warning if the count is 0, 9422 * through: 9423 * 9424 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 9425 */ 9426 set_nlink(inode, 1); 9427 unlock_new_inode(inode); 9428 d_tmpfile(dentry, inode); 9429 mark_inode_dirty(inode); 9430 9431 out: 9432 btrfs_end_transaction(trans, root); 9433 if (ret) 9434 iput(inode); 9435 btrfs_balance_delayed_items(root); 9436 btrfs_btree_balance_dirty(root); 9437 return ret; 9438 9439 out_inode: 9440 unlock_new_inode(inode); 9441 goto out; 9442 9443 } 9444 9445 static const struct inode_operations btrfs_dir_inode_operations = { 9446 .getattr = btrfs_getattr, 9447 .lookup = btrfs_lookup, 9448 .create = btrfs_create, 9449 .unlink = btrfs_unlink, 9450 .link = btrfs_link, 9451 .mkdir = btrfs_mkdir, 9452 .rmdir = btrfs_rmdir, 9453 .rename2 = btrfs_rename2, 9454 .symlink = btrfs_symlink, 9455 .setattr = btrfs_setattr, 9456 .mknod = btrfs_mknod, 9457 .setxattr = btrfs_setxattr, 9458 .getxattr = btrfs_getxattr, 9459 .listxattr = btrfs_listxattr, 9460 .removexattr = btrfs_removexattr, 9461 .permission = btrfs_permission, 9462 .get_acl = btrfs_get_acl, 9463 .set_acl = btrfs_set_acl, 9464 .update_time = btrfs_update_time, 9465 .tmpfile = btrfs_tmpfile, 9466 }; 9467 static const struct inode_operations btrfs_dir_ro_inode_operations = { 9468 .lookup = btrfs_lookup, 9469 .permission = btrfs_permission, 9470 .get_acl = btrfs_get_acl, 9471 .set_acl = btrfs_set_acl, 9472 .update_time = btrfs_update_time, 9473 }; 9474 9475 static 
const struct file_operations btrfs_dir_file_operations = { 9476 .llseek = generic_file_llseek, 9477 .read = generic_read_dir, 9478 .iterate = btrfs_real_readdir, 9479 .unlocked_ioctl = btrfs_ioctl, 9480 #ifdef CONFIG_COMPAT 9481 .compat_ioctl = btrfs_ioctl, 9482 #endif 9483 .release = btrfs_release_file, 9484 .fsync = btrfs_sync_file, 9485 }; 9486 9487 static struct extent_io_ops btrfs_extent_io_ops = { 9488 .fill_delalloc = run_delalloc_range, 9489 .submit_bio_hook = btrfs_submit_bio_hook, 9490 .merge_bio_hook = btrfs_merge_bio_hook, 9491 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 9492 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 9493 .writepage_start_hook = btrfs_writepage_start_hook, 9494 .set_bit_hook = btrfs_set_bit_hook, 9495 .clear_bit_hook = btrfs_clear_bit_hook, 9496 .merge_extent_hook = btrfs_merge_extent_hook, 9497 .split_extent_hook = btrfs_split_extent_hook, 9498 }; 9499 9500 /* 9501 * btrfs doesn't support the bmap operation because swapfiles 9502 * use bmap to make a mapping of extents in the file. They assume 9503 * these extents won't change over the life of the file and they 9504 * use the bmap result to do IO directly to the drive. 9505 * 9506 * the btrfs bmap call would return logical addresses that aren't 9507 * suitable for IO and they also will change frequently as COW 9508 * operations happen. So, swapfile + btrfs == corruption. 9509 * 9510 * For now we're avoiding this by dropping bmap. 
9511 */ 9512 static const struct address_space_operations btrfs_aops = { 9513 .readpage = btrfs_readpage, 9514 .writepage = btrfs_writepage, 9515 .writepages = btrfs_writepages, 9516 .readpages = btrfs_readpages, 9517 .direct_IO = btrfs_direct_IO, 9518 .invalidatepage = btrfs_invalidatepage, 9519 .releasepage = btrfs_releasepage, 9520 .set_page_dirty = btrfs_set_page_dirty, 9521 .error_remove_page = generic_error_remove_page, 9522 }; 9523 9524 static const struct address_space_operations btrfs_symlink_aops = { 9525 .readpage = btrfs_readpage, 9526 .writepage = btrfs_writepage, 9527 .invalidatepage = btrfs_invalidatepage, 9528 .releasepage = btrfs_releasepage, 9529 }; 9530 9531 static const struct inode_operations btrfs_file_inode_operations = { 9532 .getattr = btrfs_getattr, 9533 .setattr = btrfs_setattr, 9534 .setxattr = btrfs_setxattr, 9535 .getxattr = btrfs_getxattr, 9536 .listxattr = btrfs_listxattr, 9537 .removexattr = btrfs_removexattr, 9538 .permission = btrfs_permission, 9539 .fiemap = btrfs_fiemap, 9540 .get_acl = btrfs_get_acl, 9541 .set_acl = btrfs_set_acl, 9542 .update_time = btrfs_update_time, 9543 }; 9544 static const struct inode_operations btrfs_special_inode_operations = { 9545 .getattr = btrfs_getattr, 9546 .setattr = btrfs_setattr, 9547 .permission = btrfs_permission, 9548 .setxattr = btrfs_setxattr, 9549 .getxattr = btrfs_getxattr, 9550 .listxattr = btrfs_listxattr, 9551 .removexattr = btrfs_removexattr, 9552 .get_acl = btrfs_get_acl, 9553 .set_acl = btrfs_set_acl, 9554 .update_time = btrfs_update_time, 9555 }; 9556 static const struct inode_operations btrfs_symlink_inode_operations = { 9557 .readlink = generic_readlink, 9558 .follow_link = page_follow_link_light, 9559 .put_link = page_put_link, 9560 .getattr = btrfs_getattr, 9561 .setattr = btrfs_setattr, 9562 .permission = btrfs_permission, 9563 .setxattr = btrfs_setxattr, 9564 .getxattr = btrfs_getxattr, 9565 .listxattr = btrfs_listxattr, 9566 .removexattr = btrfs_removexattr, 9567 
.update_time = btrfs_update_time, 9568 }; 9569 9570 const struct dentry_operations btrfs_dentry_operations = { 9571 .d_delete = btrfs_dentry_delete, 9572 .d_release = btrfs_dentry_release, 9573 }; 9574