1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/bio.h> 21 #include <linux/buffer_head.h> 22 #include <linux/file.h> 23 #include <linux/fs.h> 24 #include <linux/pagemap.h> 25 #include <linux/highmem.h> 26 #include <linux/time.h> 27 #include <linux/init.h> 28 #include <linux/string.h> 29 #include <linux/backing-dev.h> 30 #include <linux/mpage.h> 31 #include <linux/swap.h> 32 #include <linux/writeback.h> 33 #include <linux/statfs.h> 34 #include <linux/compat.h> 35 #include <linux/aio.h> 36 #include <linux/bit_spinlock.h> 37 #include <linux/xattr.h> 38 #include <linux/posix_acl.h> 39 #include <linux/falloc.h> 40 #include <linux/slab.h> 41 #include <linux/ratelimit.h> 42 #include <linux/mount.h> 43 #include <linux/btrfs.h> 44 #include <linux/blkdev.h> 45 #include <linux/posix_acl_xattr.h> 46 #include "compat.h" 47 #include "ctree.h" 48 #include "disk-io.h" 49 #include "transaction.h" 50 #include "btrfs_inode.h" 51 #include "print-tree.h" 52 #include "ordered-data.h" 53 #include "xattr.h" 54 #include "tree-log.h" 55 #include "volumes.h" 56 #include "compression.h" 57 #include "locking.h" 58 #include "free-space-cache.h" 59 #include "inode-map.h" 60 #include "backref.h" 61 #include "hash.h" 62 63 struct btrfs_iget_args { 64 u64 ino; 65 struct 
btrfs_root *root; 66 }; 67 68 static const struct inode_operations btrfs_dir_inode_operations; 69 static const struct inode_operations btrfs_symlink_inode_operations; 70 static const struct inode_operations btrfs_dir_ro_inode_operations; 71 static const struct inode_operations btrfs_special_inode_operations; 72 static const struct inode_operations btrfs_file_inode_operations; 73 static const struct address_space_operations btrfs_aops; 74 static const struct address_space_operations btrfs_symlink_aops; 75 static const struct file_operations btrfs_dir_file_operations; 76 static struct extent_io_ops btrfs_extent_io_ops; 77 78 static struct kmem_cache *btrfs_inode_cachep; 79 static struct kmem_cache *btrfs_delalloc_work_cachep; 80 struct kmem_cache *btrfs_trans_handle_cachep; 81 struct kmem_cache *btrfs_transaction_cachep; 82 struct kmem_cache *btrfs_path_cachep; 83 struct kmem_cache *btrfs_free_space_cachep; 84 85 #define S_SHIFT 12 86 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 87 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, 88 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, 89 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, 90 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, 91 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, 92 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, 93 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 94 }; 95 96 static int btrfs_setsize(struct inode *inode, struct iattr *attr); 97 static int btrfs_truncate(struct inode *inode); 98 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 99 static noinline int cow_file_range(struct inode *inode, 100 struct page *locked_page, 101 u64 start, u64 end, int *page_started, 102 unsigned long *nr_written, int unlock); 103 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 104 u64 len, u64 orig_start, 105 u64 block_start, u64 block_len, 106 u64 orig_block_len, u64 ram_bytes, 107 int type); 108 109 static int btrfs_dirty_inode(struct inode *inode); 110 111 static int 
btrfs_init_inode_security(struct btrfs_trans_handle *trans, 112 struct inode *inode, struct inode *dir, 113 const struct qstr *qstr) 114 { 115 int err; 116 117 err = btrfs_init_acl(trans, inode, dir); 118 if (!err) 119 err = btrfs_xattr_security_init(trans, inode, dir, qstr); 120 return err; 121 } 122 123 /* 124 * this does all the hard work for inserting an inline extent into 125 * the btree. The caller should have done a btrfs_drop_extents so that 126 * no overlapping inline items exist in the btree 127 */ 128 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 129 struct btrfs_root *root, struct inode *inode, 130 u64 start, size_t size, size_t compressed_size, 131 int compress_type, 132 struct page **compressed_pages) 133 { 134 struct btrfs_key key; 135 struct btrfs_path *path; 136 struct extent_buffer *leaf; 137 struct page *page = NULL; 138 char *kaddr; 139 unsigned long ptr; 140 struct btrfs_file_extent_item *ei; 141 int err = 0; 142 int ret; 143 size_t cur_size = size; 144 size_t datasize; 145 unsigned long offset; 146 147 if (compressed_size && compressed_pages) 148 cur_size = compressed_size; 149 150 path = btrfs_alloc_path(); 151 if (!path) 152 return -ENOMEM; 153 154 path->leave_spinning = 1; 155 156 key.objectid = btrfs_ino(inode); 157 key.offset = start; 158 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 159 datasize = btrfs_file_extent_calc_inline_size(cur_size); 160 161 inode_add_bytes(inode, size); 162 ret = btrfs_insert_empty_item(trans, root, path, &key, 163 datasize); 164 if (ret) { 165 err = ret; 166 goto fail; 167 } 168 leaf = path->nodes[0]; 169 ei = btrfs_item_ptr(leaf, path->slots[0], 170 struct btrfs_file_extent_item); 171 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 172 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); 173 btrfs_set_file_extent_encryption(leaf, ei, 0); 174 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 175 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 176 ptr = 
btrfs_file_extent_inline_start(ei); 177 178 if (compress_type != BTRFS_COMPRESS_NONE) { 179 struct page *cpage; 180 int i = 0; 181 while (compressed_size > 0) { 182 cpage = compressed_pages[i]; 183 cur_size = min_t(unsigned long, compressed_size, 184 PAGE_CACHE_SIZE); 185 186 kaddr = kmap_atomic(cpage); 187 write_extent_buffer(leaf, kaddr, ptr, cur_size); 188 kunmap_atomic(kaddr); 189 190 i++; 191 ptr += cur_size; 192 compressed_size -= cur_size; 193 } 194 btrfs_set_file_extent_compression(leaf, ei, 195 compress_type); 196 } else { 197 page = find_get_page(inode->i_mapping, 198 start >> PAGE_CACHE_SHIFT); 199 btrfs_set_file_extent_compression(leaf, ei, 0); 200 kaddr = kmap_atomic(page); 201 offset = start & (PAGE_CACHE_SIZE - 1); 202 write_extent_buffer(leaf, kaddr + offset, ptr, size); 203 kunmap_atomic(kaddr); 204 page_cache_release(page); 205 } 206 btrfs_mark_buffer_dirty(leaf); 207 btrfs_free_path(path); 208 209 /* 210 * we're an inline extent, so nobody can 211 * extend the file past i_size without locking 212 * a page we already have locked. 213 * 214 * We must do any isize and inode updates 215 * before we unlock the pages. Otherwise we 216 * could end up racing with unlink. 217 */ 218 BTRFS_I(inode)->disk_i_size = inode->i_size; 219 ret = btrfs_update_inode(trans, root, inode); 220 221 return ret; 222 fail: 223 btrfs_free_path(path); 224 return err; 225 } 226 227 228 /* 229 * conditionally insert an inline extent into the file. This 230 * does the checks required to make sure the data is small enough 231 * to fit as an inline extent. 
232 */ 233 static noinline int cow_file_range_inline(struct btrfs_root *root, 234 struct inode *inode, u64 start, 235 u64 end, size_t compressed_size, 236 int compress_type, 237 struct page **compressed_pages) 238 { 239 struct btrfs_trans_handle *trans; 240 u64 isize = i_size_read(inode); 241 u64 actual_end = min(end + 1, isize); 242 u64 inline_len = actual_end - start; 243 u64 aligned_end = ALIGN(end, root->sectorsize); 244 u64 data_len = inline_len; 245 int ret; 246 247 if (compressed_size) 248 data_len = compressed_size; 249 250 if (start > 0 || 251 actual_end >= PAGE_CACHE_SIZE || 252 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 253 (!compressed_size && 254 (actual_end & (root->sectorsize - 1)) == 0) || 255 end + 1 < isize || 256 data_len > root->fs_info->max_inline) { 257 return 1; 258 } 259 260 trans = btrfs_join_transaction(root); 261 if (IS_ERR(trans)) 262 return PTR_ERR(trans); 263 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 264 265 ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); 266 if (ret) { 267 btrfs_abort_transaction(trans, root, ret); 268 goto out; 269 } 270 271 if (isize > actual_end) 272 inline_len = min_t(u64, isize, actual_end); 273 ret = insert_inline_extent(trans, root, inode, start, 274 inline_len, compressed_size, 275 compress_type, compressed_pages); 276 if (ret && ret != -ENOSPC) { 277 btrfs_abort_transaction(trans, root, ret); 278 goto out; 279 } else if (ret == -ENOSPC) { 280 ret = 1; 281 goto out; 282 } 283 284 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 285 btrfs_delalloc_release_metadata(inode, end + 1 - start); 286 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 287 out: 288 btrfs_end_transaction(trans, root); 289 return ret; 290 } 291 292 struct async_extent { 293 u64 start; 294 u64 ram_size; 295 u64 compressed_size; 296 struct page **pages; 297 unsigned long nr_pages; 298 int compress_type; 299 struct list_head list; 300 }; 301 302 struct async_cow { 303 struct 
inode *inode; 304 struct btrfs_root *root; 305 struct page *locked_page; 306 u64 start; 307 u64 end; 308 struct list_head extents; 309 struct btrfs_work work; 310 }; 311 312 static noinline int add_async_extent(struct async_cow *cow, 313 u64 start, u64 ram_size, 314 u64 compressed_size, 315 struct page **pages, 316 unsigned long nr_pages, 317 int compress_type) 318 { 319 struct async_extent *async_extent; 320 321 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 322 BUG_ON(!async_extent); /* -ENOMEM */ 323 async_extent->start = start; 324 async_extent->ram_size = ram_size; 325 async_extent->compressed_size = compressed_size; 326 async_extent->pages = pages; 327 async_extent->nr_pages = nr_pages; 328 async_extent->compress_type = compress_type; 329 list_add_tail(&async_extent->list, &cow->extents); 330 return 0; 331 } 332 333 /* 334 * we create compressed extents in two phases. The first 335 * phase compresses a range of pages that have already been 336 * locked (both pages and state bits are locked). 337 * 338 * This is done inside an ordered work queue, and the compression 339 * is spread across many cpus. The actual IO submission is step 340 * two, and the ordered work queue takes care of making sure that 341 * happens in the same order things were put onto the queue by 342 * writepages and friends. 343 * 344 * If this code finds it can't get good compression, it puts an 345 * entry onto the work queue to write the uncompressed bytes. This 346 * makes sure that both compressed inodes and uncompressed inodes 347 * are written in the same order that the flusher thread sent them 348 * down. 
349 */ 350 static noinline int compress_file_range(struct inode *inode, 351 struct page *locked_page, 352 u64 start, u64 end, 353 struct async_cow *async_cow, 354 int *num_added) 355 { 356 struct btrfs_root *root = BTRFS_I(inode)->root; 357 u64 num_bytes; 358 u64 blocksize = root->sectorsize; 359 u64 actual_end; 360 u64 isize = i_size_read(inode); 361 int ret = 0; 362 struct page **pages = NULL; 363 unsigned long nr_pages; 364 unsigned long nr_pages_ret = 0; 365 unsigned long total_compressed = 0; 366 unsigned long total_in = 0; 367 unsigned long max_compressed = 128 * 1024; 368 unsigned long max_uncompressed = 128 * 1024; 369 int i; 370 int will_compress; 371 int compress_type = root->fs_info->compress_type; 372 int redirty = 0; 373 374 /* if this is a small write inside eof, kick off a defrag */ 375 if ((end - start + 1) < 16 * 1024 && 376 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 377 btrfs_add_inode_defrag(NULL, inode); 378 379 actual_end = min_t(u64, isize, end + 1); 380 again: 381 will_compress = 0; 382 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 383 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 384 385 /* 386 * we don't want to send crud past the end of i_size through 387 * compression, that's just a waste of CPU time. So, if the 388 * end of the file is before the start of our current 389 * requested range of bytes, we bail out to the uncompressed 390 * cleanup code that can deal with all of this. 391 * 392 * It isn't really the fastest way to fix things, but this is a 393 * very uncommon corner. 394 */ 395 if (actual_end <= start) 396 goto cleanup_and_bail_uncompressed; 397 398 total_compressed = actual_end - start; 399 400 /* we want to make sure that amount of ram required to uncompress 401 * an extent is reasonable, so we limit the total size in ram 402 * of a compressed extent to 128k. 
This is a crucial number 403 * because it also controls how easily we can spread reads across 404 * cpus for decompression. 405 * 406 * We also want to make sure the amount of IO required to do 407 * a random read is reasonably small, so we limit the size of 408 * a compressed extent to 128k. 409 */ 410 total_compressed = min(total_compressed, max_uncompressed); 411 num_bytes = ALIGN(end - start + 1, blocksize); 412 num_bytes = max(blocksize, num_bytes); 413 total_in = 0; 414 ret = 0; 415 416 /* 417 * we do compression for mount -o compress and when the 418 * inode has not been flagged as nocompress. This flag can 419 * change at any time if we discover bad compression ratios. 420 */ 421 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 422 (btrfs_test_opt(root, COMPRESS) || 423 (BTRFS_I(inode)->force_compress) || 424 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 425 WARN_ON(pages); 426 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 427 if (!pages) { 428 /* just bail out to the uncompressed code */ 429 goto cont; 430 } 431 432 if (BTRFS_I(inode)->force_compress) 433 compress_type = BTRFS_I(inode)->force_compress; 434 435 /* 436 * we need to call clear_page_dirty_for_io on each 437 * page in the range. Otherwise applications with the file 438 * mmap'd can wander in and change the page contents while 439 * we are compressing them. 440 * 441 * If the compression fails for any reason, we set the pages 442 * dirty again later on. 
443 */ 444 extent_range_clear_dirty_for_io(inode, start, end); 445 redirty = 1; 446 ret = btrfs_compress_pages(compress_type, 447 inode->i_mapping, start, 448 total_compressed, pages, 449 nr_pages, &nr_pages_ret, 450 &total_in, 451 &total_compressed, 452 max_compressed); 453 454 if (!ret) { 455 unsigned long offset = total_compressed & 456 (PAGE_CACHE_SIZE - 1); 457 struct page *page = pages[nr_pages_ret - 1]; 458 char *kaddr; 459 460 /* zero the tail end of the last page, we might be 461 * sending it down to disk 462 */ 463 if (offset) { 464 kaddr = kmap_atomic(page); 465 memset(kaddr + offset, 0, 466 PAGE_CACHE_SIZE - offset); 467 kunmap_atomic(kaddr); 468 } 469 will_compress = 1; 470 } 471 } 472 cont: 473 if (start == 0) { 474 /* lets try to make an inline extent */ 475 if (ret || total_in < (actual_end - start)) { 476 /* we didn't compress the entire range, try 477 * to make an uncompressed inline extent. 478 */ 479 ret = cow_file_range_inline(root, inode, start, end, 480 0, 0, NULL); 481 } else { 482 /* try making a compressed inline extent */ 483 ret = cow_file_range_inline(root, inode, start, end, 484 total_compressed, 485 compress_type, pages); 486 } 487 if (ret <= 0) { 488 unsigned long clear_flags = EXTENT_DELALLOC | 489 EXTENT_DEFRAG; 490 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 491 492 /* 493 * inline extent creation worked or returned error, 494 * we don't need to create any more async work items. 495 * Unlock and free up our temp pages. 
496 */ 497 extent_clear_unlock_delalloc(inode, start, end, NULL, 498 clear_flags, PAGE_UNLOCK | 499 PAGE_CLEAR_DIRTY | 500 PAGE_SET_WRITEBACK | 501 PAGE_END_WRITEBACK); 502 goto free_pages_out; 503 } 504 } 505 506 if (will_compress) { 507 /* 508 * we aren't doing an inline extent round the compressed size 509 * up to a block size boundary so the allocator does sane 510 * things 511 */ 512 total_compressed = ALIGN(total_compressed, blocksize); 513 514 /* 515 * one last check to make sure the compression is really a 516 * win, compare the page count read with the blocks on disk 517 */ 518 total_in = ALIGN(total_in, PAGE_CACHE_SIZE); 519 if (total_compressed >= total_in) { 520 will_compress = 0; 521 } else { 522 num_bytes = total_in; 523 } 524 } 525 if (!will_compress && pages) { 526 /* 527 * the compression code ran but failed to make things smaller, 528 * free any pages it allocated and our page pointer array 529 */ 530 for (i = 0; i < nr_pages_ret; i++) { 531 WARN_ON(pages[i]->mapping); 532 page_cache_release(pages[i]); 533 } 534 kfree(pages); 535 pages = NULL; 536 total_compressed = 0; 537 nr_pages_ret = 0; 538 539 /* flag the file so we don't compress in the future */ 540 if (!btrfs_test_opt(root, FORCE_COMPRESS) && 541 !(BTRFS_I(inode)->force_compress)) { 542 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 543 } 544 } 545 if (will_compress) { 546 *num_added += 1; 547 548 /* the async work queues will take care of doing actual 549 * allocation on disk for these compressed pages, 550 * and will submit them to the elevator. 551 */ 552 add_async_extent(async_cow, start, num_bytes, 553 total_compressed, pages, nr_pages_ret, 554 compress_type); 555 556 if (start + num_bytes < end) { 557 start += num_bytes; 558 pages = NULL; 559 cond_resched(); 560 goto again; 561 } 562 } else { 563 cleanup_and_bail_uncompressed: 564 /* 565 * No compression, but we still need to write the pages in 566 * the file we've been given so far. 
redirty the locked 567 * page if it corresponds to our extent and set things up 568 * for the async work queue to run cow_file_range to do 569 * the normal delalloc dance 570 */ 571 if (page_offset(locked_page) >= start && 572 page_offset(locked_page) <= end) { 573 __set_page_dirty_nobuffers(locked_page); 574 /* unlocked later on in the async handlers */ 575 } 576 if (redirty) 577 extent_range_redirty_for_io(inode, start, end); 578 add_async_extent(async_cow, start, end - start + 1, 579 0, NULL, 0, BTRFS_COMPRESS_NONE); 580 *num_added += 1; 581 } 582 583 out: 584 return ret; 585 586 free_pages_out: 587 for (i = 0; i < nr_pages_ret; i++) { 588 WARN_ON(pages[i]->mapping); 589 page_cache_release(pages[i]); 590 } 591 kfree(pages); 592 593 goto out; 594 } 595 596 /* 597 * phase two of compressed writeback. This is the ordered portion 598 * of the code, which only gets called in the order the work was 599 * queued. We walk all the async extents created by compress_file_range 600 * and send them down to the disk. 601 */ 602 static noinline int submit_compressed_extents(struct inode *inode, 603 struct async_cow *async_cow) 604 { 605 struct async_extent *async_extent; 606 u64 alloc_hint = 0; 607 struct btrfs_key ins; 608 struct extent_map *em; 609 struct btrfs_root *root = BTRFS_I(inode)->root; 610 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 611 struct extent_io_tree *io_tree; 612 int ret = 0; 613 614 if (list_empty(&async_cow->extents)) 615 return 0; 616 617 again: 618 while (!list_empty(&async_cow->extents)) { 619 async_extent = list_entry(async_cow->extents.next, 620 struct async_extent, list); 621 list_del(&async_extent->list); 622 623 io_tree = &BTRFS_I(inode)->io_tree; 624 625 retry: 626 /* did the compression code fall back to uncompressed IO? 
*/ 627 if (!async_extent->pages) { 628 int page_started = 0; 629 unsigned long nr_written = 0; 630 631 lock_extent(io_tree, async_extent->start, 632 async_extent->start + 633 async_extent->ram_size - 1); 634 635 /* allocate blocks */ 636 ret = cow_file_range(inode, async_cow->locked_page, 637 async_extent->start, 638 async_extent->start + 639 async_extent->ram_size - 1, 640 &page_started, &nr_written, 0); 641 642 /* JDM XXX */ 643 644 /* 645 * if page_started, cow_file_range inserted an 646 * inline extent and took care of all the unlocking 647 * and IO for us. Otherwise, we need to submit 648 * all those pages down to the drive. 649 */ 650 if (!page_started && !ret) 651 extent_write_locked_range(io_tree, 652 inode, async_extent->start, 653 async_extent->start + 654 async_extent->ram_size - 1, 655 btrfs_get_extent, 656 WB_SYNC_ALL); 657 else if (ret) 658 unlock_page(async_cow->locked_page); 659 kfree(async_extent); 660 cond_resched(); 661 continue; 662 } 663 664 lock_extent(io_tree, async_extent->start, 665 async_extent->start + async_extent->ram_size - 1); 666 667 ret = btrfs_reserve_extent(root, 668 async_extent->compressed_size, 669 async_extent->compressed_size, 670 0, alloc_hint, &ins, 1); 671 if (ret) { 672 int i; 673 674 for (i = 0; i < async_extent->nr_pages; i++) { 675 WARN_ON(async_extent->pages[i]->mapping); 676 page_cache_release(async_extent->pages[i]); 677 } 678 kfree(async_extent->pages); 679 async_extent->nr_pages = 0; 680 async_extent->pages = NULL; 681 682 if (ret == -ENOSPC) { 683 unlock_extent(io_tree, async_extent->start, 684 async_extent->start + 685 async_extent->ram_size - 1); 686 goto retry; 687 } 688 goto out_free; 689 } 690 691 /* 692 * here we're doing allocation and writeback of the 693 * compressed pages 694 */ 695 btrfs_drop_extent_cache(inode, async_extent->start, 696 async_extent->start + 697 async_extent->ram_size - 1, 0); 698 699 em = alloc_extent_map(); 700 if (!em) { 701 ret = -ENOMEM; 702 goto out_free_reserve; 703 } 704 
em->start = async_extent->start; 705 em->len = async_extent->ram_size; 706 em->orig_start = em->start; 707 em->mod_start = em->start; 708 em->mod_len = em->len; 709 710 em->block_start = ins.objectid; 711 em->block_len = ins.offset; 712 em->orig_block_len = ins.offset; 713 em->ram_bytes = async_extent->ram_size; 714 em->bdev = root->fs_info->fs_devices->latest_bdev; 715 em->compress_type = async_extent->compress_type; 716 set_bit(EXTENT_FLAG_PINNED, &em->flags); 717 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 718 em->generation = -1; 719 720 while (1) { 721 write_lock(&em_tree->lock); 722 ret = add_extent_mapping(em_tree, em, 1); 723 write_unlock(&em_tree->lock); 724 if (ret != -EEXIST) { 725 free_extent_map(em); 726 break; 727 } 728 btrfs_drop_extent_cache(inode, async_extent->start, 729 async_extent->start + 730 async_extent->ram_size - 1, 0); 731 } 732 733 if (ret) 734 goto out_free_reserve; 735 736 ret = btrfs_add_ordered_extent_compress(inode, 737 async_extent->start, 738 ins.objectid, 739 async_extent->ram_size, 740 ins.offset, 741 BTRFS_ORDERED_COMPRESSED, 742 async_extent->compress_type); 743 if (ret) 744 goto out_free_reserve; 745 746 /* 747 * clear dirty, set writeback and unlock the pages. 
748 */ 749 extent_clear_unlock_delalloc(inode, async_extent->start, 750 async_extent->start + 751 async_extent->ram_size - 1, 752 NULL, EXTENT_LOCKED | EXTENT_DELALLOC, 753 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 754 PAGE_SET_WRITEBACK); 755 ret = btrfs_submit_compressed_write(inode, 756 async_extent->start, 757 async_extent->ram_size, 758 ins.objectid, 759 ins.offset, async_extent->pages, 760 async_extent->nr_pages); 761 alloc_hint = ins.objectid + ins.offset; 762 kfree(async_extent); 763 if (ret) 764 goto out; 765 cond_resched(); 766 } 767 ret = 0; 768 out: 769 return ret; 770 out_free_reserve: 771 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 772 out_free: 773 extent_clear_unlock_delalloc(inode, async_extent->start, 774 async_extent->start + 775 async_extent->ram_size - 1, 776 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 777 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 778 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 779 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 780 kfree(async_extent); 781 goto again; 782 } 783 784 static u64 get_extent_allocation_hint(struct inode *inode, u64 start, 785 u64 num_bytes) 786 { 787 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 788 struct extent_map *em; 789 u64 alloc_hint = 0; 790 791 read_lock(&em_tree->lock); 792 em = search_extent_mapping(em_tree, start, num_bytes); 793 if (em) { 794 /* 795 * if block start isn't an actual block number then find the 796 * first block in this inode and use that as a hint. If that 797 * block is also bogus then just don't worry about it. 
798 */ 799 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 800 free_extent_map(em); 801 em = search_extent_mapping(em_tree, 0, 0); 802 if (em && em->block_start < EXTENT_MAP_LAST_BYTE) 803 alloc_hint = em->block_start; 804 if (em) 805 free_extent_map(em); 806 } else { 807 alloc_hint = em->block_start; 808 free_extent_map(em); 809 } 810 } 811 read_unlock(&em_tree->lock); 812 813 return alloc_hint; 814 } 815 816 /* 817 * when extent_io.c finds a delayed allocation range in the file, 818 * the call backs end up in this code. The basic idea is to 819 * allocate extents on disk for the range, and create ordered data structs 820 * in ram to track those extents. 821 * 822 * locked_page is the page that writepage had locked already. We use 823 * it to make sure we don't do extra locks or unlocks. 824 * 825 * *page_started is set to one if we unlock locked_page and do everything 826 * required to start IO on it. It may be clean and already done with 827 * IO when we return. 828 */ 829 static noinline int cow_file_range(struct inode *inode, 830 struct page *locked_page, 831 u64 start, u64 end, int *page_started, 832 unsigned long *nr_written, 833 int unlock) 834 { 835 struct btrfs_root *root = BTRFS_I(inode)->root; 836 u64 alloc_hint = 0; 837 u64 num_bytes; 838 unsigned long ram_size; 839 u64 disk_num_bytes; 840 u64 cur_alloc_size; 841 u64 blocksize = root->sectorsize; 842 struct btrfs_key ins; 843 struct extent_map *em; 844 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 845 int ret = 0; 846 847 BUG_ON(btrfs_is_free_space_inode(inode)); 848 849 num_bytes = ALIGN(end - start + 1, blocksize); 850 num_bytes = max(blocksize, num_bytes); 851 disk_num_bytes = num_bytes; 852 853 /* if this is a small write inside eof, kick off defrag */ 854 if (num_bytes < 64 * 1024 && 855 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 856 btrfs_add_inode_defrag(NULL, inode); 857 858 if (start == 0) { 859 /* lets try to make an inline extent */ 860 ret = 
cow_file_range_inline(root, inode, start, end, 0, 0, 861 NULL); 862 if (ret == 0) { 863 extent_clear_unlock_delalloc(inode, start, end, NULL, 864 EXTENT_LOCKED | EXTENT_DELALLOC | 865 EXTENT_DEFRAG, PAGE_UNLOCK | 866 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 867 PAGE_END_WRITEBACK); 868 869 *nr_written = *nr_written + 870 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 871 *page_started = 1; 872 goto out; 873 } else if (ret < 0) { 874 goto out_unlock; 875 } 876 } 877 878 BUG_ON(disk_num_bytes > 879 btrfs_super_total_bytes(root->fs_info->super_copy)); 880 881 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 882 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 883 884 while (disk_num_bytes > 0) { 885 unsigned long op; 886 887 cur_alloc_size = disk_num_bytes; 888 ret = btrfs_reserve_extent(root, cur_alloc_size, 889 root->sectorsize, 0, alloc_hint, 890 &ins, 1); 891 if (ret < 0) 892 goto out_unlock; 893 894 em = alloc_extent_map(); 895 if (!em) { 896 ret = -ENOMEM; 897 goto out_reserve; 898 } 899 em->start = start; 900 em->orig_start = em->start; 901 ram_size = ins.offset; 902 em->len = ins.offset; 903 em->mod_start = em->start; 904 em->mod_len = em->len; 905 906 em->block_start = ins.objectid; 907 em->block_len = ins.offset; 908 em->orig_block_len = ins.offset; 909 em->ram_bytes = ram_size; 910 em->bdev = root->fs_info->fs_devices->latest_bdev; 911 set_bit(EXTENT_FLAG_PINNED, &em->flags); 912 em->generation = -1; 913 914 while (1) { 915 write_lock(&em_tree->lock); 916 ret = add_extent_mapping(em_tree, em, 1); 917 write_unlock(&em_tree->lock); 918 if (ret != -EEXIST) { 919 free_extent_map(em); 920 break; 921 } 922 btrfs_drop_extent_cache(inode, start, 923 start + ram_size - 1, 0); 924 } 925 if (ret) 926 goto out_reserve; 927 928 cur_alloc_size = ins.offset; 929 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 930 ram_size, cur_alloc_size, 0); 931 if (ret) 932 goto out_reserve; 933 934 if (root->root_key.objectid == 935 
BTRFS_DATA_RELOC_TREE_OBJECTID) { 936 ret = btrfs_reloc_clone_csums(inode, start, 937 cur_alloc_size); 938 if (ret) 939 goto out_reserve; 940 } 941 942 if (disk_num_bytes < cur_alloc_size) 943 break; 944 945 /* we're not doing compressed IO, don't unlock the first 946 * page (which the caller expects to stay locked), don't 947 * clear any dirty bits and don't set any writeback bits 948 * 949 * Do set the Private2 bit so we know this page was properly 950 * setup for writepage 951 */ 952 op = unlock ? PAGE_UNLOCK : 0; 953 op |= PAGE_SET_PRIVATE2; 954 955 extent_clear_unlock_delalloc(inode, start, 956 start + ram_size - 1, locked_page, 957 EXTENT_LOCKED | EXTENT_DELALLOC, 958 op); 959 disk_num_bytes -= cur_alloc_size; 960 num_bytes -= cur_alloc_size; 961 alloc_hint = ins.objectid + ins.offset; 962 start += cur_alloc_size; 963 } 964 out: 965 return ret; 966 967 out_reserve: 968 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 969 out_unlock: 970 extent_clear_unlock_delalloc(inode, start, end, locked_page, 971 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 972 EXTENT_DELALLOC | EXTENT_DEFRAG, 973 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 974 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 975 goto out; 976 } 977 978 /* 979 * work queue call back to started compression on a file and pages 980 */ 981 static noinline void async_cow_start(struct btrfs_work *work) 982 { 983 struct async_cow *async_cow; 984 int num_added = 0; 985 async_cow = container_of(work, struct async_cow, work); 986 987 compress_file_range(async_cow->inode, async_cow->locked_page, 988 async_cow->start, async_cow->end, async_cow, 989 &num_added); 990 if (num_added == 0) { 991 btrfs_add_delayed_iput(async_cow->inode); 992 async_cow->inode = NULL; 993 } 994 } 995 996 /* 997 * work queue call back to submit previously compressed pages 998 */ 999 static noinline void async_cow_submit(struct btrfs_work *work) 1000 { 1001 struct async_cow *async_cow; 1002 struct btrfs_root *root; 1003 unsigned long nr_pages; 1004 1005 
async_cow = container_of(work, struct async_cow, work); 1006 1007 root = async_cow->root; 1008 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1009 PAGE_CACHE_SHIFT; 1010 1011 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 1012 5 * 1024 * 1024 && 1013 waitqueue_active(&root->fs_info->async_submit_wait)) 1014 wake_up(&root->fs_info->async_submit_wait); 1015 1016 if (async_cow->inode) 1017 submit_compressed_extents(async_cow->inode, async_cow); 1018 } 1019 1020 static noinline void async_cow_free(struct btrfs_work *work) 1021 { 1022 struct async_cow *async_cow; 1023 async_cow = container_of(work, struct async_cow, work); 1024 if (async_cow->inode) 1025 btrfs_add_delayed_iput(async_cow->inode); 1026 kfree(async_cow); 1027 } 1028 1029 static int cow_file_range_async(struct inode *inode, struct page *locked_page, 1030 u64 start, u64 end, int *page_started, 1031 unsigned long *nr_written) 1032 { 1033 struct async_cow *async_cow; 1034 struct btrfs_root *root = BTRFS_I(inode)->root; 1035 unsigned long nr_pages; 1036 u64 cur_end; 1037 int limit = 10 * 1024 * 1024; 1038 1039 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1040 1, 0, NULL, GFP_NOFS); 1041 while (start < end) { 1042 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 1043 BUG_ON(!async_cow); /* -ENOMEM */ 1044 async_cow->inode = igrab(inode); 1045 async_cow->root = root; 1046 async_cow->locked_page = locked_page; 1047 async_cow->start = start; 1048 1049 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1050 cur_end = end; 1051 else 1052 cur_end = min(end, start + 512 * 1024 - 1); 1053 1054 async_cow->end = cur_end; 1055 INIT_LIST_HEAD(&async_cow->extents); 1056 1057 async_cow->work.func = async_cow_start; 1058 async_cow->work.ordered_func = async_cow_submit; 1059 async_cow->work.ordered_free = async_cow_free; 1060 async_cow->work.flags = 0; 1061 1062 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1063 PAGE_CACHE_SHIFT; 1064 
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1065 1066 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1067 &async_cow->work); 1068 1069 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1070 wait_event(root->fs_info->async_submit_wait, 1071 (atomic_read(&root->fs_info->async_delalloc_pages) < 1072 limit)); 1073 } 1074 1075 while (atomic_read(&root->fs_info->async_submit_draining) && 1076 atomic_read(&root->fs_info->async_delalloc_pages)) { 1077 wait_event(root->fs_info->async_submit_wait, 1078 (atomic_read(&root->fs_info->async_delalloc_pages) == 1079 0)); 1080 } 1081 1082 *nr_written += nr_pages; 1083 start = cur_end + 1; 1084 } 1085 *page_started = 1; 1086 return 0; 1087 } 1088 1089 static noinline int csum_exist_in_range(struct btrfs_root *root, 1090 u64 bytenr, u64 num_bytes) 1091 { 1092 int ret; 1093 struct btrfs_ordered_sum *sums; 1094 LIST_HEAD(list); 1095 1096 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1097 bytenr + num_bytes - 1, &list, 0); 1098 if (ret == 0 && list_empty(&list)) 1099 return 0; 1100 1101 while (!list_empty(&list)) { 1102 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 1103 list_del(&sums->list); 1104 kfree(sums); 1105 } 1106 return 1; 1107 } 1108 1109 /* 1110 * when nowcow writeback call back. This checks for snapshots or COW copies 1111 * of the extents that exist in the file, and COWs the file as required. 
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 *
 * @force is 1 when called for NODATACOW inodes and 0 for PREALLOC inodes
 * (see run_delalloc_range()).  With force == 0 a regular
 * (BTRFS_FILE_EXTENT_REG) extent is never written in place; only
 * preallocated extents are.
 *
 * The range [start, end] is walked file extent item by file extent item.
 * Stretches that cannot be written in place are accumulated, starting at
 * cow_start, and handed to cow_file_range(); stretches that can get an
 * ordered extent of type NOCOW or PREALLOC pointing at the existing blocks.
 *
 * Returns 0 on success or a negative errno.  On the error path the part of
 * the range from cur_offset to end is unlocked and its delalloc state
 * cleared.
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 disk_num_bytes;
	u64 ram_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path) {
		/* nothing was started: release the whole range */
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	/* the free space inode joins the transaction via the _nolock variant */
	nolock = btrfs_is_free_space_inode(inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	/* (u64)-1 means "no pending range that needs COW" */
	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
		/*
		 * No exact match: on the first pass only, step back one slot
		 * if the previous item is a file extent of this inode.
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			/* no more leaves */
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* walked past this inode's extents or past the range */
		if (found_key.objectid > ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/* gap before the next item: treated as a range to COW */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			/* extent lies entirely before the range */
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			/*
			 * Every "goto out_check" below leaves nocow == 0, so
			 * the extent ends up in the range that gets COWed.
			 */
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			/* translate cur_offset into a disk byte number */
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csum exists in the range.
			 * this ensures that csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			/* open (or extend) the pending COW range */
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		/* flush the pending COW range that precedes this extent */
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     page_started, nr_written, 1);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = found_key.offset - extent_offset;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->orig_block_len = disk_num_bytes;
			em->ram_bytes = ram_bytes;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			em->mod_start = em->start;
			em->mod_len = em->len;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			em->generation = -1;
			/*
			 * Retry the insert until it no longer collides with an
			 * existing mapping, dropping cached mappings for the
			 * range after each -EEXIST.
			 */
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em, 1);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC, PAGE_UNLOCK |
					     PAGE_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	/* the loop ended early with no open COW range: COW the remainder */
	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
	}

error:
	/* reached on success as well; a transaction end error is only
	 * reported when nothing failed earlier */
	err = btrfs_end_transaction(trans, root);
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}

/*
 * extent_io.c call back
to do delayed allocation processing 1392 */ 1393 static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1394 u64 start, u64 end, int *page_started, 1395 unsigned long *nr_written) 1396 { 1397 int ret; 1398 struct btrfs_root *root = BTRFS_I(inode)->root; 1399 1400 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { 1401 ret = run_delalloc_nocow(inode, locked_page, start, end, 1402 page_started, 1, nr_written); 1403 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { 1404 ret = run_delalloc_nocow(inode, locked_page, start, end, 1405 page_started, 0, nr_written); 1406 } else if (!btrfs_test_opt(root, COMPRESS) && 1407 !(BTRFS_I(inode)->force_compress) && 1408 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { 1409 ret = cow_file_range(inode, locked_page, start, end, 1410 page_started, nr_written, 1); 1411 } else { 1412 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1413 &BTRFS_I(inode)->runtime_flags); 1414 ret = cow_file_range_async(inode, locked_page, start, end, 1415 page_started, nr_written); 1416 } 1417 return ret; 1418 } 1419 1420 static void btrfs_split_extent_hook(struct inode *inode, 1421 struct extent_state *orig, u64 split) 1422 { 1423 /* not delalloc, ignore it */ 1424 if (!(orig->state & EXTENT_DELALLOC)) 1425 return; 1426 1427 spin_lock(&BTRFS_I(inode)->lock); 1428 BTRFS_I(inode)->outstanding_extents++; 1429 spin_unlock(&BTRFS_I(inode)->lock); 1430 } 1431 1432 /* 1433 * extent_io.c merge_extent_hook, used to track merged delayed allocation 1434 * extents so we can keep track of new extents that are just merged onto old 1435 * extents, such as when we are doing sequential writes, so we can properly 1436 * account for the metadata space we'll need. 
1437 */ 1438 static void btrfs_merge_extent_hook(struct inode *inode, 1439 struct extent_state *new, 1440 struct extent_state *other) 1441 { 1442 /* not delalloc, ignore it */ 1443 if (!(other->state & EXTENT_DELALLOC)) 1444 return; 1445 1446 spin_lock(&BTRFS_I(inode)->lock); 1447 BTRFS_I(inode)->outstanding_extents--; 1448 spin_unlock(&BTRFS_I(inode)->lock); 1449 } 1450 1451 static void btrfs_add_delalloc_inodes(struct btrfs_root *root, 1452 struct inode *inode) 1453 { 1454 spin_lock(&root->delalloc_lock); 1455 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1456 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1457 &root->delalloc_inodes); 1458 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1459 &BTRFS_I(inode)->runtime_flags); 1460 root->nr_delalloc_inodes++; 1461 if (root->nr_delalloc_inodes == 1) { 1462 spin_lock(&root->fs_info->delalloc_root_lock); 1463 BUG_ON(!list_empty(&root->delalloc_root)); 1464 list_add_tail(&root->delalloc_root, 1465 &root->fs_info->delalloc_roots); 1466 spin_unlock(&root->fs_info->delalloc_root_lock); 1467 } 1468 } 1469 spin_unlock(&root->delalloc_lock); 1470 } 1471 1472 static void btrfs_del_delalloc_inode(struct btrfs_root *root, 1473 struct inode *inode) 1474 { 1475 spin_lock(&root->delalloc_lock); 1476 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1477 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1478 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1479 &BTRFS_I(inode)->runtime_flags); 1480 root->nr_delalloc_inodes--; 1481 if (!root->nr_delalloc_inodes) { 1482 spin_lock(&root->fs_info->delalloc_root_lock); 1483 BUG_ON(list_empty(&root->delalloc_root)); 1484 list_del_init(&root->delalloc_root); 1485 spin_unlock(&root->fs_info->delalloc_root_lock); 1486 } 1487 } 1488 spin_unlock(&root->delalloc_lock); 1489 } 1490 1491 /* 1492 * extent_io.c set_bit_hook, used to track delayed allocation 1493 * bytes in this file, and to maintain the list of inodes that 1494 * have pending delalloc work to be done. 
1495 */ 1496 static void btrfs_set_bit_hook(struct inode *inode, 1497 struct extent_state *state, unsigned long *bits) 1498 { 1499 1500 /* 1501 * set_bit and clear bit hooks normally require _irqsave/restore 1502 * but in this case, we are only testing for the DELALLOC 1503 * bit, which is only set or cleared with irqs on 1504 */ 1505 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1506 struct btrfs_root *root = BTRFS_I(inode)->root; 1507 u64 len = state->end + 1 - state->start; 1508 bool do_list = !btrfs_is_free_space_inode(inode); 1509 1510 if (*bits & EXTENT_FIRST_DELALLOC) { 1511 *bits &= ~EXTENT_FIRST_DELALLOC; 1512 } else { 1513 spin_lock(&BTRFS_I(inode)->lock); 1514 BTRFS_I(inode)->outstanding_extents++; 1515 spin_unlock(&BTRFS_I(inode)->lock); 1516 } 1517 1518 __percpu_counter_add(&root->fs_info->delalloc_bytes, len, 1519 root->fs_info->delalloc_batch); 1520 spin_lock(&BTRFS_I(inode)->lock); 1521 BTRFS_I(inode)->delalloc_bytes += len; 1522 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1523 &BTRFS_I(inode)->runtime_flags)) 1524 btrfs_add_delalloc_inodes(root, inode); 1525 spin_unlock(&BTRFS_I(inode)->lock); 1526 } 1527 } 1528 1529 /* 1530 * extent_io.c clear_bit_hook, see set_bit_hook for why 1531 */ 1532 static void btrfs_clear_bit_hook(struct inode *inode, 1533 struct extent_state *state, 1534 unsigned long *bits) 1535 { 1536 /* 1537 * set_bit and clear bit hooks normally require _irqsave/restore 1538 * but in this case, we are only testing for the DELALLOC 1539 * bit, which is only set or cleared with irqs on 1540 */ 1541 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1542 struct btrfs_root *root = BTRFS_I(inode)->root; 1543 u64 len = state->end + 1 - state->start; 1544 bool do_list = !btrfs_is_free_space_inode(inode); 1545 1546 if (*bits & EXTENT_FIRST_DELALLOC) { 1547 *bits &= ~EXTENT_FIRST_DELALLOC; 1548 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1549 spin_lock(&BTRFS_I(inode)->lock); 1550 
BTRFS_I(inode)->outstanding_extents--; 1551 spin_unlock(&BTRFS_I(inode)->lock); 1552 } 1553 1554 if (*bits & EXTENT_DO_ACCOUNTING) 1555 btrfs_delalloc_release_metadata(inode, len); 1556 1557 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1558 && do_list && !(state->state & EXTENT_NORESERVE)) 1559 btrfs_free_reserved_data_space(inode, len); 1560 1561 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1562 root->fs_info->delalloc_batch); 1563 spin_lock(&BTRFS_I(inode)->lock); 1564 BTRFS_I(inode)->delalloc_bytes -= len; 1565 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1566 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1567 &BTRFS_I(inode)->runtime_flags)) 1568 btrfs_del_delalloc_inode(root, inode); 1569 spin_unlock(&BTRFS_I(inode)->lock); 1570 } 1571 } 1572 1573 /* 1574 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1575 * we don't create bios that span stripes or chunks 1576 */ 1577 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, 1578 size_t size, struct bio *bio, 1579 unsigned long bio_flags) 1580 { 1581 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1582 u64 logical = (u64)bio->bi_sector << 9; 1583 u64 length = 0; 1584 u64 map_length; 1585 int ret; 1586 1587 if (bio_flags & EXTENT_BIO_COMPRESSED) 1588 return 0; 1589 1590 length = bio->bi_size; 1591 map_length = length; 1592 ret = btrfs_map_block(root->fs_info, rw, logical, 1593 &map_length, NULL, 0); 1594 /* Will always return 0 with map_multi == NULL */ 1595 BUG_ON(ret < 0); 1596 if (map_length < length + size) 1597 return 1; 1598 return 0; 1599 } 1600 1601 /* 1602 * in order to insert checksums into the metadata in large chunks, 1603 * we wait until bio submission time. All the pages in the bio are 1604 * checksummed and sums are attached onto the ordered extent record. 
1605 * 1606 * At IO completion time the cums attached on the ordered extent record 1607 * are inserted into the btree 1608 */ 1609 static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1610 struct bio *bio, int mirror_num, 1611 unsigned long bio_flags, 1612 u64 bio_offset) 1613 { 1614 struct btrfs_root *root = BTRFS_I(inode)->root; 1615 int ret = 0; 1616 1617 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); 1618 BUG_ON(ret); /* -ENOMEM */ 1619 return 0; 1620 } 1621 1622 /* 1623 * in order to insert checksums into the metadata in large chunks, 1624 * we wait until bio submission time. All the pages in the bio are 1625 * checksummed and sums are attached onto the ordered extent record. 1626 * 1627 * At IO completion time the cums attached on the ordered extent record 1628 * are inserted into the btree 1629 */ 1630 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1631 int mirror_num, unsigned long bio_flags, 1632 u64 bio_offset) 1633 { 1634 struct btrfs_root *root = BTRFS_I(inode)->root; 1635 int ret; 1636 1637 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); 1638 if (ret) 1639 bio_endio(bio, ret); 1640 return ret; 1641 } 1642 1643 /* 1644 * extent_io.c submission hook. 
This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 *
 * Reads: hook up the end_io work queue, then either hand the bio to the
 * compressed read path or (unless NODATASUM) look up the expected csums,
 * and map the bio.
 *
 * Writes: with no sync writers on the inode and checksumming enabled, the
 * csum work is pushed to a work queue through btrfs_wq_submit_bio(),
 * except for the data reloc tree whose bios are mapped directly.
 * Otherwise the csum (if enabled) is computed inline before mapping.
 *
 * Returns 0 or a negative errno; on a negative result the bio is ended
 * with that error here.
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;
	int metadata = 0;
	/* async csumming is only used while nobody is doing sync writes */
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	/* free space inode bios are passed to the end_io hook with 2 */
	if (btrfs_is_free_space_inode(inode))
		metadata = 2;

	if (!(rw & REQ_WRITE)) {
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
		if (ret)
			goto out;

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			/* the compressed read path does its own submission */
			ret = btrfs_submit_compressed_read(inode, bio,
							   mirror_num,
							   bio_flags);
			goto out;
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				goto out;
		}
		goto mapit;
	} else if (async && !skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
		goto out;
	} else if (!skip_sum) {
		/* synchronous write: checksum inline before mapping */
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
		if (ret)
			goto out;
	}

mapit:
	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);

out:
	if (ret < 0)
		bio_endio(bio, ret);
	return ret;
}

/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
1707 */ 1708 static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 1709 struct inode *inode, u64 file_offset, 1710 struct list_head *list) 1711 { 1712 struct btrfs_ordered_sum *sum; 1713 1714 list_for_each_entry(sum, list, list) { 1715 trans->adding_csums = 1; 1716 btrfs_csum_file_blocks(trans, 1717 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1718 trans->adding_csums = 0; 1719 } 1720 return 0; 1721 } 1722 1723 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1724 struct extent_state **cached_state) 1725 { 1726 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); 1727 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1728 cached_state, GFP_NOFS); 1729 } 1730 1731 /* see btrfs_writepage_start_hook for details on why this is required */ 1732 struct btrfs_writepage_fixup { 1733 struct page *page; 1734 struct btrfs_work work; 1735 }; 1736 1737 static void btrfs_writepage_fixup_worker(struct btrfs_work *work) 1738 { 1739 struct btrfs_writepage_fixup *fixup; 1740 struct btrfs_ordered_extent *ordered; 1741 struct extent_state *cached_state = NULL; 1742 struct page *page; 1743 struct inode *inode; 1744 u64 page_start; 1745 u64 page_end; 1746 int ret; 1747 1748 fixup = container_of(work, struct btrfs_writepage_fixup, work); 1749 page = fixup->page; 1750 again: 1751 lock_page(page); 1752 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { 1753 ClearPageChecked(page); 1754 goto out_page; 1755 } 1756 1757 inode = page->mapping->host; 1758 page_start = page_offset(page); 1759 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1760 1761 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, 1762 &cached_state); 1763 1764 /* already ordered? 
We're done */ 1765 if (PagePrivate2(page)) 1766 goto out; 1767 1768 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1769 if (ordered) { 1770 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, 1771 page_end, &cached_state, GFP_NOFS); 1772 unlock_page(page); 1773 btrfs_start_ordered_extent(inode, ordered, 1); 1774 btrfs_put_ordered_extent(ordered); 1775 goto again; 1776 } 1777 1778 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1779 if (ret) { 1780 mapping_set_error(page->mapping, ret); 1781 end_extent_writepage(page, ret, page_start, page_end); 1782 ClearPageChecked(page); 1783 goto out; 1784 } 1785 1786 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1787 ClearPageChecked(page); 1788 set_page_dirty(page); 1789 out: 1790 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, 1791 &cached_state, GFP_NOFS); 1792 out_page: 1793 unlock_page(page); 1794 page_cache_release(page); 1795 kfree(fixup); 1796 } 1797 1798 /* 1799 * There are a few paths in the higher layers of the kernel that directly 1800 * set the page dirty bit without asking the filesystem if it is a 1801 * good idea. This causes problems because we want to make sure COW 1802 * properly happens and the data=ordered rules are followed. 1803 * 1804 * In our case any range that doesn't have the ORDERED bit set 1805 * hasn't been properly setup for IO. We kick off an async process 1806 * to fix it up. The async helper will wait for ordered extents, set 1807 * the delalloc bit and make it safe to write the page. 
1808 */ 1809 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) 1810 { 1811 struct inode *inode = page->mapping->host; 1812 struct btrfs_writepage_fixup *fixup; 1813 struct btrfs_root *root = BTRFS_I(inode)->root; 1814 1815 /* this page is properly in the ordered list */ 1816 if (TestClearPagePrivate2(page)) 1817 return 0; 1818 1819 if (PageChecked(page)) 1820 return -EAGAIN; 1821 1822 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 1823 if (!fixup) 1824 return -EAGAIN; 1825 1826 SetPageChecked(page); 1827 page_cache_get(page); 1828 fixup->work.func = btrfs_writepage_fixup_worker; 1829 fixup->page = page; 1830 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1831 return -EBUSY; 1832 } 1833 1834 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, 1835 struct inode *inode, u64 file_pos, 1836 u64 disk_bytenr, u64 disk_num_bytes, 1837 u64 num_bytes, u64 ram_bytes, 1838 u8 compression, u8 encryption, 1839 u16 other_encoding, int extent_type) 1840 { 1841 struct btrfs_root *root = BTRFS_I(inode)->root; 1842 struct btrfs_file_extent_item *fi; 1843 struct btrfs_path *path; 1844 struct extent_buffer *leaf; 1845 struct btrfs_key ins; 1846 int ret; 1847 1848 path = btrfs_alloc_path(); 1849 if (!path) 1850 return -ENOMEM; 1851 1852 path->leave_spinning = 1; 1853 1854 /* 1855 * we may be replacing one extent in the tree with another. 1856 * The new extent is pinned in the extent map, and we don't want 1857 * to drop it from the cache until it is completely in the btree. 1858 * 1859 * So, tell btrfs_drop_extents to leave this extent in the cache. 1860 * the caller is expected to unpin it and allow it to be merged 1861 * with the others. 
1862 */ 1863 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1864 file_pos + num_bytes, 0); 1865 if (ret) 1866 goto out; 1867 1868 ins.objectid = btrfs_ino(inode); 1869 ins.offset = file_pos; 1870 ins.type = BTRFS_EXTENT_DATA_KEY; 1871 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); 1872 if (ret) 1873 goto out; 1874 leaf = path->nodes[0]; 1875 fi = btrfs_item_ptr(leaf, path->slots[0], 1876 struct btrfs_file_extent_item); 1877 btrfs_set_file_extent_generation(leaf, fi, trans->transid); 1878 btrfs_set_file_extent_type(leaf, fi, extent_type); 1879 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); 1880 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); 1881 btrfs_set_file_extent_offset(leaf, fi, 0); 1882 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 1883 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); 1884 btrfs_set_file_extent_compression(leaf, fi, compression); 1885 btrfs_set_file_extent_encryption(leaf, fi, encryption); 1886 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); 1887 1888 btrfs_mark_buffer_dirty(leaf); 1889 btrfs_release_path(path); 1890 1891 inode_add_bytes(inode, num_bytes); 1892 1893 ins.objectid = disk_bytenr; 1894 ins.offset = disk_num_bytes; 1895 ins.type = BTRFS_EXTENT_ITEM_KEY; 1896 ret = btrfs_alloc_reserved_file_extent(trans, root, 1897 root->root_key.objectid, 1898 btrfs_ino(inode), file_pos, &ins); 1899 out: 1900 btrfs_free_path(path); 1901 1902 return ret; 1903 } 1904 1905 /* snapshot-aware defrag */ 1906 struct sa_defrag_extent_backref { 1907 struct rb_node node; 1908 struct old_sa_defrag_extent *old; 1909 u64 root_id; 1910 u64 inum; 1911 u64 file_pos; 1912 u64 extent_offset; 1913 u64 num_bytes; 1914 u64 generation; 1915 }; 1916 1917 struct old_sa_defrag_extent { 1918 struct list_head list; 1919 struct new_sa_defrag_extent *new; 1920 1921 u64 extent_offset; 1922 u64 bytenr; 1923 u64 offset; 1924 u64 len; 1925 int count; 1926 }; 1927 1928 struct new_sa_defrag_extent 
{ 1929 struct rb_root root; 1930 struct list_head head; 1931 struct btrfs_path *path; 1932 struct inode *inode; 1933 u64 file_pos; 1934 u64 len; 1935 u64 bytenr; 1936 u64 disk_len; 1937 u8 compress_type; 1938 }; 1939 1940 static int backref_comp(struct sa_defrag_extent_backref *b1, 1941 struct sa_defrag_extent_backref *b2) 1942 { 1943 if (b1->root_id < b2->root_id) 1944 return -1; 1945 else if (b1->root_id > b2->root_id) 1946 return 1; 1947 1948 if (b1->inum < b2->inum) 1949 return -1; 1950 else if (b1->inum > b2->inum) 1951 return 1; 1952 1953 if (b1->file_pos < b2->file_pos) 1954 return -1; 1955 else if (b1->file_pos > b2->file_pos) 1956 return 1; 1957 1958 /* 1959 * [------------------------------] ===> (a range of space) 1960 * |<--->| |<---->| =============> (fs/file tree A) 1961 * |<---------------------------->| ===> (fs/file tree B) 1962 * 1963 * A range of space can refer to two file extents in one tree while 1964 * refer to only one file extent in another tree. 1965 * 1966 * So we may process a disk offset more than one time(two extents in A) 1967 * and locate at the same extent(one extent in B), then insert two same 1968 * backrefs(both refer to the extent in B). 1969 */ 1970 return 0; 1971 } 1972 1973 static void backref_insert(struct rb_root *root, 1974 struct sa_defrag_extent_backref *backref) 1975 { 1976 struct rb_node **p = &root->rb_node; 1977 struct rb_node *parent = NULL; 1978 struct sa_defrag_extent_backref *entry; 1979 int ret; 1980 1981 while (*p) { 1982 parent = *p; 1983 entry = rb_entry(parent, struct sa_defrag_extent_backref, node); 1984 1985 ret = backref_comp(backref, entry); 1986 if (ret < 0) 1987 p = &(*p)->rb_left; 1988 else 1989 p = &(*p)->rb_right; 1990 } 1991 1992 rb_link_node(&backref->node, parent, p); 1993 rb_insert_color(&backref->node, root); 1994 } 1995 1996 /* 1997 * Note the backref might has changed, and in this case we just return 0. 
 *
 * Callback for iterate_inodes_from_logical(); 'ctx' is the
 * old_sa_defrag_extent being processed.  Looks up the file extent item of
 * inode 'inum' in root 'root_id' whose key offset is exactly 'offset',
 * which points at old->bytenr and overlaps the old range.  If found, a
 * backref describing it is inserted into new->root and old->count is
 * incremented.
 *
 * References from the defragged inode itself are skipped, as are roots
 * that no longer exist.  Returns 0 or a negative errno; the path is
 * released on every exit taken after the tree search.
 */
static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
				       void *ctx)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_fs_info *fs_info;
	struct old_sa_defrag_extent *old = ctx;
	struct new_sa_defrag_extent *new = old->new;
	struct btrfs_path *path = new->path;
	struct btrfs_key key;
	struct btrfs_root *root;
	struct sa_defrag_extent_backref *backref;
	struct extent_buffer *leaf;
	struct inode *inode = new->inode;
	int slot;
	int ret;
	u64 extent_offset;
	u64 num_bytes;

	/* the inode being defragged does not need relinking to itself */
	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
	    inum == btrfs_ino(inode))
		return 0;

	key.objectid = root_id;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = BTRFS_I(inode)->root->fs_info;
	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		/* a missing root is not an error */
		if (PTR_ERR(root) == -ENOENT)
			return 0;
		WARN_ON(1);
		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
			 inum, offset, root_id);
		return PTR_ERR(root);
	}

	key.objectid = inum;
	key.type = BTRFS_EXTENT_DATA_KEY;
	/* offsets in the top 4G of the u64 range start the search at 0 */
	if (offset > (u64)-1 << 32)
		key.offset = 0;
	else
		key.offset = offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		WARN_ON(1);
		return ret;
	}
	ret = 0;

	while (1) {
		cond_resched();

		leaf = path->nodes[0];
		slot = path->slots[0];

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				goto out;
			} else if (ret > 0) {
				/* ran out of leaves without a match */
				ret = 0;
				goto out;
			}
			continue;
		}

		/* advance now; 'slot' still names the item examined below */
		path->slots[0]++;

		btrfs_item_key_to_cpu(leaf, &key, slot);

		if (key.objectid > inum)
			goto out;

		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);

		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
			continue;

		/*
		 * 'offset' refers to the exact key.offset,
		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
		 * (key.offset - extent_offset).
		 */
		if (key.offset != offset)
			continue;

		extent_offset = btrfs_file_extent_offset(leaf, extent);
		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);

		/* skip items that do not overlap the old range at all */
		if (extent_offset >= old->extent_offset + old->offset +
		    old->len || extent_offset + num_bytes <=
		    old->extent_offset + old->offset)
			continue;
		break;
	}

	backref = kmalloc(sizeof(*backref), GFP_NOFS);
	if (!backref) {
		/*
		 * NOTE(review): an allocation failure is reported as -ENOENT,
		 * which record_extent_backrefs() tolerates in its BUG_ON();
		 * confirm whether -ENOMEM was intended.
		 */
		ret = -ENOENT;
		goto out;
	}

	backref->root_id = root_id;
	backref->inum = inum;
	backref->file_pos = offset;
	backref->num_bytes = num_bytes;
	backref->extent_offset = extent_offset;
	backref->generation = btrfs_file_extent_generation(leaf, extent);
	backref->old = old;
	backref_insert(&new->root, backref);
	old->count++;
out:
	btrfs_release_path(path);
	WARN_ON(ret);
	return ret;
}

/*
 * Collect the backrefs of every old extent on new->head by running
 * record_one_backref() over all inodes that reference it.  Old extents
 * that end up without any recorded backref are removed from the list and
 * freed.
 *
 * Returns true if at least one old extent is left on the list.
 */
static noinline bool record_extent_backrefs(struct btrfs_path *path,
				   struct new_sa_defrag_extent *new)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
	struct old_sa_defrag_extent *old, *tmp;
	int ret;

	new->path = path;

	list_for_each_entry_safe(old, tmp, &new->head, list) {
		ret = iterate_inodes_from_logical(old->bytenr +
						  old->extent_offset, fs_info,
						  path, record_one_backref,
						  old);
		/* any failure other than -ENOENT is treated as fatal */
		BUG_ON(ret < 0 && ret != -ENOENT);

		/* no backref to be processed for this extent */
		if (!old->count) {
			list_del(&old->list);
			kfree(old);
		}
	}

	if (list_empty(&new->head))
		return false;

	return true;
}

/*
 * Returns 1 when the file extent item 'fi' is a regular extent on
 * new->bytenr with the same compression type as 'new' and with no
 * encryption or other encoding; returns 0 otherwise.
 */
static int relink_is_mergable(struct extent_buffer *leaf,
			      struct btrfs_file_extent_item *fi,
			      struct new_sa_defrag_extent *new)
{
	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
		return 0;

	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
		return 0;

	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
		return 0;

	if (btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	return 1;
}

/*
 * Note the backref might has changed, and in this case we just return 0.
 */
static noinline int relink_extent_backref(struct btrfs_path *path,
				 struct sa_defrag_extent_backref *prev,
				 struct sa_defrag_extent_backref *backref)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_file_extent_item *item;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_trans_handle *trans;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct old_sa_defrag_extent *old = backref->old;
	struct new_sa_defrag_extent *new = old->new;
	struct inode *src_inode = new->inode;
	struct inode *inode;
	struct extent_state *cached = NULL;
	int ret = 0;
	u64 start;
	u64 len;
	u64 lock_start;
	u64 lock_end;
	bool merge = false;
	int index;

	if (prev && prev->root_id == backref->root_id &&
	    prev->inum == backref->inum &&
	    prev->file_pos + prev->num_bytes == backref->file_pos)
		merge = true;

	/* step 1: get root */
	key.objectid = backref->root_id;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = BTRFS_I(src_inode)->root->fs_info;
	index = srcu_read_lock(&fs_info->subvol_srcu);

	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, index);
		if (PTR_ERR(root) == -ENOENT)
			return 0;
2217 return PTR_ERR(root); 2218 } 2219 2220 /* step 2: get inode */ 2221 key.objectid = backref->inum; 2222 key.type = BTRFS_INODE_ITEM_KEY; 2223 key.offset = 0; 2224 2225 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 2226 if (IS_ERR(inode)) { 2227 srcu_read_unlock(&fs_info->subvol_srcu, index); 2228 return 0; 2229 } 2230 2231 srcu_read_unlock(&fs_info->subvol_srcu, index); 2232 2233 /* step 3: relink backref */ 2234 lock_start = backref->file_pos; 2235 lock_end = backref->file_pos + backref->num_bytes - 1; 2236 lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2237 0, &cached); 2238 2239 ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); 2240 if (ordered) { 2241 btrfs_put_ordered_extent(ordered); 2242 goto out_unlock; 2243 } 2244 2245 trans = btrfs_join_transaction(root); 2246 if (IS_ERR(trans)) { 2247 ret = PTR_ERR(trans); 2248 goto out_unlock; 2249 } 2250 2251 key.objectid = backref->inum; 2252 key.type = BTRFS_EXTENT_DATA_KEY; 2253 key.offset = backref->file_pos; 2254 2255 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2256 if (ret < 0) { 2257 goto out_free_path; 2258 } else if (ret > 0) { 2259 ret = 0; 2260 goto out_free_path; 2261 } 2262 2263 extent = btrfs_item_ptr(path->nodes[0], path->slots[0], 2264 struct btrfs_file_extent_item); 2265 2266 if (btrfs_file_extent_generation(path->nodes[0], extent) != 2267 backref->generation) 2268 goto out_free_path; 2269 2270 btrfs_release_path(path); 2271 2272 start = backref->file_pos; 2273 if (backref->extent_offset < old->extent_offset + old->offset) 2274 start += old->extent_offset + old->offset - 2275 backref->extent_offset; 2276 2277 len = min(backref->extent_offset + backref->num_bytes, 2278 old->extent_offset + old->offset + old->len); 2279 len -= max(backref->extent_offset, old->extent_offset + old->offset); 2280 2281 ret = btrfs_drop_extents(trans, root, inode, start, 2282 start + len, 1); 2283 if (ret) 2284 goto out_free_path; 2285 again: 2286 key.objectid = 
btrfs_ino(inode); 2287 key.type = BTRFS_EXTENT_DATA_KEY; 2288 key.offset = start; 2289 2290 path->leave_spinning = 1; 2291 if (merge) { 2292 struct btrfs_file_extent_item *fi; 2293 u64 extent_len; 2294 struct btrfs_key found_key; 2295 2296 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 2297 if (ret < 0) 2298 goto out_free_path; 2299 2300 path->slots[0]--; 2301 leaf = path->nodes[0]; 2302 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2303 2304 fi = btrfs_item_ptr(leaf, path->slots[0], 2305 struct btrfs_file_extent_item); 2306 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 2307 2308 if (extent_len + found_key.offset == start && 2309 relink_is_mergable(leaf, fi, new)) { 2310 btrfs_set_file_extent_num_bytes(leaf, fi, 2311 extent_len + len); 2312 btrfs_mark_buffer_dirty(leaf); 2313 inode_add_bytes(inode, len); 2314 2315 ret = 1; 2316 goto out_free_path; 2317 } else { 2318 merge = false; 2319 btrfs_release_path(path); 2320 goto again; 2321 } 2322 } 2323 2324 ret = btrfs_insert_empty_item(trans, root, path, &key, 2325 sizeof(*extent)); 2326 if (ret) { 2327 btrfs_abort_transaction(trans, root, ret); 2328 goto out_free_path; 2329 } 2330 2331 leaf = path->nodes[0]; 2332 item = btrfs_item_ptr(leaf, path->slots[0], 2333 struct btrfs_file_extent_item); 2334 btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); 2335 btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); 2336 btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); 2337 btrfs_set_file_extent_num_bytes(leaf, item, len); 2338 btrfs_set_file_extent_ram_bytes(leaf, item, new->len); 2339 btrfs_set_file_extent_generation(leaf, item, trans->transid); 2340 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); 2341 btrfs_set_file_extent_compression(leaf, item, new->compress_type); 2342 btrfs_set_file_extent_encryption(leaf, item, 0); 2343 btrfs_set_file_extent_other_encoding(leaf, item, 0); 2344 2345 btrfs_mark_buffer_dirty(leaf); 2346 inode_add_bytes(inode, len); 
2347 btrfs_release_path(path); 2348 2349 ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2350 new->disk_len, 0, 2351 backref->root_id, backref->inum, 2352 new->file_pos, 0); /* start - extent_offset */ 2353 if (ret) { 2354 btrfs_abort_transaction(trans, root, ret); 2355 goto out_free_path; 2356 } 2357 2358 ret = 1; 2359 out_free_path: 2360 btrfs_release_path(path); 2361 path->leave_spinning = 0; 2362 btrfs_end_transaction(trans, root); 2363 out_unlock: 2364 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2365 &cached, GFP_NOFS); 2366 iput(inode); 2367 return ret; 2368 } 2369 2370 static void relink_file_extents(struct new_sa_defrag_extent *new) 2371 { 2372 struct btrfs_path *path; 2373 struct old_sa_defrag_extent *old, *tmp; 2374 struct sa_defrag_extent_backref *backref; 2375 struct sa_defrag_extent_backref *prev = NULL; 2376 struct inode *inode; 2377 struct btrfs_root *root; 2378 struct rb_node *node; 2379 int ret; 2380 2381 inode = new->inode; 2382 root = BTRFS_I(inode)->root; 2383 2384 path = btrfs_alloc_path(); 2385 if (!path) 2386 return; 2387 2388 if (!record_extent_backrefs(path, new)) { 2389 btrfs_free_path(path); 2390 goto out; 2391 } 2392 btrfs_release_path(path); 2393 2394 while (1) { 2395 node = rb_first(&new->root); 2396 if (!node) 2397 break; 2398 rb_erase(node, &new->root); 2399 2400 backref = rb_entry(node, struct sa_defrag_extent_backref, node); 2401 2402 ret = relink_extent_backref(path, prev, backref); 2403 WARN_ON(ret < 0); 2404 2405 kfree(prev); 2406 2407 if (ret == 1) 2408 prev = backref; 2409 else 2410 prev = NULL; 2411 cond_resched(); 2412 } 2413 kfree(prev); 2414 2415 btrfs_free_path(path); 2416 2417 list_for_each_entry_safe(old, tmp, &new->head, list) { 2418 list_del(&old->list); 2419 kfree(old); 2420 } 2421 out: 2422 atomic_dec(&root->fs_info->defrag_running); 2423 wake_up(&root->fs_info->transaction_wait); 2424 2425 kfree(new); 2426 } 2427 2428 static struct new_sa_defrag_extent * 2429 
record_old_file_extents(struct inode *inode, 2430 struct btrfs_ordered_extent *ordered) 2431 { 2432 struct btrfs_root *root = BTRFS_I(inode)->root; 2433 struct btrfs_path *path; 2434 struct btrfs_key key; 2435 struct old_sa_defrag_extent *old, *tmp; 2436 struct new_sa_defrag_extent *new; 2437 int ret; 2438 2439 new = kmalloc(sizeof(*new), GFP_NOFS); 2440 if (!new) 2441 return NULL; 2442 2443 new->inode = inode; 2444 new->file_pos = ordered->file_offset; 2445 new->len = ordered->len; 2446 new->bytenr = ordered->start; 2447 new->disk_len = ordered->disk_len; 2448 new->compress_type = ordered->compress_type; 2449 new->root = RB_ROOT; 2450 INIT_LIST_HEAD(&new->head); 2451 2452 path = btrfs_alloc_path(); 2453 if (!path) 2454 goto out_kfree; 2455 2456 key.objectid = btrfs_ino(inode); 2457 key.type = BTRFS_EXTENT_DATA_KEY; 2458 key.offset = new->file_pos; 2459 2460 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2461 if (ret < 0) 2462 goto out_free_path; 2463 if (ret > 0 && path->slots[0] > 0) 2464 path->slots[0]--; 2465 2466 /* find out all the old extents for the file range */ 2467 while (1) { 2468 struct btrfs_file_extent_item *extent; 2469 struct extent_buffer *l; 2470 int slot; 2471 u64 num_bytes; 2472 u64 offset; 2473 u64 end; 2474 u64 disk_bytenr; 2475 u64 extent_offset; 2476 2477 l = path->nodes[0]; 2478 slot = path->slots[0]; 2479 2480 if (slot >= btrfs_header_nritems(l)) { 2481 ret = btrfs_next_leaf(root, path); 2482 if (ret < 0) 2483 goto out_free_list; 2484 else if (ret > 0) 2485 break; 2486 continue; 2487 } 2488 2489 btrfs_item_key_to_cpu(l, &key, slot); 2490 2491 if (key.objectid != btrfs_ino(inode)) 2492 break; 2493 if (key.type != BTRFS_EXTENT_DATA_KEY) 2494 break; 2495 if (key.offset >= new->file_pos + new->len) 2496 break; 2497 2498 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); 2499 2500 num_bytes = btrfs_file_extent_num_bytes(l, extent); 2501 if (key.offset + num_bytes < new->file_pos) 2502 goto next; 2503 2504 disk_bytenr = 
btrfs_file_extent_disk_bytenr(l, extent); 2505 if (!disk_bytenr) 2506 goto next; 2507 2508 extent_offset = btrfs_file_extent_offset(l, extent); 2509 2510 old = kmalloc(sizeof(*old), GFP_NOFS); 2511 if (!old) 2512 goto out_free_list; 2513 2514 offset = max(new->file_pos, key.offset); 2515 end = min(new->file_pos + new->len, key.offset + num_bytes); 2516 2517 old->bytenr = disk_bytenr; 2518 old->extent_offset = extent_offset; 2519 old->offset = offset - key.offset; 2520 old->len = end - offset; 2521 old->new = new; 2522 old->count = 0; 2523 list_add_tail(&old->list, &new->head); 2524 next: 2525 path->slots[0]++; 2526 cond_resched(); 2527 } 2528 2529 btrfs_free_path(path); 2530 atomic_inc(&root->fs_info->defrag_running); 2531 2532 return new; 2533 2534 out_free_list: 2535 list_for_each_entry_safe(old, tmp, &new->head, list) { 2536 list_del(&old->list); 2537 kfree(old); 2538 } 2539 out_free_path: 2540 btrfs_free_path(path); 2541 out_kfree: 2542 kfree(new); 2543 return NULL; 2544 } 2545 2546 /* 2547 * helper function for btrfs_finish_ordered_io, this 2548 * just reads in some of the csum leaves to prime them into ram 2549 * before we start the transaction. It limits the amount of btree 2550 * reads required while inside the transaction. 2551 */ 2552 /* as ordered data IO finishes, this gets called so we can finish 2553 * an ordered extent if the range of bytes in the file it covers are 2554 * fully written. 
2555 */ 2556 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 2557 { 2558 struct inode *inode = ordered_extent->inode; 2559 struct btrfs_root *root = BTRFS_I(inode)->root; 2560 struct btrfs_trans_handle *trans = NULL; 2561 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2562 struct extent_state *cached_state = NULL; 2563 struct new_sa_defrag_extent *new = NULL; 2564 int compress_type = 0; 2565 int ret = 0; 2566 u64 logical_len = ordered_extent->len; 2567 bool nolock; 2568 bool truncated = false; 2569 2570 nolock = btrfs_is_free_space_inode(inode); 2571 2572 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 2573 ret = -EIO; 2574 goto out; 2575 } 2576 2577 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2578 truncated = true; 2579 logical_len = ordered_extent->truncated_len; 2580 /* Truncated the entire extent, don't bother adding */ 2581 if (!logical_len) 2582 goto out; 2583 } 2584 2585 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2586 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2587 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2588 if (nolock) 2589 trans = btrfs_join_transaction_nolock(root); 2590 else 2591 trans = btrfs_join_transaction(root); 2592 if (IS_ERR(trans)) { 2593 ret = PTR_ERR(trans); 2594 trans = NULL; 2595 goto out; 2596 } 2597 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2598 ret = btrfs_update_inode_fallback(trans, root, inode); 2599 if (ret) /* -ENOMEM or corruption */ 2600 btrfs_abort_transaction(trans, root, ret); 2601 goto out; 2602 } 2603 2604 lock_extent_bits(io_tree, ordered_extent->file_offset, 2605 ordered_extent->file_offset + ordered_extent->len - 1, 2606 0, &cached_state); 2607 2608 ret = test_range_bit(io_tree, ordered_extent->file_offset, 2609 ordered_extent->file_offset + ordered_extent->len - 1, 2610 EXTENT_DEFRAG, 1, cached_state); 2611 if (ret) { 2612 u64 last_snapshot = 
btrfs_root_last_snapshot(&root->root_item); 2613 if (last_snapshot >= BTRFS_I(inode)->generation) 2614 /* the inode is shared */ 2615 new = record_old_file_extents(inode, ordered_extent); 2616 2617 clear_extent_bit(io_tree, ordered_extent->file_offset, 2618 ordered_extent->file_offset + ordered_extent->len - 1, 2619 EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); 2620 } 2621 2622 if (nolock) 2623 trans = btrfs_join_transaction_nolock(root); 2624 else 2625 trans = btrfs_join_transaction(root); 2626 if (IS_ERR(trans)) { 2627 ret = PTR_ERR(trans); 2628 trans = NULL; 2629 goto out_unlock; 2630 } 2631 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 2632 2633 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 2634 compress_type = ordered_extent->compress_type; 2635 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 2636 BUG_ON(compress_type); 2637 ret = btrfs_mark_extent_written(trans, inode, 2638 ordered_extent->file_offset, 2639 ordered_extent->file_offset + 2640 logical_len); 2641 } else { 2642 BUG_ON(root == root->fs_info->tree_root); 2643 ret = insert_reserved_file_extent(trans, inode, 2644 ordered_extent->file_offset, 2645 ordered_extent->start, 2646 ordered_extent->disk_len, 2647 logical_len, logical_len, 2648 compress_type, 0, 0, 2649 BTRFS_FILE_EXTENT_REG); 2650 } 2651 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 2652 ordered_extent->file_offset, ordered_extent->len, 2653 trans->transid); 2654 if (ret < 0) { 2655 btrfs_abort_transaction(trans, root, ret); 2656 goto out_unlock; 2657 } 2658 2659 add_pending_csums(trans, inode, ordered_extent->file_offset, 2660 &ordered_extent->list); 2661 2662 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2663 ret = btrfs_update_inode_fallback(trans, root, inode); 2664 if (ret) { /* -ENOMEM or corruption */ 2665 btrfs_abort_transaction(trans, root, ret); 2666 goto out_unlock; 2667 } 2668 ret = 0; 2669 out_unlock: 2670 unlock_extent_cached(io_tree, ordered_extent->file_offset, 2671 
ordered_extent->file_offset + 2672 ordered_extent->len - 1, &cached_state, GFP_NOFS); 2673 out: 2674 if (root != root->fs_info->tree_root) 2675 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 2676 if (trans) 2677 btrfs_end_transaction(trans, root); 2678 2679 if (ret || truncated) { 2680 u64 start, end; 2681 2682 if (truncated) 2683 start = ordered_extent->file_offset + logical_len; 2684 else 2685 start = ordered_extent->file_offset; 2686 end = ordered_extent->file_offset + ordered_extent->len - 1; 2687 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); 2688 2689 /* Drop the cache for the part of the extent we didn't write. */ 2690 btrfs_drop_extent_cache(inode, start, end, 0); 2691 2692 /* 2693 * If the ordered extent had an IOERR or something else went 2694 * wrong we need to return the space for this ordered extent 2695 * back to the allocator. We only free the extent in the 2696 * truncated case if we didn't write out the extent at all. 2697 */ 2698 if ((ret || !logical_len) && 2699 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 2700 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) 2701 btrfs_free_reserved_extent(root, ordered_extent->start, 2702 ordered_extent->disk_len); 2703 } 2704 2705 2706 /* 2707 * This needs to be done to make sure anybody waiting knows we are done 2708 * updating everything for this ordered extent. 
2709 */ 2710 btrfs_remove_ordered_extent(inode, ordered_extent); 2711 2712 /* for snapshot-aware defrag */ 2713 if (new) 2714 relink_file_extents(new); 2715 2716 /* once for us */ 2717 btrfs_put_ordered_extent(ordered_extent); 2718 /* once for the tree */ 2719 btrfs_put_ordered_extent(ordered_extent); 2720 2721 return ret; 2722 } 2723 2724 static void finish_ordered_fn(struct btrfs_work *work) 2725 { 2726 struct btrfs_ordered_extent *ordered_extent; 2727 ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 2728 btrfs_finish_ordered_io(ordered_extent); 2729 } 2730 2731 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 2732 struct extent_state *state, int uptodate) 2733 { 2734 struct inode *inode = page->mapping->host; 2735 struct btrfs_root *root = BTRFS_I(inode)->root; 2736 struct btrfs_ordered_extent *ordered_extent = NULL; 2737 struct btrfs_workers *workers; 2738 2739 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2740 2741 ClearPagePrivate2(page); 2742 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 2743 end - start + 1, uptodate)) 2744 return 0; 2745 2746 ordered_extent->work.func = finish_ordered_fn; 2747 ordered_extent->work.flags = 0; 2748 2749 if (btrfs_is_free_space_inode(inode)) 2750 workers = &root->fs_info->endio_freespace_worker; 2751 else 2752 workers = &root->fs_info->endio_write_workers; 2753 btrfs_queue_worker(workers, &ordered_extent->work); 2754 2755 return 0; 2756 } 2757 2758 /* 2759 * when reads are done, we need to check csums to verify the data is correct 2760 * if there's a match, we allow the bio to finish. If not, the code in 2761 * extent_io.c will try to find good copies for us. 
2762 */ 2763 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 2764 u64 phy_offset, struct page *page, 2765 u64 start, u64 end, int mirror) 2766 { 2767 size_t offset = start - page_offset(page); 2768 struct inode *inode = page->mapping->host; 2769 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2770 char *kaddr; 2771 struct btrfs_root *root = BTRFS_I(inode)->root; 2772 u32 csum_expected; 2773 u32 csum = ~(u32)0; 2774 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 2775 DEFAULT_RATELIMIT_BURST); 2776 2777 if (PageChecked(page)) { 2778 ClearPageChecked(page); 2779 goto good; 2780 } 2781 2782 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2783 goto good; 2784 2785 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2786 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2787 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 2788 GFP_NOFS); 2789 return 0; 2790 } 2791 2792 phy_offset >>= inode->i_sb->s_blocksize_bits; 2793 csum_expected = *(((u32 *)io_bio->csum) + phy_offset); 2794 2795 kaddr = kmap_atomic(page); 2796 csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); 2797 btrfs_csum_final(csum, (char *)&csum); 2798 if (csum != csum_expected) 2799 goto zeroit; 2800 2801 kunmap_atomic(kaddr); 2802 good: 2803 return 0; 2804 2805 zeroit: 2806 if (__ratelimit(&_rs)) 2807 btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 2808 btrfs_ino(page->mapping->host), start, csum, csum_expected); 2809 memset(kaddr + offset, 1, end - start + 1); 2810 flush_dcache_page(page); 2811 kunmap_atomic(kaddr); 2812 if (csum_expected == 0) 2813 return 0; 2814 return -EIO; 2815 } 2816 2817 struct delayed_iput { 2818 struct list_head list; 2819 struct inode *inode; 2820 }; 2821 2822 /* JDM: If this is fs-wide, why can't we add a pointer to 2823 * btrfs_inode instead and avoid the allocation? 
*/ 2824 void btrfs_add_delayed_iput(struct inode *inode) 2825 { 2826 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2827 struct delayed_iput *delayed; 2828 2829 if (atomic_add_unless(&inode->i_count, -1, 1)) 2830 return; 2831 2832 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2833 delayed->inode = inode; 2834 2835 spin_lock(&fs_info->delayed_iput_lock); 2836 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2837 spin_unlock(&fs_info->delayed_iput_lock); 2838 } 2839 2840 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2841 { 2842 LIST_HEAD(list); 2843 struct btrfs_fs_info *fs_info = root->fs_info; 2844 struct delayed_iput *delayed; 2845 int empty; 2846 2847 spin_lock(&fs_info->delayed_iput_lock); 2848 empty = list_empty(&fs_info->delayed_iputs); 2849 spin_unlock(&fs_info->delayed_iput_lock); 2850 if (empty) 2851 return; 2852 2853 spin_lock(&fs_info->delayed_iput_lock); 2854 list_splice_init(&fs_info->delayed_iputs, &list); 2855 spin_unlock(&fs_info->delayed_iput_lock); 2856 2857 while (!list_empty(&list)) { 2858 delayed = list_entry(list.next, struct delayed_iput, list); 2859 list_del(&delayed->list); 2860 iput(delayed->inode); 2861 kfree(delayed); 2862 } 2863 } 2864 2865 /* 2866 * This is called in transaction commit time. If there are no orphan 2867 * files in the subvolume, it removes orphan item and frees block_rsv 2868 * structure. 
2869 */ 2870 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2871 struct btrfs_root *root) 2872 { 2873 struct btrfs_block_rsv *block_rsv; 2874 int ret; 2875 2876 if (atomic_read(&root->orphan_inodes) || 2877 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2878 return; 2879 2880 spin_lock(&root->orphan_lock); 2881 if (atomic_read(&root->orphan_inodes)) { 2882 spin_unlock(&root->orphan_lock); 2883 return; 2884 } 2885 2886 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { 2887 spin_unlock(&root->orphan_lock); 2888 return; 2889 } 2890 2891 block_rsv = root->orphan_block_rsv; 2892 root->orphan_block_rsv = NULL; 2893 spin_unlock(&root->orphan_lock); 2894 2895 if (root->orphan_item_inserted && 2896 btrfs_root_refs(&root->root_item) > 0) { 2897 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2898 root->root_key.objectid); 2899 if (ret) 2900 btrfs_abort_transaction(trans, root, ret); 2901 else 2902 root->orphan_item_inserted = 0; 2903 } 2904 2905 if (block_rsv) { 2906 WARN_ON(block_rsv->size > 0); 2907 btrfs_free_block_rsv(root, block_rsv); 2908 } 2909 } 2910 2911 /* 2912 * This creates an orphan entry for the given inode in case something goes 2913 * wrong in the middle of an unlink/truncate. 2914 * 2915 * NOTE: caller of this function should reserve 5 units of metadata for 2916 * this function. 
2917 */ 2918 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2919 { 2920 struct btrfs_root *root = BTRFS_I(inode)->root; 2921 struct btrfs_block_rsv *block_rsv = NULL; 2922 int reserve = 0; 2923 int insert = 0; 2924 int ret; 2925 2926 if (!root->orphan_block_rsv) { 2927 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 2928 if (!block_rsv) 2929 return -ENOMEM; 2930 } 2931 2932 spin_lock(&root->orphan_lock); 2933 if (!root->orphan_block_rsv) { 2934 root->orphan_block_rsv = block_rsv; 2935 } else if (block_rsv) { 2936 btrfs_free_block_rsv(root, block_rsv); 2937 block_rsv = NULL; 2938 } 2939 2940 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2941 &BTRFS_I(inode)->runtime_flags)) { 2942 #if 0 2943 /* 2944 * For proper ENOSPC handling, we should do orphan 2945 * cleanup when mounting. But this introduces backward 2946 * compatibility issue. 2947 */ 2948 if (!xchg(&root->orphan_item_inserted, 1)) 2949 insert = 2; 2950 else 2951 insert = 1; 2952 #endif 2953 insert = 1; 2954 atomic_inc(&root->orphan_inodes); 2955 } 2956 2957 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2958 &BTRFS_I(inode)->runtime_flags)) 2959 reserve = 1; 2960 spin_unlock(&root->orphan_lock); 2961 2962 /* grab metadata reservation from transaction handle */ 2963 if (reserve) { 2964 ret = btrfs_orphan_reserve_metadata(trans, inode); 2965 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? 
JDM */ 2966 } 2967 2968 /* insert an orphan item to track this unlinked/truncated file */ 2969 if (insert >= 1) { 2970 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2971 if (ret) { 2972 if (reserve) { 2973 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2974 &BTRFS_I(inode)->runtime_flags); 2975 btrfs_orphan_release_metadata(inode); 2976 } 2977 if (ret != -EEXIST) { 2978 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2979 &BTRFS_I(inode)->runtime_flags); 2980 btrfs_abort_transaction(trans, root, ret); 2981 return ret; 2982 } 2983 } 2984 ret = 0; 2985 } 2986 2987 /* insert an orphan item to track subvolume contains orphan files */ 2988 if (insert >= 2) { 2989 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2990 root->root_key.objectid); 2991 if (ret && ret != -EEXIST) { 2992 btrfs_abort_transaction(trans, root, ret); 2993 return ret; 2994 } 2995 } 2996 return 0; 2997 } 2998 2999 /* 3000 * We have done the truncate/delete so we can go ahead and remove the orphan 3001 * item for this particular inode. 3002 */ 3003 static int btrfs_orphan_del(struct btrfs_trans_handle *trans, 3004 struct inode *inode) 3005 { 3006 struct btrfs_root *root = BTRFS_I(inode)->root; 3007 int delete_item = 0; 3008 int release_rsv = 0; 3009 int ret = 0; 3010 3011 spin_lock(&root->orphan_lock); 3012 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3013 &BTRFS_I(inode)->runtime_flags)) 3014 delete_item = 1; 3015 3016 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 3017 &BTRFS_I(inode)->runtime_flags)) 3018 release_rsv = 1; 3019 spin_unlock(&root->orphan_lock); 3020 3021 if (trans && delete_item) 3022 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); 3023 3024 if (release_rsv) { 3025 btrfs_orphan_release_metadata(inode); 3026 atomic_dec(&root->orphan_inodes); 3027 } 3028 3029 return ret; 3030 } 3031 3032 /* 3033 * this cleans up any orphans that may be left on the list from the last use 3034 * of this root. 
3035 */ 3036 int btrfs_orphan_cleanup(struct btrfs_root *root) 3037 { 3038 struct btrfs_path *path; 3039 struct extent_buffer *leaf; 3040 struct btrfs_key key, found_key; 3041 struct btrfs_trans_handle *trans; 3042 struct inode *inode; 3043 u64 last_objectid = 0; 3044 int ret = 0, nr_unlink = 0, nr_truncate = 0; 3045 3046 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 3047 return 0; 3048 3049 path = btrfs_alloc_path(); 3050 if (!path) { 3051 ret = -ENOMEM; 3052 goto out; 3053 } 3054 path->reada = -1; 3055 3056 key.objectid = BTRFS_ORPHAN_OBJECTID; 3057 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 3058 key.offset = (u64)-1; 3059 3060 while (1) { 3061 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3062 if (ret < 0) 3063 goto out; 3064 3065 /* 3066 * if ret == 0 means we found what we were searching for, which 3067 * is weird, but possible, so only screw with path if we didn't 3068 * find the key and see if we have stuff that matches 3069 */ 3070 if (ret > 0) { 3071 ret = 0; 3072 if (path->slots[0] == 0) 3073 break; 3074 path->slots[0]--; 3075 } 3076 3077 /* pull out the item */ 3078 leaf = path->nodes[0]; 3079 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3080 3081 /* make sure the item matches what we want */ 3082 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3083 break; 3084 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 3085 break; 3086 3087 /* release the path since we're done with it */ 3088 btrfs_release_path(path); 3089 3090 /* 3091 * this is where we are basically btrfs_lookup, without the 3092 * crossing root thing. we store the inode number in the 3093 * offset of the orphan item. 
3094 */ 3095 3096 if (found_key.offset == last_objectid) { 3097 btrfs_err(root->fs_info, 3098 "Error removing orphan entry, stopping orphan cleanup"); 3099 ret = -EINVAL; 3100 goto out; 3101 } 3102 3103 last_objectid = found_key.offset; 3104 3105 found_key.objectid = found_key.offset; 3106 found_key.type = BTRFS_INODE_ITEM_KEY; 3107 found_key.offset = 0; 3108 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 3109 ret = PTR_ERR_OR_ZERO(inode); 3110 if (ret && ret != -ESTALE) 3111 goto out; 3112 3113 if (ret == -ESTALE && root == root->fs_info->tree_root) { 3114 struct btrfs_root *dead_root; 3115 struct btrfs_fs_info *fs_info = root->fs_info; 3116 int is_dead_root = 0; 3117 3118 /* 3119 * this is an orphan in the tree root. Currently these 3120 * could come from 2 sources: 3121 * a) a snapshot deletion in progress 3122 * b) a free space cache inode 3123 * We need to distinguish those two, as the snapshot 3124 * orphan must not get deleted. 3125 * find_dead_roots already ran before us, so if this 3126 * is a snapshot deletion, we should find the root 3127 * in the dead_roots list 3128 */ 3129 spin_lock(&fs_info->trans_lock); 3130 list_for_each_entry(dead_root, &fs_info->dead_roots, 3131 root_list) { 3132 if (dead_root->root_key.objectid == 3133 found_key.objectid) { 3134 is_dead_root = 1; 3135 break; 3136 } 3137 } 3138 spin_unlock(&fs_info->trans_lock); 3139 if (is_dead_root) { 3140 /* prevent this orphan from being found again */ 3141 key.offset = found_key.objectid - 1; 3142 continue; 3143 } 3144 } 3145 /* 3146 * Inode is already gone but the orphan item is still there, 3147 * kill the orphan item. 
3148 */ 3149 if (ret == -ESTALE) { 3150 trans = btrfs_start_transaction(root, 1); 3151 if (IS_ERR(trans)) { 3152 ret = PTR_ERR(trans); 3153 goto out; 3154 } 3155 btrfs_debug(root->fs_info, "auto deleting %Lu", 3156 found_key.objectid); 3157 ret = btrfs_del_orphan_item(trans, root, 3158 found_key.objectid); 3159 btrfs_end_transaction(trans, root); 3160 if (ret) 3161 goto out; 3162 continue; 3163 } 3164 3165 /* 3166 * add this inode to the orphan list so btrfs_orphan_del does 3167 * the proper thing when we hit it 3168 */ 3169 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3170 &BTRFS_I(inode)->runtime_flags); 3171 atomic_inc(&root->orphan_inodes); 3172 3173 /* if we have links, this was a truncate, lets do that */ 3174 if (inode->i_nlink) { 3175 if (!S_ISREG(inode->i_mode)) { 3176 WARN_ON(1); 3177 iput(inode); 3178 continue; 3179 } 3180 nr_truncate++; 3181 3182 /* 1 for the orphan item deletion. */ 3183 trans = btrfs_start_transaction(root, 1); 3184 if (IS_ERR(trans)) { 3185 iput(inode); 3186 ret = PTR_ERR(trans); 3187 goto out; 3188 } 3189 ret = btrfs_orphan_add(trans, inode); 3190 btrfs_end_transaction(trans, root); 3191 if (ret) { 3192 iput(inode); 3193 goto out; 3194 } 3195 3196 ret = btrfs_truncate(inode); 3197 if (ret) 3198 btrfs_orphan_del(NULL, inode); 3199 } else { 3200 nr_unlink++; 3201 } 3202 3203 /* this will do delete_inode and everything for us */ 3204 iput(inode); 3205 if (ret) 3206 goto out; 3207 } 3208 /* release the path since we're done with it */ 3209 btrfs_release_path(path); 3210 3211 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 3212 3213 if (root->orphan_block_rsv) 3214 btrfs_block_rsv_release(root, root->orphan_block_rsv, 3215 (u64)-1); 3216 3217 if (root->orphan_block_rsv || root->orphan_item_inserted) { 3218 trans = btrfs_join_transaction(root); 3219 if (!IS_ERR(trans)) 3220 btrfs_end_transaction(trans, root); 3221 } 3222 3223 if (nr_unlink) 3224 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); 3225 if (nr_truncate) 3226 
btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); 3227 3228 out: 3229 if (ret) 3230 btrfs_crit(root->fs_info, 3231 "could not do orphan cleanup %d", ret); 3232 btrfs_free_path(path); 3233 return ret; 3234 } 3235 3236 /* 3237 * very simple check to peek ahead in the leaf looking for xattrs. If we 3238 * don't find any xattrs, we know there can't be any acls. 3239 * 3240 * slot is the slot the inode is in, objectid is the objectid of the inode 3241 */ 3242 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 3243 int slot, u64 objectid) 3244 { 3245 u32 nritems = btrfs_header_nritems(leaf); 3246 struct btrfs_key found_key; 3247 static u64 xattr_access = 0; 3248 static u64 xattr_default = 0; 3249 int scanned = 0; 3250 3251 if (!xattr_access) { 3252 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, 3253 strlen(POSIX_ACL_XATTR_ACCESS)); 3254 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, 3255 strlen(POSIX_ACL_XATTR_DEFAULT)); 3256 } 3257 3258 slot++; 3259 while (slot < nritems) { 3260 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3261 3262 /* we found a different objectid, there must not be acls */ 3263 if (found_key.objectid != objectid) 3264 return 0; 3265 3266 /* we found an xattr, assume we've got an acl */ 3267 if (found_key.type == BTRFS_XATTR_ITEM_KEY) { 3268 if (found_key.offset == xattr_access || 3269 found_key.offset == xattr_default) 3270 return 1; 3271 } 3272 3273 /* 3274 * we found a key greater than an xattr key, there can't 3275 * be any acls later on 3276 */ 3277 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 3278 return 0; 3279 3280 slot++; 3281 scanned++; 3282 3283 /* 3284 * it goes inode, inode backrefs, xattrs, extents, 3285 * so if there are a ton of hard links to an inode there can 3286 * be a lot of backrefs. 
Don't waste time searching too hard, 3287 * this is just an optimization 3288 */ 3289 if (scanned >= 8) 3290 break; 3291 } 3292 /* we hit the end of the leaf before we found an xattr or 3293 * something larger than an xattr. We have to assume the inode 3294 * has acls 3295 */ 3296 return 1; 3297 } 3298 3299 /* 3300 * read an inode from the btree into the in-memory inode 3301 */ 3302 static void btrfs_read_locked_inode(struct inode *inode) 3303 { 3304 struct btrfs_path *path; 3305 struct extent_buffer *leaf; 3306 struct btrfs_inode_item *inode_item; 3307 struct btrfs_timespec *tspec; 3308 struct btrfs_root *root = BTRFS_I(inode)->root; 3309 struct btrfs_key location; 3310 int maybe_acls; 3311 u32 rdev; 3312 int ret; 3313 bool filled = false; 3314 3315 ret = btrfs_fill_inode(inode, &rdev); 3316 if (!ret) 3317 filled = true; 3318 3319 path = btrfs_alloc_path(); 3320 if (!path) 3321 goto make_bad; 3322 3323 path->leave_spinning = 1; 3324 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3325 3326 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3327 if (ret) 3328 goto make_bad; 3329 3330 leaf = path->nodes[0]; 3331 3332 if (filled) 3333 goto cache_acl; 3334 3335 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3336 struct btrfs_inode_item); 3337 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 3338 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); 3339 i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); 3340 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3341 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3342 3343 tspec = btrfs_inode_atime(inode_item); 3344 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3345 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3346 3347 tspec = btrfs_inode_mtime(inode_item); 3348 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3349 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3350 3351 tspec = btrfs_inode_ctime(inode_item); 3352 
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3353 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3354 3355 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3356 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3357 BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); 3358 3359 /* 3360 * If we were modified in the current generation and evicted from memory 3361 * and then re-read we need to do a full sync since we don't have any 3362 * idea about which extents were modified before we were evicted from 3363 * cache. 3364 */ 3365 if (BTRFS_I(inode)->last_trans == root->fs_info->generation) 3366 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3367 &BTRFS_I(inode)->runtime_flags); 3368 3369 inode->i_version = btrfs_inode_sequence(leaf, inode_item); 3370 inode->i_generation = BTRFS_I(inode)->generation; 3371 inode->i_rdev = 0; 3372 rdev = btrfs_inode_rdev(leaf, inode_item); 3373 3374 BTRFS_I(inode)->index_cnt = (u64)-1; 3375 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 3376 cache_acl: 3377 /* 3378 * try to precache a NULL acl entry for files that don't have 3379 * any xattrs or acls 3380 */ 3381 maybe_acls = acls_after_inode_item(leaf, path->slots[0], 3382 btrfs_ino(inode)); 3383 if (!maybe_acls) 3384 cache_no_acl(inode); 3385 3386 btrfs_free_path(path); 3387 3388 switch (inode->i_mode & S_IFMT) { 3389 case S_IFREG: 3390 inode->i_mapping->a_ops = &btrfs_aops; 3391 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 3392 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3393 inode->i_fop = &btrfs_file_operations; 3394 inode->i_op = &btrfs_file_inode_operations; 3395 break; 3396 case S_IFDIR: 3397 inode->i_fop = &btrfs_dir_file_operations; 3398 if (root == root->fs_info->tree_root) 3399 inode->i_op = &btrfs_dir_ro_inode_operations; 3400 else 3401 inode->i_op = &btrfs_dir_inode_operations; 3402 break; 3403 case S_IFLNK: 3404 inode->i_op = &btrfs_symlink_inode_operations; 3405 
inode->i_mapping->a_ops = &btrfs_symlink_aops; 3406 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 3407 break; 3408 default: 3409 inode->i_op = &btrfs_special_inode_operations; 3410 init_special_inode(inode, inode->i_mode, rdev); 3411 break; 3412 } 3413 3414 btrfs_update_iflags(inode); 3415 return; 3416 3417 make_bad: 3418 btrfs_free_path(path); 3419 make_bad_inode(inode); 3420 } 3421 3422 /* 3423 * given a leaf and an inode, copy the inode fields into the leaf 3424 */ 3425 static void fill_inode_item(struct btrfs_trans_handle *trans, 3426 struct extent_buffer *leaf, 3427 struct btrfs_inode_item *item, 3428 struct inode *inode) 3429 { 3430 struct btrfs_map_token token; 3431 3432 btrfs_init_map_token(&token); 3433 3434 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3435 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3436 btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, 3437 &token); 3438 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3439 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3440 3441 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3442 inode->i_atime.tv_sec, &token); 3443 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3444 inode->i_atime.tv_nsec, &token); 3445 3446 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3447 inode->i_mtime.tv_sec, &token); 3448 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3449 inode->i_mtime.tv_nsec, &token); 3450 3451 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3452 inode->i_ctime.tv_sec, &token); 3453 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3454 inode->i_ctime.tv_nsec, &token); 3455 3456 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3457 &token); 3458 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3459 &token); 3460 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3461 
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3462 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3463 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3464 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3465 } 3466 3467 /* 3468 * copy everything in the in-memory inode into the btree. 3469 */ 3470 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, 3471 struct btrfs_root *root, struct inode *inode) 3472 { 3473 struct btrfs_inode_item *inode_item; 3474 struct btrfs_path *path; 3475 struct extent_buffer *leaf; 3476 int ret; 3477 3478 path = btrfs_alloc_path(); 3479 if (!path) 3480 return -ENOMEM; 3481 3482 path->leave_spinning = 1; 3483 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, 3484 1); 3485 if (ret) { 3486 if (ret > 0) 3487 ret = -ENOENT; 3488 goto failed; 3489 } 3490 3491 btrfs_unlock_up_safe(path, 1); 3492 leaf = path->nodes[0]; 3493 inode_item = btrfs_item_ptr(leaf, path->slots[0], 3494 struct btrfs_inode_item); 3495 3496 fill_inode_item(trans, leaf, inode_item, inode); 3497 btrfs_mark_buffer_dirty(leaf); 3498 btrfs_set_inode_last_trans(trans, inode); 3499 ret = 0; 3500 failed: 3501 btrfs_free_path(path); 3502 return ret; 3503 } 3504 3505 /* 3506 * copy everything in the in-memory inode into the btree. 3507 */ 3508 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 3509 struct btrfs_root *root, struct inode *inode) 3510 { 3511 int ret; 3512 3513 /* 3514 * If the inode is a free space inode, we can deadlock during commit 3515 * if we put it into the delayed code. 
3516 * 3517 * The data relocation inode should also be directly updated 3518 * without delay 3519 */ 3520 if (!btrfs_is_free_space_inode(inode) 3521 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 3522 btrfs_update_root_times(trans, root); 3523 3524 ret = btrfs_delayed_update_inode(trans, root, inode); 3525 if (!ret) 3526 btrfs_set_inode_last_trans(trans, inode); 3527 return ret; 3528 } 3529 3530 return btrfs_update_inode_item(trans, root, inode); 3531 } 3532 3533 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, 3534 struct btrfs_root *root, 3535 struct inode *inode) 3536 { 3537 int ret; 3538 3539 ret = btrfs_update_inode(trans, root, inode); 3540 if (ret == -ENOSPC) 3541 return btrfs_update_inode_item(trans, root, inode); 3542 return ret; 3543 } 3544 3545 /* 3546 * unlink helper that gets used here in inode.c and in the tree logging 3547 * recovery code. It remove a link in a directory with a given name, and 3548 * also drops the back refs in the inode to the directory 3549 */ 3550 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3551 struct btrfs_root *root, 3552 struct inode *dir, struct inode *inode, 3553 const char *name, int name_len) 3554 { 3555 struct btrfs_path *path; 3556 int ret = 0; 3557 struct extent_buffer *leaf; 3558 struct btrfs_dir_item *di; 3559 struct btrfs_key key; 3560 u64 index; 3561 u64 ino = btrfs_ino(inode); 3562 u64 dir_ino = btrfs_ino(dir); 3563 3564 path = btrfs_alloc_path(); 3565 if (!path) { 3566 ret = -ENOMEM; 3567 goto out; 3568 } 3569 3570 path->leave_spinning = 1; 3571 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3572 name, name_len, -1); 3573 if (IS_ERR(di)) { 3574 ret = PTR_ERR(di); 3575 goto err; 3576 } 3577 if (!di) { 3578 ret = -ENOENT; 3579 goto err; 3580 } 3581 leaf = path->nodes[0]; 3582 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3583 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3584 if (ret) 3585 goto err; 3586 btrfs_release_path(path); 3587 
3588 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, 3589 dir_ino, &index); 3590 if (ret) { 3591 btrfs_info(root->fs_info, 3592 "failed to delete reference to %.*s, inode %llu parent %llu", 3593 name_len, name, ino, dir_ino); 3594 btrfs_abort_transaction(trans, root, ret); 3595 goto err; 3596 } 3597 3598 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3599 if (ret) { 3600 btrfs_abort_transaction(trans, root, ret); 3601 goto err; 3602 } 3603 3604 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 3605 inode, dir_ino); 3606 if (ret != 0 && ret != -ENOENT) { 3607 btrfs_abort_transaction(trans, root, ret); 3608 goto err; 3609 } 3610 3611 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 3612 dir, index); 3613 if (ret == -ENOENT) 3614 ret = 0; 3615 else if (ret) 3616 btrfs_abort_transaction(trans, root, ret); 3617 err: 3618 btrfs_free_path(path); 3619 if (ret) 3620 goto out; 3621 3622 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3623 inode_inc_iversion(inode); 3624 inode_inc_iversion(dir); 3625 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3626 ret = btrfs_update_inode(trans, root, dir); 3627 out: 3628 return ret; 3629 } 3630 3631 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 3632 struct btrfs_root *root, 3633 struct inode *dir, struct inode *inode, 3634 const char *name, int name_len) 3635 { 3636 int ret; 3637 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 3638 if (!ret) { 3639 btrfs_drop_nlink(inode); 3640 ret = btrfs_update_inode(trans, root, inode); 3641 } 3642 return ret; 3643 } 3644 3645 /* 3646 * helper to start transaction for unlink and rmdir. 3647 * 3648 * unlink and rmdir are special in btrfs, they do not always free space, so 3649 * if we cannot make our reservations the normal way try and see if there is 3650 * plenty of slack room in the global reserve to migrate, otherwise we cannot 3651 * allow the unlink to occur. 
3652 */ 3653 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) 3654 { 3655 struct btrfs_trans_handle *trans; 3656 struct btrfs_root *root = BTRFS_I(dir)->root; 3657 int ret; 3658 3659 /* 3660 * 1 for the possible orphan item 3661 * 1 for the dir item 3662 * 1 for the dir index 3663 * 1 for the inode ref 3664 * 1 for the inode 3665 */ 3666 trans = btrfs_start_transaction(root, 5); 3667 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3668 return trans; 3669 3670 if (PTR_ERR(trans) == -ENOSPC) { 3671 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); 3672 3673 trans = btrfs_start_transaction(root, 0); 3674 if (IS_ERR(trans)) 3675 return trans; 3676 ret = btrfs_cond_migrate_bytes(root->fs_info, 3677 &root->fs_info->trans_block_rsv, 3678 num_bytes, 5); 3679 if (ret) { 3680 btrfs_end_transaction(trans, root); 3681 return ERR_PTR(ret); 3682 } 3683 trans->block_rsv = &root->fs_info->trans_block_rsv; 3684 trans->bytes_reserved = num_bytes; 3685 } 3686 return trans; 3687 } 3688 3689 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3690 { 3691 struct btrfs_root *root = BTRFS_I(dir)->root; 3692 struct btrfs_trans_handle *trans; 3693 struct inode *inode = dentry->d_inode; 3694 int ret; 3695 3696 trans = __unlink_start_trans(dir); 3697 if (IS_ERR(trans)) 3698 return PTR_ERR(trans); 3699 3700 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3701 3702 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3703 dentry->d_name.name, dentry->d_name.len); 3704 if (ret) 3705 goto out; 3706 3707 if (inode->i_nlink == 0) { 3708 ret = btrfs_orphan_add(trans, inode); 3709 if (ret) 3710 goto out; 3711 } 3712 3713 out: 3714 btrfs_end_transaction(trans, root); 3715 btrfs_btree_balance_dirty(root); 3716 return ret; 3717 } 3718 3719 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 3720 struct btrfs_root *root, 3721 struct inode *dir, u64 objectid, 3722 const char *name, int name_len) 3723 { 3724 struct btrfs_path *path; 3725 
struct extent_buffer *leaf; 3726 struct btrfs_dir_item *di; 3727 struct btrfs_key key; 3728 u64 index; 3729 int ret; 3730 u64 dir_ino = btrfs_ino(dir); 3731 3732 path = btrfs_alloc_path(); 3733 if (!path) 3734 return -ENOMEM; 3735 3736 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, 3737 name, name_len, -1); 3738 if (IS_ERR_OR_NULL(di)) { 3739 if (!di) 3740 ret = -ENOENT; 3741 else 3742 ret = PTR_ERR(di); 3743 goto out; 3744 } 3745 3746 leaf = path->nodes[0]; 3747 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3748 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3749 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3750 if (ret) { 3751 btrfs_abort_transaction(trans, root, ret); 3752 goto out; 3753 } 3754 btrfs_release_path(path); 3755 3756 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3757 objectid, root->root_key.objectid, 3758 dir_ino, &index, name, name_len); 3759 if (ret < 0) { 3760 if (ret != -ENOENT) { 3761 btrfs_abort_transaction(trans, root, ret); 3762 goto out; 3763 } 3764 di = btrfs_search_dir_index_item(root, path, dir_ino, 3765 name, name_len); 3766 if (IS_ERR_OR_NULL(di)) { 3767 if (!di) 3768 ret = -ENOENT; 3769 else 3770 ret = PTR_ERR(di); 3771 btrfs_abort_transaction(trans, root, ret); 3772 goto out; 3773 } 3774 3775 leaf = path->nodes[0]; 3776 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3777 btrfs_release_path(path); 3778 index = key.offset; 3779 } 3780 btrfs_release_path(path); 3781 3782 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); 3783 if (ret) { 3784 btrfs_abort_transaction(trans, root, ret); 3785 goto out; 3786 } 3787 3788 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3789 inode_inc_iversion(dir); 3790 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3791 ret = btrfs_update_inode_fallback(trans, root, dir); 3792 if (ret) 3793 btrfs_abort_transaction(trans, root, ret); 3794 out: 3795 btrfs_free_path(path); 3796 return ret; 3797 } 3798 3799 static int btrfs_rmdir(struct inode 
*dir, struct dentry *dentry) 3800 { 3801 struct inode *inode = dentry->d_inode; 3802 int err = 0; 3803 struct btrfs_root *root = BTRFS_I(dir)->root; 3804 struct btrfs_trans_handle *trans; 3805 3806 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3807 return -ENOTEMPTY; 3808 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3809 return -EPERM; 3810 3811 trans = __unlink_start_trans(dir); 3812 if (IS_ERR(trans)) 3813 return PTR_ERR(trans); 3814 3815 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3816 err = btrfs_unlink_subvol(trans, root, dir, 3817 BTRFS_I(inode)->location.objectid, 3818 dentry->d_name.name, 3819 dentry->d_name.len); 3820 goto out; 3821 } 3822 3823 err = btrfs_orphan_add(trans, inode); 3824 if (err) 3825 goto out; 3826 3827 /* now the directory is empty */ 3828 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3829 dentry->d_name.name, dentry->d_name.len); 3830 if (!err) 3831 btrfs_i_size_write(inode, 0); 3832 out: 3833 btrfs_end_transaction(trans, root); 3834 btrfs_btree_balance_dirty(root); 3835 3836 return err; 3837 } 3838 3839 /* 3840 * this can truncate away extent items, csum items and directory items. 3841 * It starts at a high offset and removes keys until it can't find 3842 * any higher than new_size 3843 * 3844 * csum items that cross the new i_size are truncated to the new size 3845 * as well. 3846 * 3847 * min_type is the minimum key type to truncate down to. If set to 0, this 3848 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
3849 */ 3850 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3851 struct btrfs_root *root, 3852 struct inode *inode, 3853 u64 new_size, u32 min_type) 3854 { 3855 struct btrfs_path *path; 3856 struct extent_buffer *leaf; 3857 struct btrfs_file_extent_item *fi; 3858 struct btrfs_key key; 3859 struct btrfs_key found_key; 3860 u64 extent_start = 0; 3861 u64 extent_num_bytes = 0; 3862 u64 extent_offset = 0; 3863 u64 item_end = 0; 3864 u64 last_size = (u64)-1; 3865 u32 found_type = (u8)-1; 3866 int found_extent; 3867 int del_item; 3868 int pending_del_nr = 0; 3869 int pending_del_slot = 0; 3870 int extent_type = -1; 3871 int ret; 3872 int err = 0; 3873 u64 ino = btrfs_ino(inode); 3874 3875 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3876 3877 path = btrfs_alloc_path(); 3878 if (!path) 3879 return -ENOMEM; 3880 path->reada = -1; 3881 3882 /* 3883 * We want to drop from the next block forward in case this new size is 3884 * not block aligned since we will be keeping the last block of the 3885 * extent just the way it is. 3886 */ 3887 if (root->ref_cows || root == root->fs_info->tree_root) 3888 btrfs_drop_extent_cache(inode, ALIGN(new_size, 3889 root->sectorsize), (u64)-1, 0); 3890 3891 /* 3892 * This function is also used to drop the items in the log tree before 3893 * we relog the inode, so if root != BTRFS_I(inode)->root, it means 3894 * it is used to drop the loged items. So we shouldn't kill the delayed 3895 * items. 
3896 */ 3897 if (min_type == 0 && root == BTRFS_I(inode)->root) 3898 btrfs_kill_delayed_inode_items(inode); 3899 3900 key.objectid = ino; 3901 key.offset = (u64)-1; 3902 key.type = (u8)-1; 3903 3904 search_again: 3905 path->leave_spinning = 1; 3906 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3907 if (ret < 0) { 3908 err = ret; 3909 goto out; 3910 } 3911 3912 if (ret > 0) { 3913 /* there are no items in the tree for us to truncate, we're 3914 * done 3915 */ 3916 if (path->slots[0] == 0) 3917 goto out; 3918 path->slots[0]--; 3919 } 3920 3921 while (1) { 3922 fi = NULL; 3923 leaf = path->nodes[0]; 3924 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3925 found_type = btrfs_key_type(&found_key); 3926 3927 if (found_key.objectid != ino) 3928 break; 3929 3930 if (found_type < min_type) 3931 break; 3932 3933 item_end = found_key.offset; 3934 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3935 fi = btrfs_item_ptr(leaf, path->slots[0], 3936 struct btrfs_file_extent_item); 3937 extent_type = btrfs_file_extent_type(leaf, fi); 3938 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3939 item_end += 3940 btrfs_file_extent_num_bytes(leaf, fi); 3941 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3942 item_end += btrfs_file_extent_inline_len(leaf, 3943 fi); 3944 } 3945 item_end--; 3946 } 3947 if (found_type > min_type) { 3948 del_item = 1; 3949 } else { 3950 if (item_end < new_size) 3951 break; 3952 if (found_key.offset >= new_size) 3953 del_item = 1; 3954 else 3955 del_item = 0; 3956 } 3957 found_extent = 0; 3958 /* FIXME, shrink the extent if the ref count is only 1 */ 3959 if (found_type != BTRFS_EXTENT_DATA_KEY) 3960 goto delete; 3961 3962 if (del_item) 3963 last_size = found_key.offset; 3964 else 3965 last_size = new_size; 3966 3967 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3968 u64 num_dec; 3969 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3970 if (!del_item) { 3971 u64 orig_num_bytes = 3972 btrfs_file_extent_num_bytes(leaf, fi); 3973 
extent_num_bytes = ALIGN(new_size - 3974 found_key.offset, 3975 root->sectorsize); 3976 btrfs_set_file_extent_num_bytes(leaf, fi, 3977 extent_num_bytes); 3978 num_dec = (orig_num_bytes - 3979 extent_num_bytes); 3980 if (root->ref_cows && extent_start != 0) 3981 inode_sub_bytes(inode, num_dec); 3982 btrfs_mark_buffer_dirty(leaf); 3983 } else { 3984 extent_num_bytes = 3985 btrfs_file_extent_disk_num_bytes(leaf, 3986 fi); 3987 extent_offset = found_key.offset - 3988 btrfs_file_extent_offset(leaf, fi); 3989 3990 /* FIXME blocksize != 4096 */ 3991 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3992 if (extent_start != 0) { 3993 found_extent = 1; 3994 if (root->ref_cows) 3995 inode_sub_bytes(inode, num_dec); 3996 } 3997 } 3998 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3999 /* 4000 * we can't truncate inline items that have had 4001 * special encodings 4002 */ 4003 if (!del_item && 4004 btrfs_file_extent_compression(leaf, fi) == 0 && 4005 btrfs_file_extent_encryption(leaf, fi) == 0 && 4006 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4007 u32 size = new_size - found_key.offset; 4008 4009 if (root->ref_cows) { 4010 inode_sub_bytes(inode, item_end + 1 - 4011 new_size); 4012 } 4013 size = 4014 btrfs_file_extent_calc_inline_size(size); 4015 btrfs_truncate_item(root, path, size, 1); 4016 } else if (root->ref_cows) { 4017 inode_sub_bytes(inode, item_end + 1 - 4018 found_key.offset); 4019 } 4020 } 4021 delete: 4022 if (del_item) { 4023 if (!pending_del_nr) { 4024 /* no pending yet, add ourselves */ 4025 pending_del_slot = path->slots[0]; 4026 pending_del_nr = 1; 4027 } else if (pending_del_nr && 4028 path->slots[0] + 1 == pending_del_slot) { 4029 /* hop on the pending chunk */ 4030 pending_del_nr++; 4031 pending_del_slot = path->slots[0]; 4032 } else { 4033 BUG(); 4034 } 4035 } else { 4036 break; 4037 } 4038 if (found_extent && (root->ref_cows || 4039 root == root->fs_info->tree_root)) { 4040 btrfs_set_path_blocking(path); 4041 ret = btrfs_free_extent(trans, 
root, extent_start, 4042 extent_num_bytes, 0, 4043 btrfs_header_owner(leaf), 4044 ino, extent_offset, 0); 4045 BUG_ON(ret); 4046 } 4047 4048 if (found_type == BTRFS_INODE_ITEM_KEY) 4049 break; 4050 4051 if (path->slots[0] == 0 || 4052 path->slots[0] != pending_del_slot) { 4053 if (pending_del_nr) { 4054 ret = btrfs_del_items(trans, root, path, 4055 pending_del_slot, 4056 pending_del_nr); 4057 if (ret) { 4058 btrfs_abort_transaction(trans, 4059 root, ret); 4060 goto error; 4061 } 4062 pending_del_nr = 0; 4063 } 4064 btrfs_release_path(path); 4065 goto search_again; 4066 } else { 4067 path->slots[0]--; 4068 } 4069 } 4070 out: 4071 if (pending_del_nr) { 4072 ret = btrfs_del_items(trans, root, path, pending_del_slot, 4073 pending_del_nr); 4074 if (ret) 4075 btrfs_abort_transaction(trans, root, ret); 4076 } 4077 error: 4078 if (last_size != (u64)-1) 4079 btrfs_ordered_update_i_size(inode, last_size, NULL); 4080 btrfs_free_path(path); 4081 return err; 4082 } 4083 4084 /* 4085 * btrfs_truncate_page - read, zero a chunk and write a page 4086 * @inode - inode that we're zeroing 4087 * @from - the offset to start zeroing 4088 * @len - the length to zero, 0 to zero the entire range respective to the 4089 * offset 4090 * @front - zero up to the offset instead of from the offset on 4091 * 4092 * This will find the page for the "from" offset and cow the page and zero the 4093 * part we want to zero. This is used with truncate and hole punching. 
4094 */ 4095 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, 4096 int front) 4097 { 4098 struct address_space *mapping = inode->i_mapping; 4099 struct btrfs_root *root = BTRFS_I(inode)->root; 4100 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4101 struct btrfs_ordered_extent *ordered; 4102 struct extent_state *cached_state = NULL; 4103 char *kaddr; 4104 u32 blocksize = root->sectorsize; 4105 pgoff_t index = from >> PAGE_CACHE_SHIFT; 4106 unsigned offset = from & (PAGE_CACHE_SIZE-1); 4107 struct page *page; 4108 gfp_t mask = btrfs_alloc_write_mask(mapping); 4109 int ret = 0; 4110 u64 page_start; 4111 u64 page_end; 4112 4113 if ((offset & (blocksize - 1)) == 0 && 4114 (!len || ((len & (blocksize - 1)) == 0))) 4115 goto out; 4116 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 4117 if (ret) 4118 goto out; 4119 4120 again: 4121 page = find_or_create_page(mapping, index, mask); 4122 if (!page) { 4123 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 4124 ret = -ENOMEM; 4125 goto out; 4126 } 4127 4128 page_start = page_offset(page); 4129 page_end = page_start + PAGE_CACHE_SIZE - 1; 4130 4131 if (!PageUptodate(page)) { 4132 ret = btrfs_readpage(NULL, page); 4133 lock_page(page); 4134 if (page->mapping != mapping) { 4135 unlock_page(page); 4136 page_cache_release(page); 4137 goto again; 4138 } 4139 if (!PageUptodate(page)) { 4140 ret = -EIO; 4141 goto out_unlock; 4142 } 4143 } 4144 wait_on_page_writeback(page); 4145 4146 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 4147 set_page_extent_mapped(page); 4148 4149 ordered = btrfs_lookup_ordered_extent(inode, page_start); 4150 if (ordered) { 4151 unlock_extent_cached(io_tree, page_start, page_end, 4152 &cached_state, GFP_NOFS); 4153 unlock_page(page); 4154 page_cache_release(page); 4155 btrfs_start_ordered_extent(inode, ordered, 1); 4156 btrfs_put_ordered_extent(ordered); 4157 goto again; 4158 } 4159 4160 clear_extent_bit(&BTRFS_I(inode)->io_tree, 
page_start, page_end, 4161 EXTENT_DIRTY | EXTENT_DELALLOC | 4162 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 4163 0, 0, &cached_state, GFP_NOFS); 4164 4165 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 4166 &cached_state); 4167 if (ret) { 4168 unlock_extent_cached(io_tree, page_start, page_end, 4169 &cached_state, GFP_NOFS); 4170 goto out_unlock; 4171 } 4172 4173 if (offset != PAGE_CACHE_SIZE) { 4174 if (!len) 4175 len = PAGE_CACHE_SIZE - offset; 4176 kaddr = kmap(page); 4177 if (front) 4178 memset(kaddr, 0, offset); 4179 else 4180 memset(kaddr + offset, 0, len); 4181 flush_dcache_page(page); 4182 kunmap(page); 4183 } 4184 ClearPageChecked(page); 4185 set_page_dirty(page); 4186 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 4187 GFP_NOFS); 4188 4189 out_unlock: 4190 if (ret) 4191 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 4192 unlock_page(page); 4193 page_cache_release(page); 4194 out: 4195 return ret; 4196 } 4197 4198 /* 4199 * This function puts in dummy file extents for the area we're creating a hole 4200 * for. 
So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 */
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	struct btrfs_trans_handle *trans;
	u64 hole_start = ALIGN(oldsize, root->sectorsize);
	u64 block_end = ALIGN(size, root->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int err = 0;

	/*
	 * If our size started in the middle of a page we need to zero out the
	 * rest of the page before we expand the i_size, otherwise we could
	 * expose stale data.
	 */
	err = btrfs_truncate_page(inode, oldsize, 0, 0);
	if (err)
		return err;

	if (size <= hole_start)
		return 0;

	/*
	 * wait for ordered io and lock the range, retrying for as long as an
	 * ordered extent is still found at hole_start after locking
	 */
	for (;;) {
		struct btrfs_ordered_extent *ordered;

		btrfs_wait_ordered_range(inode, hole_start,
					 block_end - hole_start);
		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
				 &cached_state);
		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
		if (!ordered)
			break;
		unlock_extent_cached(io_tree, hole_start, block_end - 1,
				     &cached_state, GFP_NOFS);
		btrfs_put_ordered_extent(ordered);
	}

	/* walk the extent maps of the range, one transaction per hole */
	for (cur_offset = hole_start; ; ) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				block_end - cur_offset, 0);
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte , root->sectorsize);
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			struct extent_map *hole_em;

			/* from here on cur_offset + hole_size == last_byte */
			hole_size = last_byte - cur_offset;

			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				break;
			}

			err = btrfs_drop_extents(trans, root, inode,
						 cur_offset, last_byte, 1);
			if (err) {
				btrfs_abort_transaction(trans, root, err);
				btrfs_end_transaction(trans, root);
				break;
			}

			err = btrfs_insert_file_extent(trans, root,
					btrfs_ino(inode), cur_offset, 0,
					0, hole_size, 0, hole_size,
					0, 0, 0);
			if (err) {
				btrfs_abort_transaction(trans, root, err);
				btrfs_end_transaction(trans, root);
				break;
			}

			btrfs_drop_extent_cache(inode, cur_offset,
						last_byte - 1, 0);
			hole_em = alloc_extent_map();
			if (!hole_em) {
				/*
				 * no cached mapping for the hole, so flag
				 * the inode as needing a full sync
				 */
				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					&BTRFS_I(inode)->runtime_flags);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;

			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = trans->transid;

			/* on -EEXIST drop the cached range and try again */
			for (;;) {
				write_lock(&em_tree->lock);
				err = add_extent_mapping(em_tree, hole_em, 1);
				write_unlock(&em_tree->lock);
				if (err != -EEXIST)
					break;
				btrfs_drop_extent_cache(inode, cur_offset,
							last_byte - 1, 0);
			}
			free_extent_map(hole_em);
next:
			btrfs_update_inode(trans, root, inode);
			btrfs_end_transaction(trans, root);
		}
		free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}

	free_extent_map(em);
	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
			     GFP_NOFS);
	return err;
}

4333 static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4334 { 4335 struct btrfs_root *root = BTRFS_I(inode)->root; 4336 struct btrfs_trans_handle *trans; 4337 loff_t oldsize = i_size_read(inode); 4338 loff_t newsize = attr->ia_size; 4339 int mask = attr->ia_valid; 4340 int ret; 4341 4342 /* 4343 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a 4344 * special case where we need to update the times despite not having 4345 * these flags set. For all other operations the VFS set these flags 4346 * explicitly if it wants a timestamp update. 4347 */ 4348 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) 4349 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); 4350 4351 if (newsize > oldsize) { 4352 truncate_pagecache(inode, newsize); 4353 ret = btrfs_cont_expand(inode, oldsize, newsize); 4354 if (ret) 4355 return ret; 4356 4357 trans = btrfs_start_transaction(root, 1); 4358 if (IS_ERR(trans)) 4359 return PTR_ERR(trans); 4360 4361 i_size_write(inode, newsize); 4362 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4363 ret = btrfs_update_inode(trans, root, inode); 4364 btrfs_end_transaction(trans, root); 4365 } else { 4366 4367 /* 4368 * We're truncating a file that used to have good data down to 4369 * zero. Make sure it gets into the ordered flush list so that 4370 * any new writes get down to disk quickly. 4371 */ 4372 if (newsize == 0) 4373 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 4374 &BTRFS_I(inode)->runtime_flags); 4375 4376 /* 4377 * 1 for the orphan item we're going to add 4378 * 1 for the orphan item deletion. 4379 */ 4380 trans = btrfs_start_transaction(root, 2); 4381 if (IS_ERR(trans)) 4382 return PTR_ERR(trans); 4383 4384 /* 4385 * We need to do this in case we fail at _any_ point during the 4386 * actual truncate. 
Once we do the truncate_setsize we could 4387 * invalidate pages which forces any outstanding ordered io to 4388 * be instantly completed which will give us extents that need 4389 * to be truncated. If we fail to get an orphan inode down we 4390 * could have left over extents that were never meant to live, 4391 * so we need to garuntee from this point on that everything 4392 * will be consistent. 4393 */ 4394 ret = btrfs_orphan_add(trans, inode); 4395 btrfs_end_transaction(trans, root); 4396 if (ret) 4397 return ret; 4398 4399 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 4400 truncate_setsize(inode, newsize); 4401 4402 /* Disable nonlocked read DIO to avoid the end less truncate */ 4403 btrfs_inode_block_unlocked_dio(inode); 4404 inode_dio_wait(inode); 4405 btrfs_inode_resume_unlocked_dio(inode); 4406 4407 ret = btrfs_truncate(inode); 4408 if (ret && inode->i_nlink) { 4409 int err; 4410 4411 /* 4412 * failed to truncate, disk_i_size is only adjusted down 4413 * as we remove extents, so it should represent the true 4414 * size of the inode, so reset the in memory size and 4415 * delete our orphan entry. 
4416 */ 4417 trans = btrfs_join_transaction(root); 4418 if (IS_ERR(trans)) { 4419 btrfs_orphan_del(NULL, inode); 4420 return ret; 4421 } 4422 i_size_write(inode, BTRFS_I(inode)->disk_i_size); 4423 err = btrfs_orphan_del(trans, inode); 4424 if (err) 4425 btrfs_abort_transaction(trans, root, err); 4426 btrfs_end_transaction(trans, root); 4427 } 4428 } 4429 4430 return ret; 4431 } 4432 4433 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 4434 { 4435 struct inode *inode = dentry->d_inode; 4436 struct btrfs_root *root = BTRFS_I(inode)->root; 4437 int err; 4438 4439 if (btrfs_root_readonly(root)) 4440 return -EROFS; 4441 4442 err = inode_change_ok(inode, attr); 4443 if (err) 4444 return err; 4445 4446 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 4447 err = btrfs_setsize(inode, attr); 4448 if (err) 4449 return err; 4450 } 4451 4452 if (attr->ia_valid) { 4453 setattr_copy(inode, attr); 4454 inode_inc_iversion(inode); 4455 err = btrfs_dirty_inode(inode); 4456 4457 if (!err && attr->ia_valid & ATTR_MODE) 4458 err = btrfs_acl_chmod(inode); 4459 } 4460 4461 return err; 4462 } 4463 4464 void btrfs_evict_inode(struct inode *inode) 4465 { 4466 struct btrfs_trans_handle *trans; 4467 struct btrfs_root *root = BTRFS_I(inode)->root; 4468 struct btrfs_block_rsv *rsv, *global_rsv; 4469 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 4470 int ret; 4471 4472 trace_btrfs_inode_evict(inode); 4473 4474 truncate_inode_pages(&inode->i_data, 0); 4475 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 4476 btrfs_is_free_space_inode(inode))) 4477 goto no_delete; 4478 4479 if (is_bad_inode(inode)) { 4480 btrfs_orphan_del(NULL, inode); 4481 goto no_delete; 4482 } 4483 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ 4484 btrfs_wait_ordered_range(inode, 0, (u64)-1); 4485 4486 if (root->fs_info->log_root_recovering) { 4487 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 4488 &BTRFS_I(inode)->runtime_flags)); 4489 goto no_delete; 4490 } 4491 4492 if (inode->i_nlink > 0) { 4493 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 4494 goto no_delete; 4495 } 4496 4497 ret = btrfs_commit_inode_delayed_inode(inode); 4498 if (ret) { 4499 btrfs_orphan_del(NULL, inode); 4500 goto no_delete; 4501 } 4502 4503 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 4504 if (!rsv) { 4505 btrfs_orphan_del(NULL, inode); 4506 goto no_delete; 4507 } 4508 rsv->size = min_size; 4509 rsv->failfast = 1; 4510 global_rsv = &root->fs_info->global_block_rsv; 4511 4512 btrfs_i_size_write(inode, 0); 4513 4514 /* 4515 * This is a bit simpler than btrfs_truncate since we've already 4516 * reserved our space for our orphan item in the unlink, so we just 4517 * need to reserve some slack space in case we add bytes and update 4518 * inode item when doing the truncate. 4519 */ 4520 while (1) { 4521 ret = btrfs_block_rsv_refill(root, rsv, min_size, 4522 BTRFS_RESERVE_FLUSH_LIMIT); 4523 4524 /* 4525 * Try and steal from the global reserve since we will 4526 * likely not use this space anyway, we want to try as 4527 * hard as possible to get this to work. 
4528 */ 4529 if (ret) 4530 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 4531 4532 if (ret) { 4533 btrfs_warn(root->fs_info, 4534 "Could not get space for a delete, will truncate on mount %d", 4535 ret); 4536 btrfs_orphan_del(NULL, inode); 4537 btrfs_free_block_rsv(root, rsv); 4538 goto no_delete; 4539 } 4540 4541 trans = btrfs_join_transaction(root); 4542 if (IS_ERR(trans)) { 4543 btrfs_orphan_del(NULL, inode); 4544 btrfs_free_block_rsv(root, rsv); 4545 goto no_delete; 4546 } 4547 4548 trans->block_rsv = rsv; 4549 4550 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 4551 if (ret != -ENOSPC) 4552 break; 4553 4554 trans->block_rsv = &root->fs_info->trans_block_rsv; 4555 btrfs_end_transaction(trans, root); 4556 trans = NULL; 4557 btrfs_btree_balance_dirty(root); 4558 } 4559 4560 btrfs_free_block_rsv(root, rsv); 4561 4562 /* 4563 * Errors here aren't a big deal, it just means we leave orphan items 4564 * in the tree. They will be cleaned up on the next mount. 4565 */ 4566 if (ret == 0) { 4567 trans->block_rsv = root->orphan_block_rsv; 4568 btrfs_orphan_del(trans, inode); 4569 } else { 4570 btrfs_orphan_del(NULL, inode); 4571 } 4572 4573 trans->block_rsv = &root->fs_info->trans_block_rsv; 4574 if (!(root == root->fs_info->tree_root || 4575 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 4576 btrfs_return_ino(root, btrfs_ino(inode)); 4577 4578 btrfs_end_transaction(trans, root); 4579 btrfs_btree_balance_dirty(root); 4580 no_delete: 4581 btrfs_remove_delayed_node(inode); 4582 clear_inode(inode); 4583 return; 4584 } 4585 4586 /* 4587 * this returns the key found in the dir entry in the location pointer. 4588 * If no dir entries were found, location->objectid is 0. 
4589 */ 4590 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 4591 struct btrfs_key *location) 4592 { 4593 const char *name = dentry->d_name.name; 4594 int namelen = dentry->d_name.len; 4595 struct btrfs_dir_item *di; 4596 struct btrfs_path *path; 4597 struct btrfs_root *root = BTRFS_I(dir)->root; 4598 int ret = 0; 4599 4600 path = btrfs_alloc_path(); 4601 if (!path) 4602 return -ENOMEM; 4603 4604 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 4605 namelen, 0); 4606 if (IS_ERR(di)) 4607 ret = PTR_ERR(di); 4608 4609 if (IS_ERR_OR_NULL(di)) 4610 goto out_err; 4611 4612 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 4613 out: 4614 btrfs_free_path(path); 4615 return ret; 4616 out_err: 4617 location->objectid = 0; 4618 goto out; 4619 } 4620 4621 /* 4622 * when we hit a tree root in a directory, the btrfs part of the inode 4623 * needs to be changed to reflect the root directory of the tree root. This 4624 * is kind of like crossing a mount point. 
4625 */ 4626 static int fixup_tree_root_location(struct btrfs_root *root, 4627 struct inode *dir, 4628 struct dentry *dentry, 4629 struct btrfs_key *location, 4630 struct btrfs_root **sub_root) 4631 { 4632 struct btrfs_path *path; 4633 struct btrfs_root *new_root; 4634 struct btrfs_root_ref *ref; 4635 struct extent_buffer *leaf; 4636 int ret; 4637 int err = 0; 4638 4639 path = btrfs_alloc_path(); 4640 if (!path) { 4641 err = -ENOMEM; 4642 goto out; 4643 } 4644 4645 err = -ENOENT; 4646 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 4647 BTRFS_I(dir)->root->root_key.objectid, 4648 location->objectid); 4649 if (ret) { 4650 if (ret < 0) 4651 err = ret; 4652 goto out; 4653 } 4654 4655 leaf = path->nodes[0]; 4656 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 4657 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || 4658 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 4659 goto out; 4660 4661 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 4662 (unsigned long)(ref + 1), 4663 dentry->d_name.len); 4664 if (ret) 4665 goto out; 4666 4667 btrfs_release_path(path); 4668 4669 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 4670 if (IS_ERR(new_root)) { 4671 err = PTR_ERR(new_root); 4672 goto out; 4673 } 4674 4675 *sub_root = new_root; 4676 location->objectid = btrfs_root_dirid(&new_root->root_item); 4677 location->type = BTRFS_INODE_ITEM_KEY; 4678 location->offset = 0; 4679 err = 0; 4680 out: 4681 btrfs_free_path(path); 4682 return err; 4683 } 4684 4685 static void inode_tree_add(struct inode *inode) 4686 { 4687 struct btrfs_root *root = BTRFS_I(inode)->root; 4688 struct btrfs_inode *entry; 4689 struct rb_node **p; 4690 struct rb_node *parent; 4691 struct rb_node *new = &BTRFS_I(inode)->rb_node; 4692 u64 ino = btrfs_ino(inode); 4693 4694 if (inode_unhashed(inode)) 4695 return; 4696 parent = NULL; 4697 spin_lock(&root->inode_lock); 4698 p = &root->inode_tree.rb_node; 4699 while (*p) { 4700 parent = *p; 4701 entry 
= rb_entry(parent, struct btrfs_inode, rb_node); 4702 4703 if (ino < btrfs_ino(&entry->vfs_inode)) 4704 p = &parent->rb_left; 4705 else if (ino > btrfs_ino(&entry->vfs_inode)) 4706 p = &parent->rb_right; 4707 else { 4708 WARN_ON(!(entry->vfs_inode.i_state & 4709 (I_WILL_FREE | I_FREEING))); 4710 rb_replace_node(parent, new, &root->inode_tree); 4711 RB_CLEAR_NODE(parent); 4712 spin_unlock(&root->inode_lock); 4713 return; 4714 } 4715 } 4716 rb_link_node(new, parent, p); 4717 rb_insert_color(new, &root->inode_tree); 4718 spin_unlock(&root->inode_lock); 4719 } 4720 4721 static void inode_tree_del(struct inode *inode) 4722 { 4723 struct btrfs_root *root = BTRFS_I(inode)->root; 4724 int empty = 0; 4725 4726 spin_lock(&root->inode_lock); 4727 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 4728 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 4729 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 4730 empty = RB_EMPTY_ROOT(&root->inode_tree); 4731 } 4732 spin_unlock(&root->inode_lock); 4733 4734 /* 4735 * Free space cache has inodes in the tree root, but the tree root has a 4736 * root_refs of 0, so this could end up dropping the tree root as a 4737 * snapshot, so we need the extra !root->fs_info->tree_root check to 4738 * make sure we don't drop it. 
4739 */ 4740 if (empty && btrfs_root_refs(&root->root_item) == 0 && 4741 root != root->fs_info->tree_root) { 4742 synchronize_srcu(&root->fs_info->subvol_srcu); 4743 spin_lock(&root->inode_lock); 4744 empty = RB_EMPTY_ROOT(&root->inode_tree); 4745 spin_unlock(&root->inode_lock); 4746 if (empty) 4747 btrfs_add_dead_root(root); 4748 } 4749 } 4750 4751 void btrfs_invalidate_inodes(struct btrfs_root *root) 4752 { 4753 struct rb_node *node; 4754 struct rb_node *prev; 4755 struct btrfs_inode *entry; 4756 struct inode *inode; 4757 u64 objectid = 0; 4758 4759 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4760 4761 spin_lock(&root->inode_lock); 4762 again: 4763 node = root->inode_tree.rb_node; 4764 prev = NULL; 4765 while (node) { 4766 prev = node; 4767 entry = rb_entry(node, struct btrfs_inode, rb_node); 4768 4769 if (objectid < btrfs_ino(&entry->vfs_inode)) 4770 node = node->rb_left; 4771 else if (objectid > btrfs_ino(&entry->vfs_inode)) 4772 node = node->rb_right; 4773 else 4774 break; 4775 } 4776 if (!node) { 4777 while (prev) { 4778 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4779 if (objectid <= btrfs_ino(&entry->vfs_inode)) { 4780 node = prev; 4781 break; 4782 } 4783 prev = rb_next(prev); 4784 } 4785 } 4786 while (node) { 4787 entry = rb_entry(node, struct btrfs_inode, rb_node); 4788 objectid = btrfs_ino(&entry->vfs_inode) + 1; 4789 inode = igrab(&entry->vfs_inode); 4790 if (inode) { 4791 spin_unlock(&root->inode_lock); 4792 if (atomic_read(&inode->i_count) > 1) 4793 d_prune_aliases(inode); 4794 /* 4795 * btrfs_drop_inode will have it removed from 4796 * the inode cache when its usage count 4797 * hits zero. 
4798 */ 4799 iput(inode); 4800 cond_resched(); 4801 spin_lock(&root->inode_lock); 4802 goto again; 4803 } 4804 4805 if (cond_resched_lock(&root->inode_lock)) 4806 goto again; 4807 4808 node = rb_next(node); 4809 } 4810 spin_unlock(&root->inode_lock); 4811 } 4812 4813 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4814 { 4815 struct btrfs_iget_args *args = p; 4816 inode->i_ino = args->ino; 4817 BTRFS_I(inode)->root = args->root; 4818 return 0; 4819 } 4820 4821 static int btrfs_find_actor(struct inode *inode, void *opaque) 4822 { 4823 struct btrfs_iget_args *args = opaque; 4824 return args->ino == btrfs_ino(inode) && 4825 args->root == BTRFS_I(inode)->root; 4826 } 4827 4828 static struct inode *btrfs_iget_locked(struct super_block *s, 4829 u64 objectid, 4830 struct btrfs_root *root) 4831 { 4832 struct inode *inode; 4833 struct btrfs_iget_args args; 4834 args.ino = objectid; 4835 args.root = root; 4836 4837 inode = iget5_locked(s, objectid, btrfs_find_actor, 4838 btrfs_init_locked_inode, 4839 (void *)&args); 4840 return inode; 4841 } 4842 4843 /* Get an inode object given its location and corresponding root. 
4844 * Returns in *is_new if the inode was read from disk 4845 */ 4846 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4847 struct btrfs_root *root, int *new) 4848 { 4849 struct inode *inode; 4850 4851 inode = btrfs_iget_locked(s, location->objectid, root); 4852 if (!inode) 4853 return ERR_PTR(-ENOMEM); 4854 4855 if (inode->i_state & I_NEW) { 4856 BTRFS_I(inode)->root = root; 4857 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4858 btrfs_read_locked_inode(inode); 4859 if (!is_bad_inode(inode)) { 4860 inode_tree_add(inode); 4861 unlock_new_inode(inode); 4862 if (new) 4863 *new = 1; 4864 } else { 4865 unlock_new_inode(inode); 4866 iput(inode); 4867 inode = ERR_PTR(-ESTALE); 4868 } 4869 } 4870 4871 return inode; 4872 } 4873 4874 static struct inode *new_simple_dir(struct super_block *s, 4875 struct btrfs_key *key, 4876 struct btrfs_root *root) 4877 { 4878 struct inode *inode = new_inode(s); 4879 4880 if (!inode) 4881 return ERR_PTR(-ENOMEM); 4882 4883 BTRFS_I(inode)->root = root; 4884 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4885 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4886 4887 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4888 inode->i_op = &btrfs_dir_ro_inode_operations; 4889 inode->i_fop = &simple_dir_operations; 4890 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4891 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4892 4893 return inode; 4894 } 4895 4896 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4897 { 4898 struct inode *inode; 4899 struct btrfs_root *root = BTRFS_I(dir)->root; 4900 struct btrfs_root *sub_root = root; 4901 struct btrfs_key location; 4902 int index; 4903 int ret = 0; 4904 4905 if (dentry->d_name.len > BTRFS_NAME_LEN) 4906 return ERR_PTR(-ENAMETOOLONG); 4907 4908 ret = btrfs_inode_by_name(dir, dentry, &location); 4909 if (ret < 0) 4910 return ERR_PTR(ret); 4911 4912 if (location.objectid == 0) 4913 return NULL; 
4914 4915 if (location.type == BTRFS_INODE_ITEM_KEY) { 4916 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4917 return inode; 4918 } 4919 4920 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4921 4922 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4923 ret = fixup_tree_root_location(root, dir, dentry, 4924 &location, &sub_root); 4925 if (ret < 0) { 4926 if (ret != -ENOENT) 4927 inode = ERR_PTR(ret); 4928 else 4929 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4930 } else { 4931 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4932 } 4933 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4934 4935 if (!IS_ERR(inode) && root != sub_root) { 4936 down_read(&root->fs_info->cleanup_work_sem); 4937 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4938 ret = btrfs_orphan_cleanup(sub_root); 4939 up_read(&root->fs_info->cleanup_work_sem); 4940 if (ret) { 4941 iput(inode); 4942 inode = ERR_PTR(ret); 4943 } 4944 } 4945 4946 return inode; 4947 } 4948 4949 static int btrfs_dentry_delete(const struct dentry *dentry) 4950 { 4951 struct btrfs_root *root; 4952 struct inode *inode = dentry->d_inode; 4953 4954 if (!inode && !IS_ROOT(dentry)) 4955 inode = dentry->d_parent->d_inode; 4956 4957 if (inode) { 4958 root = BTRFS_I(inode)->root; 4959 if (btrfs_root_refs(&root->root_item) == 0) 4960 return 1; 4961 4962 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 4963 return 1; 4964 } 4965 return 0; 4966 } 4967 4968 static void btrfs_dentry_release(struct dentry *dentry) 4969 { 4970 if (dentry->d_fsdata) 4971 kfree(dentry->d_fsdata); 4972 } 4973 4974 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4975 unsigned int flags) 4976 { 4977 struct dentry *ret; 4978 4979 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4980 return ret; 4981 } 4982 4983 unsigned char btrfs_filetype_table[] = { 4984 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4985 }; 4986 4987 static int btrfs_real_readdir(struct 
file *file, struct dir_context *ctx) 4988 { 4989 struct inode *inode = file_inode(file); 4990 struct btrfs_root *root = BTRFS_I(inode)->root; 4991 struct btrfs_item *item; 4992 struct btrfs_dir_item *di; 4993 struct btrfs_key key; 4994 struct btrfs_key found_key; 4995 struct btrfs_path *path; 4996 struct list_head ins_list; 4997 struct list_head del_list; 4998 int ret; 4999 struct extent_buffer *leaf; 5000 int slot; 5001 unsigned char d_type; 5002 int over = 0; 5003 u32 di_cur; 5004 u32 di_total; 5005 u32 di_len; 5006 int key_type = BTRFS_DIR_INDEX_KEY; 5007 char tmp_name[32]; 5008 char *name_ptr; 5009 int name_len; 5010 int is_curr = 0; /* ctx->pos points to the current index? */ 5011 5012 /* FIXME, use a real flag for deciding about the key type */ 5013 if (root->fs_info->tree_root == root) 5014 key_type = BTRFS_DIR_ITEM_KEY; 5015 5016 if (!dir_emit_dots(file, ctx)) 5017 return 0; 5018 5019 path = btrfs_alloc_path(); 5020 if (!path) 5021 return -ENOMEM; 5022 5023 path->reada = 1; 5024 5025 if (key_type == BTRFS_DIR_INDEX_KEY) { 5026 INIT_LIST_HEAD(&ins_list); 5027 INIT_LIST_HEAD(&del_list); 5028 btrfs_get_delayed_items(inode, &ins_list, &del_list); 5029 } 5030 5031 btrfs_set_key_type(&key, key_type); 5032 key.offset = ctx->pos; 5033 key.objectid = btrfs_ino(inode); 5034 5035 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5036 if (ret < 0) 5037 goto err; 5038 5039 while (1) { 5040 leaf = path->nodes[0]; 5041 slot = path->slots[0]; 5042 if (slot >= btrfs_header_nritems(leaf)) { 5043 ret = btrfs_next_leaf(root, path); 5044 if (ret < 0) 5045 goto err; 5046 else if (ret > 0) 5047 break; 5048 continue; 5049 } 5050 5051 item = btrfs_item_nr(leaf, slot); 5052 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5053 5054 if (found_key.objectid != key.objectid) 5055 break; 5056 if (btrfs_key_type(&found_key) != key_type) 5057 break; 5058 if (found_key.offset < ctx->pos) 5059 goto next; 5060 if (key_type == BTRFS_DIR_INDEX_KEY && 5061 
btrfs_should_delete_dir_index(&del_list, 5062 found_key.offset)) 5063 goto next; 5064 5065 ctx->pos = found_key.offset; 5066 is_curr = 1; 5067 5068 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5069 di_cur = 0; 5070 di_total = btrfs_item_size(leaf, item); 5071 5072 while (di_cur < di_total) { 5073 struct btrfs_key location; 5074 5075 if (verify_dir_item(root, leaf, di)) 5076 break; 5077 5078 name_len = btrfs_dir_name_len(leaf, di); 5079 if (name_len <= sizeof(tmp_name)) { 5080 name_ptr = tmp_name; 5081 } else { 5082 name_ptr = kmalloc(name_len, GFP_NOFS); 5083 if (!name_ptr) { 5084 ret = -ENOMEM; 5085 goto err; 5086 } 5087 } 5088 read_extent_buffer(leaf, name_ptr, 5089 (unsigned long)(di + 1), name_len); 5090 5091 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 5092 btrfs_dir_item_key_to_cpu(leaf, di, &location); 5093 5094 5095 /* is this a reference to our own snapshot? If so 5096 * skip it. 5097 * 5098 * In contrast to old kernels, we insert the snapshot's 5099 * dir item and dir index after it has been created, so 5100 * we won't find a reference to our own snapshot. We 5101 * still keep the following code for backward 5102 * compatibility. 5103 */ 5104 if (location.type == BTRFS_ROOT_ITEM_KEY && 5105 location.objectid == root->root_key.objectid) { 5106 over = 0; 5107 goto skip; 5108 } 5109 over = !dir_emit(ctx, name_ptr, name_len, 5110 location.objectid, d_type); 5111 5112 skip: 5113 if (name_ptr != tmp_name) 5114 kfree(name_ptr); 5115 5116 if (over) 5117 goto nopos; 5118 di_len = btrfs_dir_name_len(leaf, di) + 5119 btrfs_dir_data_len(leaf, di) + sizeof(*di); 5120 di_cur += di_len; 5121 di = (struct btrfs_dir_item *)((char *)di + di_len); 5122 } 5123 next: 5124 path->slots[0]++; 5125 } 5126 5127 if (key_type == BTRFS_DIR_INDEX_KEY) { 5128 if (is_curr) 5129 ctx->pos++; 5130 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); 5131 if (ret) 5132 goto nopos; 5133 } 5134 5135 /* Reached end of directory/root. Bump pos past the last item. 
*/ 5136 ctx->pos++; 5137 5138 /* 5139 * Stop new entries from being returned after we return the last 5140 * entry. 5141 * 5142 * New directory entries are assigned a strictly increasing 5143 * offset. This means that new entries created during readdir 5144 * are *guaranteed* to be seen in the future by that readdir. 5145 * This has broken buggy programs which operate on names as 5146 * they're returned by readdir. Until we re-use freed offsets 5147 * we have this hack to stop new entries from being returned 5148 * under the assumption that they'll never reach this huge 5149 * offset. 5150 * 5151 * This is being careful not to overflow 32bit loff_t unless the 5152 * last entry requires it because doing so has broken 32bit apps 5153 * in the past. 5154 */ 5155 if (key_type == BTRFS_DIR_INDEX_KEY) { 5156 if (ctx->pos >= INT_MAX) 5157 ctx->pos = LLONG_MAX; 5158 else 5159 ctx->pos = INT_MAX; 5160 } 5161 nopos: 5162 ret = 0; 5163 err: 5164 if (key_type == BTRFS_DIR_INDEX_KEY) 5165 btrfs_put_delayed_items(&ins_list, &del_list); 5166 btrfs_free_path(path); 5167 return ret; 5168 } 5169 5170 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 5171 { 5172 struct btrfs_root *root = BTRFS_I(inode)->root; 5173 struct btrfs_trans_handle *trans; 5174 int ret = 0; 5175 bool nolock = false; 5176 5177 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5178 return 0; 5179 5180 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode)) 5181 nolock = true; 5182 5183 if (wbc->sync_mode == WB_SYNC_ALL) { 5184 if (nolock) 5185 trans = btrfs_join_transaction_nolock(root); 5186 else 5187 trans = btrfs_join_transaction(root); 5188 if (IS_ERR(trans)) 5189 return PTR_ERR(trans); 5190 ret = btrfs_commit_transaction(trans, root); 5191 } 5192 return ret; 5193 } 5194 5195 /* 5196 * This is somewhat expensive, updating the tree every time the 5197 * inode changes. But, it is most likely to find the inode in cache. 
5198 * FIXME, needs more benchmarking...there are no reasons other than performance 5199 * to keep or drop this code. 5200 */ 5201 static int btrfs_dirty_inode(struct inode *inode) 5202 { 5203 struct btrfs_root *root = BTRFS_I(inode)->root; 5204 struct btrfs_trans_handle *trans; 5205 int ret; 5206 5207 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 5208 return 0; 5209 5210 trans = btrfs_join_transaction(root); 5211 if (IS_ERR(trans)) 5212 return PTR_ERR(trans); 5213 5214 ret = btrfs_update_inode(trans, root, inode); 5215 if (ret && ret == -ENOSPC) { 5216 /* whoops, lets try again with the full transaction */ 5217 btrfs_end_transaction(trans, root); 5218 trans = btrfs_start_transaction(root, 1); 5219 if (IS_ERR(trans)) 5220 return PTR_ERR(trans); 5221 5222 ret = btrfs_update_inode(trans, root, inode); 5223 } 5224 btrfs_end_transaction(trans, root); 5225 if (BTRFS_I(inode)->delayed_node) 5226 btrfs_balance_delayed_items(root); 5227 5228 return ret; 5229 } 5230 5231 /* 5232 * This is a copy of file_update_time. We need this so we can return error on 5233 * ENOSPC for updating the inode in the case of file write and mmap writes. 
5234 */ 5235 static int btrfs_update_time(struct inode *inode, struct timespec *now, 5236 int flags) 5237 { 5238 struct btrfs_root *root = BTRFS_I(inode)->root; 5239 5240 if (btrfs_root_readonly(root)) 5241 return -EROFS; 5242 5243 if (flags & S_VERSION) 5244 inode_inc_iversion(inode); 5245 if (flags & S_CTIME) 5246 inode->i_ctime = *now; 5247 if (flags & S_MTIME) 5248 inode->i_mtime = *now; 5249 if (flags & S_ATIME) 5250 inode->i_atime = *now; 5251 return btrfs_dirty_inode(inode); 5252 } 5253 5254 /* 5255 * find the highest existing sequence number in a directory 5256 * and then set the in-memory index_cnt variable to reflect 5257 * free sequence numbers 5258 */ 5259 static int btrfs_set_inode_index_count(struct inode *inode) 5260 { 5261 struct btrfs_root *root = BTRFS_I(inode)->root; 5262 struct btrfs_key key, found_key; 5263 struct btrfs_path *path; 5264 struct extent_buffer *leaf; 5265 int ret; 5266 5267 key.objectid = btrfs_ino(inode); 5268 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 5269 key.offset = (u64)-1; 5270 5271 path = btrfs_alloc_path(); 5272 if (!path) 5273 return -ENOMEM; 5274 5275 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5276 if (ret < 0) 5277 goto out; 5278 /* FIXME: we should be able to handle this */ 5279 if (ret == 0) 5280 goto out; 5281 ret = 0; 5282 5283 /* 5284 * MAGIC NUMBER EXPLANATION: 5285 * since we search a directory based on f_pos we have to start at 2 5286 * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody 5287 * else has to start at 2 5288 */ 5289 if (path->slots[0] == 0) { 5290 BTRFS_I(inode)->index_cnt = 2; 5291 goto out; 5292 } 5293 5294 path->slots[0]--; 5295 5296 leaf = path->nodes[0]; 5297 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5298 5299 if (found_key.objectid != btrfs_ino(inode) || 5300 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 5301 BTRFS_I(inode)->index_cnt = 2; 5302 goto out; 5303 } 5304 5305 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 5306 out: 5307 btrfs_free_path(path); 5308 return ret; 5309 } 5310 5311 /* 5312 * helper to find a free sequence number in a given directory. This current 5313 * code is very simple, later versions will do smarter things in the btree 5314 */ 5315 int btrfs_set_inode_index(struct inode *dir, u64 *index) 5316 { 5317 int ret = 0; 5318 5319 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 5320 ret = btrfs_inode_delayed_dir_index_count(dir); 5321 if (ret) { 5322 ret = btrfs_set_inode_index_count(dir); 5323 if (ret) 5324 return ret; 5325 } 5326 } 5327 5328 *index = BTRFS_I(dir)->index_cnt; 5329 BTRFS_I(dir)->index_cnt++; 5330 5331 return ret; 5332 } 5333 5334 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 5335 struct btrfs_root *root, 5336 struct inode *dir, 5337 const char *name, int name_len, 5338 u64 ref_objectid, u64 objectid, 5339 umode_t mode, u64 *index) 5340 { 5341 struct inode *inode; 5342 struct btrfs_inode_item *inode_item; 5343 struct btrfs_key *location; 5344 struct btrfs_path *path; 5345 struct btrfs_inode_ref *ref; 5346 struct btrfs_key key[2]; 5347 u32 sizes[2]; 5348 unsigned long ptr; 5349 int ret; 5350 int owner; 5351 5352 path = btrfs_alloc_path(); 5353 if (!path) 5354 return ERR_PTR(-ENOMEM); 5355 5356 inode = new_inode(root->fs_info->sb); 5357 if (!inode) { 5358 btrfs_free_path(path); 5359 return ERR_PTR(-ENOMEM); 5360 } 5361 5362 /* 5363 * we have to initialize this early, so we can reclaim the inode 5364 * 
number if we fail afterwards in this function. 5365 */ 5366 inode->i_ino = objectid; 5367 5368 if (dir) { 5369 trace_btrfs_inode_request(dir); 5370 5371 ret = btrfs_set_inode_index(dir, index); 5372 if (ret) { 5373 btrfs_free_path(path); 5374 iput(inode); 5375 return ERR_PTR(ret); 5376 } 5377 } 5378 /* 5379 * index_cnt is ignored for everything but a dir, 5380 * btrfs_get_inode_index_count has an explanation for the magic 5381 * number 5382 */ 5383 BTRFS_I(inode)->index_cnt = 2; 5384 BTRFS_I(inode)->root = root; 5385 BTRFS_I(inode)->generation = trans->transid; 5386 inode->i_generation = BTRFS_I(inode)->generation; 5387 5388 /* 5389 * We could have gotten an inode number from somebody who was fsynced 5390 * and then removed in this same transaction, so let's just set full 5391 * sync since it will be a full sync anyway and this will blow away the 5392 * old info in the log. 5393 */ 5394 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5395 5396 if (S_ISDIR(mode)) 5397 owner = 0; 5398 else 5399 owner = 1; 5400 5401 key[0].objectid = objectid; 5402 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5403 key[0].offset = 0; 5404 5405 /* 5406 * Start new inodes with an inode_ref. This is slightly more 5407 * efficient for small numbers of hard links since they will 5408 * be packed into one item. Extended refs will kick in if we 5409 * add more hard links than can fit in the ref item. 
5410 */ 5411 key[1].objectid = objectid; 5412 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 5413 key[1].offset = ref_objectid; 5414 5415 sizes[0] = sizeof(struct btrfs_inode_item); 5416 sizes[1] = name_len + sizeof(*ref); 5417 5418 path->leave_spinning = 1; 5419 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 5420 if (ret != 0) 5421 goto fail; 5422 5423 inode_init_owner(inode, dir, mode); 5424 inode_set_bytes(inode, 0); 5425 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5426 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5427 struct btrfs_inode_item); 5428 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 5429 sizeof(*inode_item)); 5430 fill_inode_item(trans, path->nodes[0], inode_item, inode); 5431 5432 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 5433 struct btrfs_inode_ref); 5434 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 5435 btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 5436 ptr = (unsigned long)(ref + 1); 5437 write_extent_buffer(path->nodes[0], name, ptr, name_len); 5438 5439 btrfs_mark_buffer_dirty(path->nodes[0]); 5440 btrfs_free_path(path); 5441 5442 location = &BTRFS_I(inode)->location; 5443 location->objectid = objectid; 5444 location->offset = 0; 5445 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 5446 5447 btrfs_inherit_iflags(inode, dir); 5448 5449 if (S_ISREG(mode)) { 5450 if (btrfs_test_opt(root, NODATASUM)) 5451 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 5452 if (btrfs_test_opt(root, NODATACOW)) 5453 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 5454 BTRFS_INODE_NODATASUM; 5455 } 5456 5457 insert_inode_hash(inode); 5458 inode_tree_add(inode); 5459 5460 trace_btrfs_inode_new(inode); 5461 btrfs_set_inode_last_trans(trans, inode); 5462 5463 btrfs_update_root_times(trans, root); 5464 5465 return inode; 5466 fail: 5467 if (dir) 5468 BTRFS_I(dir)->index_cnt--; 5469 btrfs_free_path(path); 5470 iput(inode); 5471 return ERR_PTR(ret); 
5472 } 5473 5474 static inline u8 btrfs_inode_type(struct inode *inode) 5475 { 5476 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 5477 } 5478 5479 /* 5480 * utility function to add 'inode' into 'parent_inode' with 5481 * a give name and a given sequence number. 5482 * if 'add_backref' is true, also insert a backref from the 5483 * inode to the parent directory. 5484 */ 5485 int btrfs_add_link(struct btrfs_trans_handle *trans, 5486 struct inode *parent_inode, struct inode *inode, 5487 const char *name, int name_len, int add_backref, u64 index) 5488 { 5489 int ret = 0; 5490 struct btrfs_key key; 5491 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 5492 u64 ino = btrfs_ino(inode); 5493 u64 parent_ino = btrfs_ino(parent_inode); 5494 5495 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5496 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 5497 } else { 5498 key.objectid = ino; 5499 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 5500 key.offset = 0; 5501 } 5502 5503 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5504 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 5505 key.objectid, root->root_key.objectid, 5506 parent_ino, index, name, name_len); 5507 } else if (add_backref) { 5508 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, 5509 parent_ino, index); 5510 } 5511 5512 /* Nothing to clean up yet */ 5513 if (ret) 5514 return ret; 5515 5516 ret = btrfs_insert_dir_item(trans, root, name, name_len, 5517 parent_inode, &key, 5518 btrfs_inode_type(inode), index); 5519 if (ret == -EEXIST || ret == -EOVERFLOW) 5520 goto fail_dir_item; 5521 else if (ret) { 5522 btrfs_abort_transaction(trans, root, ret); 5523 return ret; 5524 } 5525 5526 btrfs_i_size_write(parent_inode, parent_inode->i_size + 5527 name_len * 2); 5528 inode_inc_iversion(parent_inode); 5529 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 5530 ret = btrfs_update_inode(trans, root, parent_inode); 5531 if (ret) 5532 
btrfs_abort_transaction(trans, root, ret); 5533 return ret; 5534 5535 fail_dir_item: 5536 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { 5537 u64 local_index; 5538 int err; 5539 err = btrfs_del_root_ref(trans, root->fs_info->tree_root, 5540 key.objectid, root->root_key.objectid, 5541 parent_ino, &local_index, name, name_len); 5542 5543 } else if (add_backref) { 5544 u64 local_index; 5545 int err; 5546 5547 err = btrfs_del_inode_ref(trans, root, name, name_len, 5548 ino, parent_ino, &local_index); 5549 } 5550 return ret; 5551 } 5552 5553 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 5554 struct inode *dir, struct dentry *dentry, 5555 struct inode *inode, int backref, u64 index) 5556 { 5557 int err = btrfs_add_link(trans, dir, inode, 5558 dentry->d_name.name, dentry->d_name.len, 5559 backref, index); 5560 if (err > 0) 5561 err = -EEXIST; 5562 return err; 5563 } 5564 5565 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 5566 umode_t mode, dev_t rdev) 5567 { 5568 struct btrfs_trans_handle *trans; 5569 struct btrfs_root *root = BTRFS_I(dir)->root; 5570 struct inode *inode = NULL; 5571 int err; 5572 int drop_inode = 0; 5573 u64 objectid; 5574 u64 index = 0; 5575 5576 if (!new_valid_dev(rdev)) 5577 return -EINVAL; 5578 5579 /* 5580 * 2 for inode item and ref 5581 * 2 for dir items 5582 * 1 for xattr if selinux is on 5583 */ 5584 trans = btrfs_start_transaction(root, 5); 5585 if (IS_ERR(trans)) 5586 return PTR_ERR(trans); 5587 5588 err = btrfs_find_free_ino(root, &objectid); 5589 if (err) 5590 goto out_unlock; 5591 5592 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5593 dentry->d_name.len, btrfs_ino(dir), objectid, 5594 mode, &index); 5595 if (IS_ERR(inode)) { 5596 err = PTR_ERR(inode); 5597 goto out_unlock; 5598 } 5599 5600 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5601 if (err) { 5602 drop_inode = 1; 5603 goto out_unlock; 5604 } 5605 5606 /* 5607 * If the active LSM wants to access the inode 
during 5608 * d_instantiate it needs these. Smack checks to see 5609 * if the filesystem supports xattrs by looking at the 5610 * ops vector. 5611 */ 5612 5613 inode->i_op = &btrfs_special_inode_operations; 5614 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5615 if (err) 5616 drop_inode = 1; 5617 else { 5618 init_special_inode(inode, inode->i_mode, rdev); 5619 btrfs_update_inode(trans, root, inode); 5620 d_instantiate(dentry, inode); 5621 } 5622 out_unlock: 5623 btrfs_end_transaction(trans, root); 5624 btrfs_btree_balance_dirty(root); 5625 if (drop_inode) { 5626 inode_dec_link_count(inode); 5627 iput(inode); 5628 } 5629 return err; 5630 } 5631 5632 static int btrfs_create(struct inode *dir, struct dentry *dentry, 5633 umode_t mode, bool excl) 5634 { 5635 struct btrfs_trans_handle *trans; 5636 struct btrfs_root *root = BTRFS_I(dir)->root; 5637 struct inode *inode = NULL; 5638 int drop_inode_on_err = 0; 5639 int err; 5640 u64 objectid; 5641 u64 index = 0; 5642 5643 /* 5644 * 2 for inode item and ref 5645 * 2 for dir items 5646 * 1 for xattr if selinux is on 5647 */ 5648 trans = btrfs_start_transaction(root, 5); 5649 if (IS_ERR(trans)) 5650 return PTR_ERR(trans); 5651 5652 err = btrfs_find_free_ino(root, &objectid); 5653 if (err) 5654 goto out_unlock; 5655 5656 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5657 dentry->d_name.len, btrfs_ino(dir), objectid, 5658 mode, &index); 5659 if (IS_ERR(inode)) { 5660 err = PTR_ERR(inode); 5661 goto out_unlock; 5662 } 5663 drop_inode_on_err = 1; 5664 5665 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5666 if (err) 5667 goto out_unlock; 5668 5669 err = btrfs_update_inode(trans, root, inode); 5670 if (err) 5671 goto out_unlock; 5672 5673 /* 5674 * If the active LSM wants to access the inode during 5675 * d_instantiate it needs these. Smack checks to see 5676 * if the filesystem supports xattrs by looking at the 5677 * ops vector. 
5678 */ 5679 inode->i_fop = &btrfs_file_operations; 5680 inode->i_op = &btrfs_file_inode_operations; 5681 5682 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5683 if (err) 5684 goto out_unlock; 5685 5686 inode->i_mapping->a_ops = &btrfs_aops; 5687 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5688 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5689 d_instantiate(dentry, inode); 5690 5691 out_unlock: 5692 btrfs_end_transaction(trans, root); 5693 if (err && drop_inode_on_err) { 5694 inode_dec_link_count(inode); 5695 iput(inode); 5696 } 5697 btrfs_btree_balance_dirty(root); 5698 return err; 5699 } 5700 5701 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 5702 struct dentry *dentry) 5703 { 5704 struct btrfs_trans_handle *trans; 5705 struct btrfs_root *root = BTRFS_I(dir)->root; 5706 struct inode *inode = old_dentry->d_inode; 5707 u64 index; 5708 int err; 5709 int drop_inode = 0; 5710 5711 /* do not allow sys_link's with other subvols of the same device */ 5712 if (root->objectid != BTRFS_I(inode)->root->objectid) 5713 return -EXDEV; 5714 5715 if (inode->i_nlink >= BTRFS_LINK_MAX) 5716 return -EMLINK; 5717 5718 err = btrfs_set_inode_index(dir, &index); 5719 if (err) 5720 goto fail; 5721 5722 /* 5723 * 2 items for inode and inode ref 5724 * 2 items for dir items 5725 * 1 item for parent inode 5726 */ 5727 trans = btrfs_start_transaction(root, 5); 5728 if (IS_ERR(trans)) { 5729 err = PTR_ERR(trans); 5730 goto fail; 5731 } 5732 5733 btrfs_inc_nlink(inode); 5734 inode_inc_iversion(inode); 5735 inode->i_ctime = CURRENT_TIME; 5736 ihold(inode); 5737 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 5738 5739 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5740 5741 if (err) { 5742 drop_inode = 1; 5743 } else { 5744 struct dentry *parent = dentry->d_parent; 5745 err = btrfs_update_inode(trans, root, inode); 5746 if (err) 5747 goto fail; 5748 d_instantiate(dentry, inode); 5749 
btrfs_log_new_name(trans, inode, NULL, parent); 5750 } 5751 5752 btrfs_end_transaction(trans, root); 5753 fail: 5754 if (drop_inode) { 5755 inode_dec_link_count(inode); 5756 iput(inode); 5757 } 5758 btrfs_btree_balance_dirty(root); 5759 return err; 5760 } 5761 5762 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 5763 { 5764 struct inode *inode = NULL; 5765 struct btrfs_trans_handle *trans; 5766 struct btrfs_root *root = BTRFS_I(dir)->root; 5767 int err = 0; 5768 int drop_on_err = 0; 5769 u64 objectid = 0; 5770 u64 index = 0; 5771 5772 /* 5773 * 2 items for inode and ref 5774 * 2 items for dir items 5775 * 1 for xattr if selinux is on 5776 */ 5777 trans = btrfs_start_transaction(root, 5); 5778 if (IS_ERR(trans)) 5779 return PTR_ERR(trans); 5780 5781 err = btrfs_find_free_ino(root, &objectid); 5782 if (err) 5783 goto out_fail; 5784 5785 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 5786 dentry->d_name.len, btrfs_ino(dir), objectid, 5787 S_IFDIR | mode, &index); 5788 if (IS_ERR(inode)) { 5789 err = PTR_ERR(inode); 5790 goto out_fail; 5791 } 5792 5793 drop_on_err = 1; 5794 5795 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5796 if (err) 5797 goto out_fail; 5798 5799 inode->i_op = &btrfs_dir_inode_operations; 5800 inode->i_fop = &btrfs_dir_file_operations; 5801 5802 btrfs_i_size_write(inode, 0); 5803 err = btrfs_update_inode(trans, root, inode); 5804 if (err) 5805 goto out_fail; 5806 5807 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 5808 dentry->d_name.len, 0, index); 5809 if (err) 5810 goto out_fail; 5811 5812 d_instantiate(dentry, inode); 5813 drop_on_err = 0; 5814 5815 out_fail: 5816 btrfs_end_transaction(trans, root); 5817 if (drop_on_err) 5818 iput(inode); 5819 btrfs_btree_balance_dirty(root); 5820 return err; 5821 } 5822 5823 /* helper for btfs_get_extent. 
Given an existing extent in the tree, 5824 * and an extent that you want to insert, deal with overlap and insert 5825 * the new extent into the tree. 5826 */ 5827 static int merge_extent_mapping(struct extent_map_tree *em_tree, 5828 struct extent_map *existing, 5829 struct extent_map *em, 5830 u64 map_start, u64 map_len) 5831 { 5832 u64 start_diff; 5833 5834 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 5835 start_diff = map_start - em->start; 5836 em->start = map_start; 5837 em->len = map_len; 5838 if (em->block_start < EXTENT_MAP_LAST_BYTE && 5839 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 5840 em->block_start += start_diff; 5841 em->block_len -= start_diff; 5842 } 5843 return add_extent_mapping(em_tree, em, 0); 5844 } 5845 5846 static noinline int uncompress_inline(struct btrfs_path *path, 5847 struct inode *inode, struct page *page, 5848 size_t pg_offset, u64 extent_offset, 5849 struct btrfs_file_extent_item *item) 5850 { 5851 int ret; 5852 struct extent_buffer *leaf = path->nodes[0]; 5853 char *tmp; 5854 size_t max_size; 5855 unsigned long inline_size; 5856 unsigned long ptr; 5857 int compress_type; 5858 5859 WARN_ON(pg_offset != 0); 5860 compress_type = btrfs_file_extent_compression(leaf, item); 5861 max_size = btrfs_file_extent_ram_bytes(leaf, item); 5862 inline_size = btrfs_file_extent_inline_item_len(leaf, 5863 btrfs_item_nr(leaf, path->slots[0])); 5864 tmp = kmalloc(inline_size, GFP_NOFS); 5865 if (!tmp) 5866 return -ENOMEM; 5867 ptr = btrfs_file_extent_inline_start(item); 5868 5869 read_extent_buffer(leaf, tmp, ptr, inline_size); 5870 5871 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5872 ret = btrfs_decompress(compress_type, tmp, page, 5873 extent_offset, inline_size, max_size); 5874 if (ret) { 5875 char *kaddr = kmap_atomic(page); 5876 unsigned long copy_size = min_t(u64, 5877 PAGE_CACHE_SIZE - pg_offset, 5878 max_size - extent_offset); 5879 memset(kaddr + pg_offset, 0, copy_size); 5880 kunmap_atomic(kaddr); 
5881 } 5882 kfree(tmp); 5883 return 0; 5884 } 5885 5886 /* 5887 * a bit scary, this does extent mapping from logical file offset to the disk. 5888 * the ugly parts come from merging extents from the disk with the in-ram 5889 * representation. This gets more complex because of the data=ordered code, 5890 * where the in-ram extents might be locked pending data=ordered completion. 5891 * 5892 * This also copies inline extents directly into the page. 5893 */ 5894 5895 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5896 size_t pg_offset, u64 start, u64 len, 5897 int create) 5898 { 5899 int ret; 5900 int err = 0; 5901 u64 bytenr; 5902 u64 extent_start = 0; 5903 u64 extent_end = 0; 5904 u64 objectid = btrfs_ino(inode); 5905 u32 found_type; 5906 struct btrfs_path *path = NULL; 5907 struct btrfs_root *root = BTRFS_I(inode)->root; 5908 struct btrfs_file_extent_item *item; 5909 struct extent_buffer *leaf; 5910 struct btrfs_key found_key; 5911 struct extent_map *em = NULL; 5912 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5913 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5914 struct btrfs_trans_handle *trans = NULL; 5915 int compress_type; 5916 5917 again: 5918 read_lock(&em_tree->lock); 5919 em = lookup_extent_mapping(em_tree, start, len); 5920 if (em) 5921 em->bdev = root->fs_info->fs_devices->latest_bdev; 5922 read_unlock(&em_tree->lock); 5923 5924 if (em) { 5925 if (em->start > start || em->start + em->len <= start) 5926 free_extent_map(em); 5927 else if (em->block_start == EXTENT_MAP_INLINE && page) 5928 free_extent_map(em); 5929 else 5930 goto out; 5931 } 5932 em = alloc_extent_map(); 5933 if (!em) { 5934 err = -ENOMEM; 5935 goto out; 5936 } 5937 em->bdev = root->fs_info->fs_devices->latest_bdev; 5938 em->start = EXTENT_MAP_HOLE; 5939 em->orig_start = EXTENT_MAP_HOLE; 5940 em->len = (u64)-1; 5941 em->block_len = (u64)-1; 5942 5943 if (!path) { 5944 path = btrfs_alloc_path(); 5945 if (!path) { 5946 err = 
-ENOMEM; 5947 goto out; 5948 } 5949 /* 5950 * Chances are we'll be called again, so go ahead and do 5951 * readahead 5952 */ 5953 path->reada = 1; 5954 } 5955 5956 ret = btrfs_lookup_file_extent(trans, root, path, 5957 objectid, start, trans != NULL); 5958 if (ret < 0) { 5959 err = ret; 5960 goto out; 5961 } 5962 5963 if (ret != 0) { 5964 if (path->slots[0] == 0) 5965 goto not_found; 5966 path->slots[0]--; 5967 } 5968 5969 leaf = path->nodes[0]; 5970 item = btrfs_item_ptr(leaf, path->slots[0], 5971 struct btrfs_file_extent_item); 5972 /* are we inside the extent that was found? */ 5973 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5974 found_type = btrfs_key_type(&found_key); 5975 if (found_key.objectid != objectid || 5976 found_type != BTRFS_EXTENT_DATA_KEY) { 5977 goto not_found; 5978 } 5979 5980 found_type = btrfs_file_extent_type(leaf, item); 5981 extent_start = found_key.offset; 5982 compress_type = btrfs_file_extent_compression(leaf, item); 5983 if (found_type == BTRFS_FILE_EXTENT_REG || 5984 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5985 extent_end = extent_start + 5986 btrfs_file_extent_num_bytes(leaf, item); 5987 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5988 size_t size; 5989 size = btrfs_file_extent_inline_len(leaf, item); 5990 extent_end = ALIGN(extent_start + size, root->sectorsize); 5991 } 5992 5993 if (start >= extent_end) { 5994 path->slots[0]++; 5995 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5996 ret = btrfs_next_leaf(root, path); 5997 if (ret < 0) { 5998 err = ret; 5999 goto out; 6000 } 6001 if (ret > 0) 6002 goto not_found; 6003 leaf = path->nodes[0]; 6004 } 6005 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6006 if (found_key.objectid != objectid || 6007 found_key.type != BTRFS_EXTENT_DATA_KEY) 6008 goto not_found; 6009 if (start + len <= found_key.offset) 6010 goto not_found; 6011 em->start = start; 6012 em->orig_start = start; 6013 em->len = found_key.offset - start; 6014 goto not_found_em; 6015 } 
6016 6017 em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 6018 if (found_type == BTRFS_FILE_EXTENT_REG || 6019 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 6020 em->start = extent_start; 6021 em->len = extent_end - extent_start; 6022 em->orig_start = extent_start - 6023 btrfs_file_extent_offset(leaf, item); 6024 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, 6025 item); 6026 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 6027 if (bytenr == 0) { 6028 em->block_start = EXTENT_MAP_HOLE; 6029 goto insert; 6030 } 6031 if (compress_type != BTRFS_COMPRESS_NONE) { 6032 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 6033 em->compress_type = compress_type; 6034 em->block_start = bytenr; 6035 em->block_len = em->orig_block_len; 6036 } else { 6037 bytenr += btrfs_file_extent_offset(leaf, item); 6038 em->block_start = bytenr; 6039 em->block_len = em->len; 6040 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 6041 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 6042 } 6043 goto insert; 6044 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6045 unsigned long ptr; 6046 char *map; 6047 size_t size; 6048 size_t extent_offset; 6049 size_t copy_size; 6050 6051 em->block_start = EXTENT_MAP_INLINE; 6052 if (!page || create) { 6053 em->start = extent_start; 6054 em->len = extent_end - extent_start; 6055 goto out; 6056 } 6057 6058 size = btrfs_file_extent_inline_len(leaf, item); 6059 extent_offset = page_offset(page) + pg_offset - extent_start; 6060 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6061 size - extent_offset); 6062 em->start = extent_start + extent_offset; 6063 em->len = ALIGN(copy_size, root->sectorsize); 6064 em->orig_block_len = em->len; 6065 em->orig_start = em->start; 6066 if (compress_type) { 6067 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 6068 em->compress_type = compress_type; 6069 } 6070 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 6071 if (create == 0 && !PageUptodate(page)) { 6072 if (btrfs_file_extent_compression(leaf, 
item) != 6073 BTRFS_COMPRESS_NONE) { 6074 ret = uncompress_inline(path, inode, page, 6075 pg_offset, 6076 extent_offset, item); 6077 BUG_ON(ret); /* -ENOMEM */ 6078 } else { 6079 map = kmap(page); 6080 read_extent_buffer(leaf, map + pg_offset, ptr, 6081 copy_size); 6082 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 6083 memset(map + pg_offset + copy_size, 0, 6084 PAGE_CACHE_SIZE - pg_offset - 6085 copy_size); 6086 } 6087 kunmap(page); 6088 } 6089 flush_dcache_page(page); 6090 } else if (create && PageUptodate(page)) { 6091 BUG(); 6092 if (!trans) { 6093 kunmap(page); 6094 free_extent_map(em); 6095 em = NULL; 6096 6097 btrfs_release_path(path); 6098 trans = btrfs_join_transaction(root); 6099 6100 if (IS_ERR(trans)) 6101 return ERR_CAST(trans); 6102 goto again; 6103 } 6104 map = kmap(page); 6105 write_extent_buffer(leaf, map + pg_offset, ptr, 6106 copy_size); 6107 kunmap(page); 6108 btrfs_mark_buffer_dirty(leaf); 6109 } 6110 set_extent_uptodate(io_tree, em->start, 6111 extent_map_end(em) - 1, NULL, GFP_NOFS); 6112 goto insert; 6113 } else { 6114 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); 6115 } 6116 not_found: 6117 em->start = start; 6118 em->orig_start = start; 6119 em->len = len; 6120 not_found_em: 6121 em->block_start = EXTENT_MAP_HOLE; 6122 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 6123 insert: 6124 btrfs_release_path(path); 6125 if (em->start > start || extent_map_end(em) <= start) { 6126 btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", 6127 em->start, em->len, start, len); 6128 err = -EIO; 6129 goto out; 6130 } 6131 6132 err = 0; 6133 write_lock(&em_tree->lock); 6134 ret = add_extent_mapping(em_tree, em, 0); 6135 /* it is possible that someone inserted the extent into the tree 6136 * while we had the lock dropped. 
It is also possible that 6137 * an overlapping map exists in the tree 6138 */ 6139 if (ret == -EEXIST) { 6140 struct extent_map *existing; 6141 6142 ret = 0; 6143 6144 existing = lookup_extent_mapping(em_tree, start, len); 6145 if (existing && (existing->start > start || 6146 existing->start + existing->len <= start)) { 6147 free_extent_map(existing); 6148 existing = NULL; 6149 } 6150 if (!existing) { 6151 existing = lookup_extent_mapping(em_tree, em->start, 6152 em->len); 6153 if (existing) { 6154 err = merge_extent_mapping(em_tree, existing, 6155 em, start, 6156 root->sectorsize); 6157 free_extent_map(existing); 6158 if (err) { 6159 free_extent_map(em); 6160 em = NULL; 6161 } 6162 } else { 6163 err = -EIO; 6164 free_extent_map(em); 6165 em = NULL; 6166 } 6167 } else { 6168 free_extent_map(em); 6169 em = existing; 6170 err = 0; 6171 } 6172 } 6173 write_unlock(&em_tree->lock); 6174 out: 6175 6176 if (em) 6177 trace_btrfs_get_extent(root, em); 6178 6179 if (path) 6180 btrfs_free_path(path); 6181 if (trans) { 6182 ret = btrfs_end_transaction(trans, root); 6183 if (!err) 6184 err = ret; 6185 } 6186 if (err) { 6187 free_extent_map(em); 6188 return ERR_PTR(err); 6189 } 6190 BUG_ON(!em); /* Error is always set */ 6191 return em; 6192 } 6193 6194 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 6195 size_t pg_offset, u64 start, u64 len, 6196 int create) 6197 { 6198 struct extent_map *em; 6199 struct extent_map *hole_em = NULL; 6200 u64 range_start = start; 6201 u64 end; 6202 u64 found; 6203 u64 found_end; 6204 int err = 0; 6205 6206 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 6207 if (IS_ERR(em)) 6208 return em; 6209 if (em) { 6210 /* 6211 * if our em maps to 6212 * - a hole or 6213 * - a pre-alloc extent, 6214 * there might actually be delalloc bytes behind it. 
6215 */ 6216 if (em->block_start != EXTENT_MAP_HOLE && 6217 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6218 return em; 6219 else 6220 hole_em = em; 6221 } 6222 6223 /* check to see if we've wrapped (len == -1 or similar) */ 6224 end = start + len; 6225 if (end < start) 6226 end = (u64)-1; 6227 else 6228 end -= 1; 6229 6230 em = NULL; 6231 6232 /* ok, we didn't find anything, lets look for delalloc */ 6233 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 6234 end, len, EXTENT_DELALLOC, 1); 6235 found_end = range_start + found; 6236 if (found_end < range_start) 6237 found_end = (u64)-1; 6238 6239 /* 6240 * we didn't find anything useful, return 6241 * the original results from get_extent() 6242 */ 6243 if (range_start > end || found_end <= start) { 6244 em = hole_em; 6245 hole_em = NULL; 6246 goto out; 6247 } 6248 6249 /* adjust the range_start to make sure it doesn't 6250 * go backwards from the start they passed in 6251 */ 6252 range_start = max(start,range_start); 6253 found = found_end - range_start; 6254 6255 if (found > 0) { 6256 u64 hole_start = start; 6257 u64 hole_len = len; 6258 6259 em = alloc_extent_map(); 6260 if (!em) { 6261 err = -ENOMEM; 6262 goto out; 6263 } 6264 /* 6265 * when btrfs_get_extent can't find anything it 6266 * returns one huge hole 6267 * 6268 * make sure what it found really fits our range, and 6269 * adjust to make sure it is based on the start from 6270 * the caller 6271 */ 6272 if (hole_em) { 6273 u64 calc_end = extent_map_end(hole_em); 6274 6275 if (calc_end <= start || (hole_em->start > end)) { 6276 free_extent_map(hole_em); 6277 hole_em = NULL; 6278 } else { 6279 hole_start = max(hole_em->start, start); 6280 hole_len = calc_end - hole_start; 6281 } 6282 } 6283 em->bdev = NULL; 6284 if (hole_em && range_start > hole_start) { 6285 /* our hole starts before our delalloc, so we 6286 * have to return just the parts of the hole 6287 * that go until the delalloc starts 6288 */ 6289 em->len = min(hole_len, 6290 
range_start - hole_start); 6291 em->start = hole_start; 6292 em->orig_start = hole_start; 6293 /* 6294 * don't adjust block start at all, 6295 * it is fixed at EXTENT_MAP_HOLE 6296 */ 6297 em->block_start = hole_em->block_start; 6298 em->block_len = hole_len; 6299 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) 6300 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 6301 } else { 6302 em->start = range_start; 6303 em->len = found; 6304 em->orig_start = range_start; 6305 em->block_start = EXTENT_MAP_DELALLOC; 6306 em->block_len = found; 6307 } 6308 } else if (hole_em) { 6309 return hole_em; 6310 } 6311 out: 6312 6313 free_extent_map(hole_em); 6314 if (err) { 6315 free_extent_map(em); 6316 return ERR_PTR(err); 6317 } 6318 return em; 6319 } 6320 6321 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 6322 u64 start, u64 len) 6323 { 6324 struct btrfs_root *root = BTRFS_I(inode)->root; 6325 struct extent_map *em; 6326 struct btrfs_key ins; 6327 u64 alloc_hint; 6328 int ret; 6329 6330 alloc_hint = get_extent_allocation_hint(inode, start, len); 6331 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, 6332 alloc_hint, &ins, 1); 6333 if (ret) 6334 return ERR_PTR(ret); 6335 6336 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, 6337 ins.offset, ins.offset, ins.offset, 0); 6338 if (IS_ERR(em)) { 6339 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 6340 return em; 6341 } 6342 6343 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 6344 ins.offset, ins.offset, 0); 6345 if (ret) { 6346 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 6347 free_extent_map(em); 6348 return ERR_PTR(ret); 6349 } 6350 6351 return em; 6352 } 6353 6354 /* 6355 * returns 1 when the nocow is safe, < 1 on error, 0 if the 6356 * block must be cow'd 6357 */ 6358 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 6359 u64 *orig_start, u64 *orig_block_len, 6360 u64 *ram_bytes) 6361 { 6362 struct 
btrfs_trans_handle *trans; 6363 struct btrfs_path *path; 6364 int ret; 6365 struct extent_buffer *leaf; 6366 struct btrfs_root *root = BTRFS_I(inode)->root; 6367 struct btrfs_file_extent_item *fi; 6368 struct btrfs_key key; 6369 u64 disk_bytenr; 6370 u64 backref_offset; 6371 u64 extent_end; 6372 u64 num_bytes; 6373 int slot; 6374 int found_type; 6375 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 6376 path = btrfs_alloc_path(); 6377 if (!path) 6378 return -ENOMEM; 6379 6380 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 6381 offset, 0); 6382 if (ret < 0) 6383 goto out; 6384 6385 slot = path->slots[0]; 6386 if (ret == 1) { 6387 if (slot == 0) { 6388 /* can't find the item, must cow */ 6389 ret = 0; 6390 goto out; 6391 } 6392 slot--; 6393 } 6394 ret = 0; 6395 leaf = path->nodes[0]; 6396 btrfs_item_key_to_cpu(leaf, &key, slot); 6397 if (key.objectid != btrfs_ino(inode) || 6398 key.type != BTRFS_EXTENT_DATA_KEY) { 6399 /* not our file or wrong item type, must cow */ 6400 goto out; 6401 } 6402 6403 if (key.offset > offset) { 6404 /* Wrong offset, must cow */ 6405 goto out; 6406 } 6407 6408 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 6409 found_type = btrfs_file_extent_type(leaf, fi); 6410 if (found_type != BTRFS_FILE_EXTENT_REG && 6411 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 6412 /* not a regular extent, must cow */ 6413 goto out; 6414 } 6415 6416 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 6417 goto out; 6418 6419 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6420 if (disk_bytenr == 0) 6421 goto out; 6422 6423 if (btrfs_file_extent_compression(leaf, fi) || 6424 btrfs_file_extent_encryption(leaf, fi) || 6425 btrfs_file_extent_other_encoding(leaf, fi)) 6426 goto out; 6427 6428 backref_offset = btrfs_file_extent_offset(leaf, fi); 6429 6430 if (orig_start) { 6431 *orig_start = key.offset - backref_offset; 6432 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6433 *ram_bytes = 
btrfs_file_extent_ram_bytes(leaf, fi); 6434 } 6435 6436 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6437 6438 if (btrfs_extent_readonly(root, disk_bytenr)) 6439 goto out; 6440 6441 /* 6442 * look for other files referencing this extent, if we 6443 * find any we must cow 6444 */ 6445 trans = btrfs_join_transaction(root); 6446 if (IS_ERR(trans)) { 6447 ret = 0; 6448 goto out; 6449 } 6450 6451 ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), 6452 key.offset - backref_offset, disk_bytenr); 6453 btrfs_end_transaction(trans, root); 6454 if (ret) { 6455 ret = 0; 6456 goto out; 6457 } 6458 6459 /* 6460 * adjust disk_bytenr and num_bytes to cover just the bytes 6461 * in this extent we are about to write. If there 6462 * are any csums in that range we have to cow in order 6463 * to keep the csums correct 6464 */ 6465 disk_bytenr += backref_offset; 6466 disk_bytenr += offset - key.offset; 6467 num_bytes = min(offset + *len, extent_end) - offset; 6468 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6469 goto out; 6470 /* 6471 * all of the above have passed, it is safe to overwrite this extent 6472 * without cow 6473 */ 6474 *len = num_bytes; 6475 ret = 1; 6476 out: 6477 btrfs_free_path(path); 6478 return ret; 6479 } 6480 6481 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 6482 struct extent_state **cached_state, int writing) 6483 { 6484 struct btrfs_ordered_extent *ordered; 6485 int ret = 0; 6486 6487 while (1) { 6488 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6489 0, cached_state); 6490 /* 6491 * We're concerned with the entire range that we're going to be 6492 * doing DIO to, so we need to make sure theres no ordered 6493 * extents in this range. 
6494 */ 6495 ordered = btrfs_lookup_ordered_range(inode, lockstart, 6496 lockend - lockstart + 1); 6497 6498 /* 6499 * We need to make sure there are no buffered pages in this 6500 * range either, we could have raced between the invalidate in 6501 * generic_file_direct_write and locking the extent. The 6502 * invalidate needs to happen so that reads after a write do not 6503 * get stale data. 6504 */ 6505 if (!ordered && (!writing || 6506 !test_range_bit(&BTRFS_I(inode)->io_tree, 6507 lockstart, lockend, EXTENT_UPTODATE, 0, 6508 *cached_state))) 6509 break; 6510 6511 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6512 cached_state, GFP_NOFS); 6513 6514 if (ordered) { 6515 btrfs_start_ordered_extent(inode, ordered, 1); 6516 btrfs_put_ordered_extent(ordered); 6517 } else { 6518 /* Screw you mmap */ 6519 ret = filemap_write_and_wait_range(inode->i_mapping, 6520 lockstart, 6521 lockend); 6522 if (ret) 6523 break; 6524 6525 /* 6526 * If we found a page that couldn't be invalidated just 6527 * fall back to buffered. 
6528 */ 6529 ret = invalidate_inode_pages2_range(inode->i_mapping, 6530 lockstart >> PAGE_CACHE_SHIFT, 6531 lockend >> PAGE_CACHE_SHIFT); 6532 if (ret) 6533 break; 6534 } 6535 6536 cond_resched(); 6537 } 6538 6539 return ret; 6540 } 6541 6542 static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 6543 u64 len, u64 orig_start, 6544 u64 block_start, u64 block_len, 6545 u64 orig_block_len, u64 ram_bytes, 6546 int type) 6547 { 6548 struct extent_map_tree *em_tree; 6549 struct extent_map *em; 6550 struct btrfs_root *root = BTRFS_I(inode)->root; 6551 int ret; 6552 6553 em_tree = &BTRFS_I(inode)->extent_tree; 6554 em = alloc_extent_map(); 6555 if (!em) 6556 return ERR_PTR(-ENOMEM); 6557 6558 em->start = start; 6559 em->orig_start = orig_start; 6560 em->mod_start = start; 6561 em->mod_len = len; 6562 em->len = len; 6563 em->block_len = block_len; 6564 em->block_start = block_start; 6565 em->bdev = root->fs_info->fs_devices->latest_bdev; 6566 em->orig_block_len = orig_block_len; 6567 em->ram_bytes = ram_bytes; 6568 em->generation = -1; 6569 set_bit(EXTENT_FLAG_PINNED, &em->flags); 6570 if (type == BTRFS_ORDERED_PREALLOC) 6571 set_bit(EXTENT_FLAG_FILLING, &em->flags); 6572 6573 do { 6574 btrfs_drop_extent_cache(inode, em->start, 6575 em->start + em->len - 1, 0); 6576 write_lock(&em_tree->lock); 6577 ret = add_extent_mapping(em_tree, em, 1); 6578 write_unlock(&em_tree->lock); 6579 } while (ret == -EEXIST); 6580 6581 if (ret) { 6582 free_extent_map(em); 6583 return ERR_PTR(ret); 6584 } 6585 6586 return em; 6587 } 6588 6589 6590 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 6591 struct buffer_head *bh_result, int create) 6592 { 6593 struct extent_map *em; 6594 struct btrfs_root *root = BTRFS_I(inode)->root; 6595 struct extent_state *cached_state = NULL; 6596 u64 start = iblock << inode->i_blkbits; 6597 u64 lockstart, lockend; 6598 u64 len = bh_result->b_size; 6599 int unlock_bits = EXTENT_LOCKED; 6600 int ret = 0; 6601 6602 if 
(create) 6603 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 6604 else 6605 len = min_t(u64, len, root->sectorsize); 6606 6607 lockstart = start; 6608 lockend = start + len - 1; 6609 6610 /* 6611 * If this errors out it's because we couldn't invalidate pagecache for 6612 * this range and we need to fallback to buffered. 6613 */ 6614 if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) 6615 return -ENOTBLK; 6616 6617 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 6618 if (IS_ERR(em)) { 6619 ret = PTR_ERR(em); 6620 goto unlock_err; 6621 } 6622 6623 /* 6624 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 6625 * io. INLINE is special, and we could probably kludge it in here, but 6626 * it's still buffered so for safety lets just fall back to the generic 6627 * buffered path. 6628 * 6629 * For COMPRESSED we _have_ to read the entire extent in so we can 6630 * decompress it, so there will be buffering required no matter what we 6631 * do, so go ahead and fallback to buffered. 6632 * 6633 * We return -ENOTBLK because thats what makes DIO go ahead and go back 6634 * to buffered IO. Don't blame me, this is the price we pay for using 6635 * the generic code. 6636 */ 6637 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 6638 em->block_start == EXTENT_MAP_INLINE) { 6639 free_extent_map(em); 6640 ret = -ENOTBLK; 6641 goto unlock_err; 6642 } 6643 6644 /* Just a good old fashioned hole, return */ 6645 if (!create && (em->block_start == EXTENT_MAP_HOLE || 6646 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6647 free_extent_map(em); 6648 goto unlock_err; 6649 } 6650 6651 /* 6652 * We don't allocate a new extent in the following cases 6653 * 6654 * 1) The inode is marked as NODATACOW. In this case we'll just use the 6655 * existing extent. 6656 * 2) The extent is marked as PREALLOC. We're good to go here and can 6657 * just use the extent. 
6658 * 6659 */ 6660 if (!create) { 6661 len = min(len, em->len - (start - em->start)); 6662 lockstart = start + len; 6663 goto unlock; 6664 } 6665 6666 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 6667 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 6668 em->block_start != EXTENT_MAP_HOLE)) { 6669 int type; 6670 int ret; 6671 u64 block_start, orig_start, orig_block_len, ram_bytes; 6672 6673 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6674 type = BTRFS_ORDERED_PREALLOC; 6675 else 6676 type = BTRFS_ORDERED_NOCOW; 6677 len = min(len, em->len - (start - em->start)); 6678 block_start = em->block_start + (start - em->start); 6679 6680 if (can_nocow_extent(inode, start, &len, &orig_start, 6681 &orig_block_len, &ram_bytes) == 1) { 6682 if (type == BTRFS_ORDERED_PREALLOC) { 6683 free_extent_map(em); 6684 em = create_pinned_em(inode, start, len, 6685 orig_start, 6686 block_start, len, 6687 orig_block_len, 6688 ram_bytes, type); 6689 if (IS_ERR(em)) 6690 goto unlock_err; 6691 } 6692 6693 ret = btrfs_add_ordered_extent_dio(inode, start, 6694 block_start, len, len, type); 6695 if (ret) { 6696 free_extent_map(em); 6697 goto unlock_err; 6698 } 6699 goto unlock; 6700 } 6701 } 6702 6703 /* 6704 * this will cow the extent, reset the len in case we changed 6705 * it above 6706 */ 6707 len = bh_result->b_size; 6708 free_extent_map(em); 6709 em = btrfs_new_extent_direct(inode, start, len); 6710 if (IS_ERR(em)) { 6711 ret = PTR_ERR(em); 6712 goto unlock_err; 6713 } 6714 len = min(len, em->len - (start - em->start)); 6715 unlock: 6716 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 6717 inode->i_blkbits; 6718 bh_result->b_size = len; 6719 bh_result->b_bdev = em->bdev; 6720 set_buffer_mapped(bh_result); 6721 if (create) { 6722 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 6723 set_buffer_new(bh_result); 6724 6725 /* 6726 * Need to update the i_size under the extent lock so buffered 6727 * readers will get the updated i_size when we unlock. 
6728 */ 6729 if (start + len > i_size_read(inode)) 6730 i_size_write(inode, start + len); 6731 6732 spin_lock(&BTRFS_I(inode)->lock); 6733 BTRFS_I(inode)->outstanding_extents++; 6734 spin_unlock(&BTRFS_I(inode)->lock); 6735 6736 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6737 lockstart + len - 1, EXTENT_DELALLOC, NULL, 6738 &cached_state, GFP_NOFS); 6739 BUG_ON(ret); 6740 } 6741 6742 /* 6743 * In the case of write we need to clear and unlock the entire range, 6744 * in the case of read we need to unlock only the end area that we 6745 * aren't using if there is any left over space. 6746 */ 6747 if (lockstart < lockend) { 6748 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 6749 lockend, unlock_bits, 1, 0, 6750 &cached_state, GFP_NOFS); 6751 } else { 6752 free_extent_state(cached_state); 6753 } 6754 6755 free_extent_map(em); 6756 6757 return 0; 6758 6759 unlock_err: 6760 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 6761 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 6762 return ret; 6763 } 6764 6765 static void btrfs_endio_direct_read(struct bio *bio, int err) 6766 { 6767 struct btrfs_dio_private *dip = bio->bi_private; 6768 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 6769 struct bio_vec *bvec = bio->bi_io_vec; 6770 struct inode *inode = dip->inode; 6771 struct btrfs_root *root = BTRFS_I(inode)->root; 6772 struct bio *dio_bio; 6773 u32 *csums = (u32 *)dip->csum; 6774 int index = 0; 6775 u64 start; 6776 6777 start = dip->logical_offset; 6778 do { 6779 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 6780 struct page *page = bvec->bv_page; 6781 char *kaddr; 6782 u32 csum = ~(u32)0; 6783 unsigned long flags; 6784 6785 local_irq_save(flags); 6786 kaddr = kmap_atomic(page); 6787 csum = btrfs_csum_data(kaddr + bvec->bv_offset, 6788 csum, bvec->bv_len); 6789 btrfs_csum_final(csum, (char *)&csum); 6790 kunmap_atomic(kaddr); 6791 local_irq_restore(flags); 6792 6793 flush_dcache_page(bvec->bv_page); 6794 if (csum != 
csums[index]) { 6795 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 6796 btrfs_ino(inode), start, csum, 6797 csums[index]); 6798 err = -EIO; 6799 } 6800 } 6801 6802 start += bvec->bv_len; 6803 bvec++; 6804 index++; 6805 } while (bvec <= bvec_end); 6806 6807 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 6808 dip->logical_offset + dip->bytes - 1); 6809 dio_bio = dip->dio_bio; 6810 6811 kfree(dip); 6812 6813 /* If we had a csum failure make sure to clear the uptodate flag */ 6814 if (err) 6815 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 6816 dio_end_io(dio_bio, err); 6817 bio_put(bio); 6818 } 6819 6820 static void btrfs_endio_direct_write(struct bio *bio, int err) 6821 { 6822 struct btrfs_dio_private *dip = bio->bi_private; 6823 struct inode *inode = dip->inode; 6824 struct btrfs_root *root = BTRFS_I(inode)->root; 6825 struct btrfs_ordered_extent *ordered = NULL; 6826 u64 ordered_offset = dip->logical_offset; 6827 u64 ordered_bytes = dip->bytes; 6828 struct bio *dio_bio; 6829 int ret; 6830 6831 if (err) 6832 goto out_done; 6833 again: 6834 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 6835 &ordered_offset, 6836 ordered_bytes, !err); 6837 if (!ret) 6838 goto out_test; 6839 6840 ordered->work.func = finish_ordered_fn; 6841 ordered->work.flags = 0; 6842 btrfs_queue_worker(&root->fs_info->endio_write_workers, 6843 &ordered->work); 6844 out_test: 6845 /* 6846 * our bio might span multiple ordered extents. 
If we haven't 6847 * completed the accounting for the whole dio, go back and try again 6848 */ 6849 if (ordered_offset < dip->logical_offset + dip->bytes) { 6850 ordered_bytes = dip->logical_offset + dip->bytes - 6851 ordered_offset; 6852 ordered = NULL; 6853 goto again; 6854 } 6855 out_done: 6856 dio_bio = dip->dio_bio; 6857 6858 kfree(dip); 6859 6860 /* If we had an error make sure to clear the uptodate flag */ 6861 if (err) 6862 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 6863 dio_end_io(dio_bio, err); 6864 bio_put(bio); 6865 } 6866 6867 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 6868 struct bio *bio, int mirror_num, 6869 unsigned long bio_flags, u64 offset) 6870 { 6871 int ret; 6872 struct btrfs_root *root = BTRFS_I(inode)->root; 6873 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 6874 BUG_ON(ret); /* -ENOMEM */ 6875 return 0; 6876 } 6877 6878 static void btrfs_end_dio_bio(struct bio *bio, int err) 6879 { 6880 struct btrfs_dio_private *dip = bio->bi_private; 6881 6882 if (err) { 6883 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 6884 "sector %#Lx len %u err no %d\n", 6885 btrfs_ino(dip->inode), bio->bi_rw, 6886 (unsigned long long)bio->bi_sector, bio->bi_size, err); 6887 dip->errors = 1; 6888 6889 /* 6890 * before atomic variable goto zero, we must make sure 6891 * dip->errors is perceived to be set. 
6892 */ 6893 smp_mb__before_atomic_dec(); 6894 } 6895 6896 /* if there are more bios still pending for this dio, just exit */ 6897 if (!atomic_dec_and_test(&dip->pending_bios)) 6898 goto out; 6899 6900 if (dip->errors) { 6901 bio_io_error(dip->orig_bio); 6902 } else { 6903 set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); 6904 bio_endio(dip->orig_bio, 0); 6905 } 6906 out: 6907 bio_put(bio); 6908 } 6909 6910 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 6911 u64 first_sector, gfp_t gfp_flags) 6912 { 6913 int nr_vecs = bio_get_nr_vecs(bdev); 6914 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 6915 } 6916 6917 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 6918 int rw, u64 file_offset, int skip_sum, 6919 int async_submit) 6920 { 6921 struct btrfs_dio_private *dip = bio->bi_private; 6922 int write = rw & REQ_WRITE; 6923 struct btrfs_root *root = BTRFS_I(inode)->root; 6924 int ret; 6925 6926 if (async_submit) 6927 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); 6928 6929 bio_get(bio); 6930 6931 if (!write) { 6932 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6933 if (ret) 6934 goto err; 6935 } 6936 6937 if (skip_sum) 6938 goto map; 6939 6940 if (write && async_submit) { 6941 ret = btrfs_wq_submit_bio(root->fs_info, 6942 inode, rw, bio, 0, 0, 6943 file_offset, 6944 __btrfs_submit_bio_start_direct_io, 6945 __btrfs_submit_bio_done); 6946 goto err; 6947 } else if (write) { 6948 /* 6949 * If we aren't doing async submit, calculate the csum of the 6950 * bio now. 
6951 */ 6952 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 6953 if (ret) 6954 goto err; 6955 } else if (!skip_sum) { 6956 ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, 6957 file_offset); 6958 if (ret) 6959 goto err; 6960 } 6961 6962 map: 6963 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 6964 err: 6965 bio_put(bio); 6966 return ret; 6967 } 6968 6969 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, 6970 int skip_sum) 6971 { 6972 struct inode *inode = dip->inode; 6973 struct btrfs_root *root = BTRFS_I(inode)->root; 6974 struct bio *bio; 6975 struct bio *orig_bio = dip->orig_bio; 6976 struct bio_vec *bvec = orig_bio->bi_io_vec; 6977 u64 start_sector = orig_bio->bi_sector; 6978 u64 file_offset = dip->logical_offset; 6979 u64 submit_len = 0; 6980 u64 map_length; 6981 int nr_pages = 0; 6982 int ret = 0; 6983 int async_submit = 0; 6984 6985 map_length = orig_bio->bi_size; 6986 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 6987 &map_length, NULL, 0); 6988 if (ret) { 6989 bio_put(orig_bio); 6990 return -EIO; 6991 } 6992 6993 if (map_length >= orig_bio->bi_size) { 6994 bio = orig_bio; 6995 goto submit; 6996 } 6997 6998 /* async crcs make it difficult to collect full stripe writes. 
*/ 6999 if (btrfs_get_alloc_profile(root, 1) & 7000 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) 7001 async_submit = 0; 7002 else 7003 async_submit = 1; 7004 7005 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7006 if (!bio) 7007 return -ENOMEM; 7008 bio->bi_private = dip; 7009 bio->bi_end_io = btrfs_end_dio_bio; 7010 atomic_inc(&dip->pending_bios); 7011 7012 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 7013 if (unlikely(map_length < submit_len + bvec->bv_len || 7014 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 7015 bvec->bv_offset) < bvec->bv_len)) { 7016 /* 7017 * inc the count before we submit the bio so 7018 * we know the end IO handler won't happen before 7019 * we inc the count. Otherwise, the dip might get freed 7020 * before we're done setting it up 7021 */ 7022 atomic_inc(&dip->pending_bios); 7023 ret = __btrfs_submit_dio_bio(bio, inode, rw, 7024 file_offset, skip_sum, 7025 async_submit); 7026 if (ret) { 7027 bio_put(bio); 7028 atomic_dec(&dip->pending_bios); 7029 goto out_err; 7030 } 7031 7032 start_sector += submit_len >> 9; 7033 file_offset += submit_len; 7034 7035 submit_len = 0; 7036 nr_pages = 0; 7037 7038 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 7039 start_sector, GFP_NOFS); 7040 if (!bio) 7041 goto out_err; 7042 bio->bi_private = dip; 7043 bio->bi_end_io = btrfs_end_dio_bio; 7044 7045 map_length = orig_bio->bi_size; 7046 ret = btrfs_map_block(root->fs_info, rw, 7047 start_sector << 9, 7048 &map_length, NULL, 0); 7049 if (ret) { 7050 bio_put(bio); 7051 goto out_err; 7052 } 7053 } else { 7054 submit_len += bvec->bv_len; 7055 nr_pages ++; 7056 bvec++; 7057 } 7058 } 7059 7060 submit: 7061 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 7062 async_submit); 7063 if (!ret) 7064 return 0; 7065 7066 bio_put(bio); 7067 out_err: 7068 dip->errors = 1; 7069 /* 7070 * before atomic variable goto zero, we must 7071 * make sure dip->errors is perceived to be set. 
7072 */ 7073 smp_mb__before_atomic_dec(); 7074 if (atomic_dec_and_test(&dip->pending_bios)) 7075 bio_io_error(dip->orig_bio); 7076 7077 /* bio_end_io() will handle error, so we needn't return it */ 7078 return 0; 7079 } 7080 7081 static void btrfs_submit_direct(int rw, struct bio *dio_bio, 7082 struct inode *inode, loff_t file_offset) 7083 { 7084 struct btrfs_root *root = BTRFS_I(inode)->root; 7085 struct btrfs_dio_private *dip; 7086 struct bio *io_bio; 7087 int skip_sum; 7088 int sum_len; 7089 int write = rw & REQ_WRITE; 7090 int ret = 0; 7091 u16 csum_size; 7092 7093 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7094 7095 io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); 7096 if (!io_bio) { 7097 ret = -ENOMEM; 7098 goto free_ordered; 7099 } 7100 7101 if (!skip_sum && !write) { 7102 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 7103 sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; 7104 sum_len *= csum_size; 7105 } else { 7106 sum_len = 0; 7107 } 7108 7109 dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); 7110 if (!dip) { 7111 ret = -ENOMEM; 7112 goto free_io_bio; 7113 } 7114 7115 dip->private = dio_bio->bi_private; 7116 dip->inode = inode; 7117 dip->logical_offset = file_offset; 7118 dip->bytes = dio_bio->bi_size; 7119 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7120 io_bio->bi_private = dip; 7121 dip->errors = 0; 7122 dip->orig_bio = io_bio; 7123 dip->dio_bio = dio_bio; 7124 atomic_set(&dip->pending_bios, 0); 7125 7126 if (write) 7127 io_bio->bi_end_io = btrfs_endio_direct_write; 7128 else 7129 io_bio->bi_end_io = btrfs_endio_direct_read; 7130 7131 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 7132 if (!ret) 7133 return; 7134 7135 free_io_bio: 7136 bio_put(io_bio); 7137 7138 free_ordered: 7139 /* 7140 * If this is a write, we need to clean up the reserved space and kill 7141 * the ordered extent. 
7142 */ 7143 if (write) { 7144 struct btrfs_ordered_extent *ordered; 7145 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 7146 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 7147 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 7148 btrfs_free_reserved_extent(root, ordered->start, 7149 ordered->disk_len); 7150 btrfs_put_ordered_extent(ordered); 7151 btrfs_put_ordered_extent(ordered); 7152 } 7153 bio_endio(dio_bio, ret); 7154 } 7155 7156 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, 7157 const struct iovec *iov, loff_t offset, 7158 unsigned long nr_segs) 7159 { 7160 int seg; 7161 int i; 7162 size_t size; 7163 unsigned long addr; 7164 unsigned blocksize_mask = root->sectorsize - 1; 7165 ssize_t retval = -EINVAL; 7166 loff_t end = offset; 7167 7168 if (offset & blocksize_mask) 7169 goto out; 7170 7171 /* Check the memory alignment. Blocks cannot straddle pages */ 7172 for (seg = 0; seg < nr_segs; seg++) { 7173 addr = (unsigned long)iov[seg].iov_base; 7174 size = iov[seg].iov_len; 7175 end += size; 7176 if ((addr & blocksize_mask) || (size & blocksize_mask)) 7177 goto out; 7178 7179 /* If this is a write we don't need to check anymore */ 7180 if (rw & WRITE) 7181 continue; 7182 7183 /* 7184 * Check to make sure we don't have duplicate iov_base's in this 7185 * iovec, if so return EINVAL, otherwise we'll get csum errors 7186 * when reading back. 
7187 */ 7188 for (i = seg + 1; i < nr_segs; i++) { 7189 if (iov[seg].iov_base == iov[i].iov_base) 7190 goto out; 7191 } 7192 } 7193 retval = 0; 7194 out: 7195 return retval; 7196 } 7197 7198 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 7199 const struct iovec *iov, loff_t offset, 7200 unsigned long nr_segs) 7201 { 7202 struct file *file = iocb->ki_filp; 7203 struct inode *inode = file->f_mapping->host; 7204 size_t count = 0; 7205 int flags = 0; 7206 bool wakeup = true; 7207 bool relock = false; 7208 ssize_t ret; 7209 7210 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 7211 offset, nr_segs)) 7212 return 0; 7213 7214 atomic_inc(&inode->i_dio_count); 7215 smp_mb__after_atomic_inc(); 7216 7217 /* 7218 * The generic stuff only does filemap_write_and_wait_range, which isn't 7219 * enough if we've written compressed pages to this area, so we need to 7220 * call btrfs_wait_ordered_range to make absolutely sure that any 7221 * outstanding dirty pages are on disk. 7222 */ 7223 count = iov_length(iov, nr_segs); 7224 btrfs_wait_ordered_range(inode, offset, count); 7225 7226 if (rw & WRITE) { 7227 /* 7228 * If the write DIO is beyond the EOF, we need update 7229 * the isize, but it is protected by i_mutex. So we can 7230 * not unlock the i_mutex at this case. 
7231 */ 7232 if (offset + count <= inode->i_size) { 7233 mutex_unlock(&inode->i_mutex); 7234 relock = true; 7235 } 7236 ret = btrfs_delalloc_reserve_space(inode, count); 7237 if (ret) 7238 goto out; 7239 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 7240 &BTRFS_I(inode)->runtime_flags))) { 7241 inode_dio_done(inode); 7242 flags = DIO_LOCKING | DIO_SKIP_HOLES; 7243 wakeup = false; 7244 } 7245 7246 ret = __blockdev_direct_IO(rw, iocb, inode, 7247 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 7248 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 7249 btrfs_submit_direct, flags); 7250 if (rw & WRITE) { 7251 if (ret < 0 && ret != -EIOCBQUEUED) 7252 btrfs_delalloc_release_space(inode, count); 7253 else if (ret >= 0 && (size_t)ret < count) 7254 btrfs_delalloc_release_space(inode, 7255 count - (size_t)ret); 7256 else 7257 btrfs_delalloc_release_metadata(inode, 0); 7258 } 7259 out: 7260 if (wakeup) 7261 inode_dio_done(inode); 7262 if (relock) 7263 mutex_lock(&inode->i_mutex); 7264 7265 return ret; 7266 } 7267 7268 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 7269 7270 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 7271 __u64 start, __u64 len) 7272 { 7273 int ret; 7274 7275 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); 7276 if (ret) 7277 return ret; 7278 7279 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 7280 } 7281 7282 int btrfs_readpage(struct file *file, struct page *page) 7283 { 7284 struct extent_io_tree *tree; 7285 tree = &BTRFS_I(page->mapping->host)->io_tree; 7286 return extent_read_full_page(tree, page, btrfs_get_extent, 0); 7287 } 7288 7289 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 7290 { 7291 struct extent_io_tree *tree; 7292 7293 7294 if (current->flags & PF_MEMALLOC) { 7295 redirty_page_for_writepage(wbc, page); 7296 unlock_page(page); 7297 return 0; 7298 } 7299 tree = &BTRFS_I(page->mapping->host)->io_tree; 7300 return 
extent_write_full_page(tree, page, btrfs_get_extent, wbc); 7301 } 7302 7303 static int btrfs_writepages(struct address_space *mapping, 7304 struct writeback_control *wbc) 7305 { 7306 struct extent_io_tree *tree; 7307 7308 tree = &BTRFS_I(mapping->host)->io_tree; 7309 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 7310 } 7311 7312 static int 7313 btrfs_readpages(struct file *file, struct address_space *mapping, 7314 struct list_head *pages, unsigned nr_pages) 7315 { 7316 struct extent_io_tree *tree; 7317 tree = &BTRFS_I(mapping->host)->io_tree; 7318 return extent_readpages(tree, mapping, pages, nr_pages, 7319 btrfs_get_extent); 7320 } 7321 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 7322 { 7323 struct extent_io_tree *tree; 7324 struct extent_map_tree *map; 7325 int ret; 7326 7327 tree = &BTRFS_I(page->mapping->host)->io_tree; 7328 map = &BTRFS_I(page->mapping->host)->extent_tree; 7329 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 7330 if (ret == 1) { 7331 ClearPagePrivate(page); 7332 set_page_private(page, 0); 7333 page_cache_release(page); 7334 } 7335 return ret; 7336 } 7337 7338 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 7339 { 7340 if (PageWriteback(page) || PageDirty(page)) 7341 return 0; 7342 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 7343 } 7344 7345 static void btrfs_invalidatepage(struct page *page, unsigned int offset, 7346 unsigned int length) 7347 { 7348 struct inode *inode = page->mapping->host; 7349 struct extent_io_tree *tree; 7350 struct btrfs_ordered_extent *ordered; 7351 struct extent_state *cached_state = NULL; 7352 u64 page_start = page_offset(page); 7353 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 7354 7355 /* 7356 * we have the page locked, so new writeback can't start, 7357 * and the dirty bit won't be cleared while we are here. 
7358 * 7359 * Wait for IO on this page so that we can safely clear 7360 * the PagePrivate2 bit and do ordered accounting 7361 */ 7362 wait_on_page_writeback(page); 7363 7364 tree = &BTRFS_I(inode)->io_tree; 7365 if (offset) { 7366 btrfs_releasepage(page, GFP_NOFS); 7367 return; 7368 } 7369 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7370 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); 7371 if (ordered) { 7372 /* 7373 * IO on this page will never be started, so we need 7374 * to account for any ordered extents now 7375 */ 7376 clear_extent_bit(tree, page_start, page_end, 7377 EXTENT_DIRTY | EXTENT_DELALLOC | 7378 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7379 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); 7380 /* 7381 * whoever cleared the private bit is responsible 7382 * for the finish_ordered_io 7383 */ 7384 if (TestClearPagePrivate2(page)) { 7385 struct btrfs_ordered_inode_tree *tree; 7386 u64 new_len; 7387 7388 tree = &BTRFS_I(inode)->ordered_tree; 7389 7390 spin_lock_irq(&tree->lock); 7391 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 7392 new_len = page_start - ordered->file_offset; 7393 if (new_len < ordered->truncated_len) 7394 ordered->truncated_len = new_len; 7395 spin_unlock_irq(&tree->lock); 7396 7397 if (btrfs_dec_test_ordered_pending(inode, &ordered, 7398 page_start, 7399 PAGE_CACHE_SIZE, 1)) 7400 btrfs_finish_ordered_io(ordered); 7401 } 7402 btrfs_put_ordered_extent(ordered); 7403 cached_state = NULL; 7404 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7405 } 7406 clear_extent_bit(tree, page_start, page_end, 7407 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 7408 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, 7409 &cached_state, GFP_NOFS); 7410 __btrfs_releasepage(page, GFP_NOFS); 7411 7412 ClearPageChecked(page); 7413 if (PagePrivate(page)) { 7414 ClearPagePrivate(page); 7415 set_page_private(page, 0); 7416 page_cache_release(page); 7417 } 7418 } 7419 7420 /* 7421 * btrfs_page_mkwrite() is not 
allowed to change the file size as it gets 7422 * called from a page fault handler when a page is first dirtied. Hence we must 7423 * be careful to check for EOF conditions here. We set the page up correctly 7424 * for a written page which means we get ENOSPC checking when writing into 7425 * holes and correct delalloc and unwritten extent mapping on filesystems that 7426 * support these features. 7427 * 7428 * We are not allowed to take the i_mutex here so we have to play games to 7429 * protect against truncate races as the page could now be beyond EOF. Because 7430 * vmtruncate() writes the inode size before removing pages, once we have the 7431 * page lock we can determine safely if the page is beyond EOF. If it is not 7432 * beyond EOF, then the page is guaranteed safe against truncation until we 7433 * unlock the page. 7434 */ 7435 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 7436 { 7437 struct page *page = vmf->page; 7438 struct inode *inode = file_inode(vma->vm_file); 7439 struct btrfs_root *root = BTRFS_I(inode)->root; 7440 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7441 struct btrfs_ordered_extent *ordered; 7442 struct extent_state *cached_state = NULL; 7443 char *kaddr; 7444 unsigned long zero_start; 7445 loff_t size; 7446 int ret; 7447 int reserved = 0; 7448 u64 page_start; 7449 u64 page_end; 7450 7451 sb_start_pagefault(inode->i_sb); 7452 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 7453 if (!ret) { 7454 ret = file_update_time(vma->vm_file); 7455 reserved = 1; 7456 } 7457 if (ret) { 7458 if (ret == -ENOMEM) 7459 ret = VM_FAULT_OOM; 7460 else /* -ENOSPC, -EIO, etc */ 7461 ret = VM_FAULT_SIGBUS; 7462 if (reserved) 7463 goto out; 7464 goto out_noreserve; 7465 } 7466 7467 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 7468 again: 7469 lock_page(page); 7470 size = i_size_read(inode); 7471 page_start = page_offset(page); 7472 page_end = page_start + PAGE_CACHE_SIZE - 1; 7473 7474 if 
((page->mapping != inode->i_mapping) || 7475 (page_start >= size)) { 7476 /* page got truncated out from underneath us */ 7477 goto out_unlock; 7478 } 7479 wait_on_page_writeback(page); 7480 7481 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); 7482 set_page_extent_mapped(page); 7483 7484 /* 7485 * we can't set the delalloc bits if there are pending ordered 7486 * extents. Drop our locks and wait for them to finish 7487 */ 7488 ordered = btrfs_lookup_ordered_extent(inode, page_start); 7489 if (ordered) { 7490 unlock_extent_cached(io_tree, page_start, page_end, 7491 &cached_state, GFP_NOFS); 7492 unlock_page(page); 7493 btrfs_start_ordered_extent(inode, ordered, 1); 7494 btrfs_put_ordered_extent(ordered); 7495 goto again; 7496 } 7497 7498 /* 7499 * XXX - page_mkwrite gets called every time the page is dirtied, even 7500 * if it was already dirty, so for space accounting reasons we need to 7501 * clear any delalloc bits for the range we are fixing to save. There 7502 * is probably a better way to do this, but for now keep consistent with 7503 * prepare_pages in the normal write path. 
7504 */ 7505 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 7506 EXTENT_DIRTY | EXTENT_DELALLOC | 7507 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 7508 0, 0, &cached_state, GFP_NOFS); 7509 7510 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 7511 &cached_state); 7512 if (ret) { 7513 unlock_extent_cached(io_tree, page_start, page_end, 7514 &cached_state, GFP_NOFS); 7515 ret = VM_FAULT_SIGBUS; 7516 goto out_unlock; 7517 } 7518 ret = 0; 7519 7520 /* page is wholly or partially inside EOF */ 7521 if (page_start + PAGE_CACHE_SIZE > size) 7522 zero_start = size & ~PAGE_CACHE_MASK; 7523 else 7524 zero_start = PAGE_CACHE_SIZE; 7525 7526 if (zero_start != PAGE_CACHE_SIZE) { 7527 kaddr = kmap(page); 7528 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 7529 flush_dcache_page(page); 7530 kunmap(page); 7531 } 7532 ClearPageChecked(page); 7533 set_page_dirty(page); 7534 SetPageUptodate(page); 7535 7536 BTRFS_I(inode)->last_trans = root->fs_info->generation; 7537 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 7538 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; 7539 7540 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 7541 7542 out_unlock: 7543 if (!ret) { 7544 sb_end_pagefault(inode->i_sb); 7545 return VM_FAULT_LOCKED; 7546 } 7547 unlock_page(page); 7548 out: 7549 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 7550 out_noreserve: 7551 sb_end_pagefault(inode->i_sb); 7552 return ret; 7553 } 7554 7555 static int btrfs_truncate(struct inode *inode) 7556 { 7557 struct btrfs_root *root = BTRFS_I(inode)->root; 7558 struct btrfs_block_rsv *rsv; 7559 int ret = 0; 7560 int err = 0; 7561 struct btrfs_trans_handle *trans; 7562 u64 mask = root->sectorsize - 1; 7563 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 7564 7565 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 7566 7567 /* 7568 * Yes ladies and gentelment, this is indeed ugly. 
The fact is we have
	 * 3 things going on here
	 *
	 * 1) We need to reserve space for our orphan item and the space to
	 * delete our orphan item.  Lord knows we don't want to have a dangling
	 * orphan item because we didn't reserve space to remove it.
	 *
	 * 2) We need to reserve space to update our inode.
	 *
	 * 3) We need to have something to cache all the space that is going to
	 * be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to all be separate.  The fact is we can use a lot of
	 * space doing the truncate, and we have no earthly idea how much space
	 * we will use, so we need the truncate reservation to be separate so it
	 * doesn't end up using space reserved for updating the inode or
	 * removing the orphan item.  We also need to be able to stop the
	 * transaction and start a new one, which means we need to be able to
	 * update the inode several times, and we have no way of knowing how
	 * many times that will be, so we can't just reserve 1 item for the
	 * entirety of the operation, so that has to be done separately as well.
	 * Then there is the orphan item, which does indeed need to be held on
	 * to for the whole operation, and we need nobody to touch this reserved
	 * space except the orphan code.
	 *
	 * So that leaves us with
	 *
	 * 1) root->orphan_block_rsv - for the orphan deletion.
	 * 2) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
7602 */ 7603 rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); 7604 if (!rsv) 7605 return -ENOMEM; 7606 rsv->size = min_size; 7607 rsv->failfast = 1; 7608 7609 /* 7610 * 1 for the truncate slack space 7611 * 1 for updating the inode. 7612 */ 7613 trans = btrfs_start_transaction(root, 2); 7614 if (IS_ERR(trans)) { 7615 err = PTR_ERR(trans); 7616 goto out; 7617 } 7618 7619 /* Migrate the slack space for the truncate to our reserve */ 7620 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, 7621 min_size); 7622 BUG_ON(ret); 7623 7624 /* 7625 * setattr is responsible for setting the ordered_data_close flag, 7626 * but that is only tested during the last file release. That 7627 * could happen well after the next commit, leaving a great big 7628 * window where new writes may get lost if someone chooses to write 7629 * to this file after truncating to zero 7630 * 7631 * The inode doesn't have any dirty data here, and so if we commit 7632 * this is a noop. If someone immediately starts writing to the inode 7633 * it is very likely we'll catch some of their writes in this 7634 * transaction, and the commit will find this file on the ordered 7635 * data list with good things to send down. 7636 * 7637 * This is a best effort solution, there is still a window where 7638 * using truncate to replace the contents of the file will 7639 * end up with a zero length file after a crash. 7640 */ 7641 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 7642 &BTRFS_I(inode)->runtime_flags)) 7643 btrfs_add_ordered_operation(trans, root, inode); 7644 7645 /* 7646 * So if we truncate and then write and fsync we normally would just 7647 * write the extents that changed, which is a problem if we need to 7648 * first truncate that entire inode. So set this flag so we write out 7649 * all of the extents in the inode to the sync log so we're completely 7650 * safe. 
7651 */ 7652 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 7653 trans->block_rsv = rsv; 7654 7655 while (1) { 7656 ret = btrfs_truncate_inode_items(trans, root, inode, 7657 inode->i_size, 7658 BTRFS_EXTENT_DATA_KEY); 7659 if (ret != -ENOSPC) { 7660 err = ret; 7661 break; 7662 } 7663 7664 trans->block_rsv = &root->fs_info->trans_block_rsv; 7665 ret = btrfs_update_inode(trans, root, inode); 7666 if (ret) { 7667 err = ret; 7668 break; 7669 } 7670 7671 btrfs_end_transaction(trans, root); 7672 btrfs_btree_balance_dirty(root); 7673 7674 trans = btrfs_start_transaction(root, 2); 7675 if (IS_ERR(trans)) { 7676 ret = err = PTR_ERR(trans); 7677 trans = NULL; 7678 break; 7679 } 7680 7681 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, 7682 rsv, min_size); 7683 BUG_ON(ret); /* shouldn't happen */ 7684 trans->block_rsv = rsv; 7685 } 7686 7687 if (ret == 0 && inode->i_nlink > 0) { 7688 trans->block_rsv = root->orphan_block_rsv; 7689 ret = btrfs_orphan_del(trans, inode); 7690 if (ret) 7691 err = ret; 7692 } 7693 7694 if (trans) { 7695 trans->block_rsv = &root->fs_info->trans_block_rsv; 7696 ret = btrfs_update_inode(trans, root, inode); 7697 if (ret && !err) 7698 err = ret; 7699 7700 ret = btrfs_end_transaction(trans, root); 7701 btrfs_btree_balance_dirty(root); 7702 } 7703 7704 out: 7705 btrfs_free_block_rsv(root, rsv); 7706 7707 if (ret && !err) 7708 err = ret; 7709 7710 return err; 7711 } 7712 7713 /* 7714 * create a new subvolume directory/inode (helper for the ioctl). 
7715 */ 7716 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 7717 struct btrfs_root *new_root, u64 new_dirid) 7718 { 7719 struct inode *inode; 7720 int err; 7721 u64 index = 0; 7722 7723 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, 7724 new_dirid, new_dirid, 7725 S_IFDIR | (~current_umask() & S_IRWXUGO), 7726 &index); 7727 if (IS_ERR(inode)) 7728 return PTR_ERR(inode); 7729 inode->i_op = &btrfs_dir_inode_operations; 7730 inode->i_fop = &btrfs_dir_file_operations; 7731 7732 set_nlink(inode, 1); 7733 btrfs_i_size_write(inode, 0); 7734 7735 err = btrfs_update_inode(trans, new_root, inode); 7736 7737 iput(inode); 7738 return err; 7739 } 7740 7741 struct inode *btrfs_alloc_inode(struct super_block *sb) 7742 { 7743 struct btrfs_inode *ei; 7744 struct inode *inode; 7745 7746 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 7747 if (!ei) 7748 return NULL; 7749 7750 ei->root = NULL; 7751 ei->generation = 0; 7752 ei->last_trans = 0; 7753 ei->last_sub_trans = 0; 7754 ei->logged_trans = 0; 7755 ei->delalloc_bytes = 0; 7756 ei->disk_i_size = 0; 7757 ei->flags = 0; 7758 ei->csum_bytes = 0; 7759 ei->index_cnt = (u64)-1; 7760 ei->last_unlink_trans = 0; 7761 ei->last_log_commit = 0; 7762 7763 spin_lock_init(&ei->lock); 7764 ei->outstanding_extents = 0; 7765 ei->reserved_extents = 0; 7766 7767 ei->runtime_flags = 0; 7768 ei->force_compress = BTRFS_COMPRESS_NONE; 7769 7770 ei->delayed_node = NULL; 7771 7772 inode = &ei->vfs_inode; 7773 extent_map_tree_init(&ei->extent_tree); 7774 extent_io_tree_init(&ei->io_tree, &inode->i_data); 7775 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7776 ei->io_tree.track_uptodate = 1; 7777 ei->io_failure_tree.track_uptodate = 1; 7778 atomic_set(&ei->sync_writers, 0); 7779 mutex_init(&ei->log_mutex); 7780 mutex_init(&ei->delalloc_mutex); 7781 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7782 INIT_LIST_HEAD(&ei->delalloc_inodes); 7783 INIT_LIST_HEAD(&ei->ordered_operations); 7784 
RB_CLEAR_NODE(&ei->rb_node); 7785 7786 return inode; 7787 } 7788 7789 static void btrfs_i_callback(struct rcu_head *head) 7790 { 7791 struct inode *inode = container_of(head, struct inode, i_rcu); 7792 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 7793 } 7794 7795 void btrfs_destroy_inode(struct inode *inode) 7796 { 7797 struct btrfs_ordered_extent *ordered; 7798 struct btrfs_root *root = BTRFS_I(inode)->root; 7799 7800 WARN_ON(!hlist_empty(&inode->i_dentry)); 7801 WARN_ON(inode->i_data.nrpages); 7802 WARN_ON(BTRFS_I(inode)->outstanding_extents); 7803 WARN_ON(BTRFS_I(inode)->reserved_extents); 7804 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 7805 WARN_ON(BTRFS_I(inode)->csum_bytes); 7806 7807 /* 7808 * This can happen where we create an inode, but somebody else also 7809 * created the same inode and we need to destroy the one we already 7810 * created. 7811 */ 7812 if (!root) 7813 goto free; 7814 7815 /* 7816 * Make sure we're properly removed from the ordered operation 7817 * lists. 
7818 */ 7819 smp_mb(); 7820 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7821 spin_lock(&root->fs_info->ordered_root_lock); 7822 list_del_init(&BTRFS_I(inode)->ordered_operations); 7823 spin_unlock(&root->fs_info->ordered_root_lock); 7824 } 7825 7826 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7827 &BTRFS_I(inode)->runtime_flags)) { 7828 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 7829 btrfs_ino(inode)); 7830 atomic_dec(&root->orphan_inodes); 7831 } 7832 7833 while (1) { 7834 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 7835 if (!ordered) 7836 break; 7837 else { 7838 btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", 7839 ordered->file_offset, ordered->len); 7840 btrfs_remove_ordered_extent(inode, ordered); 7841 btrfs_put_ordered_extent(ordered); 7842 btrfs_put_ordered_extent(ordered); 7843 } 7844 } 7845 inode_tree_del(inode); 7846 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 7847 free: 7848 call_rcu(&inode->i_rcu, btrfs_i_callback); 7849 } 7850 7851 int btrfs_drop_inode(struct inode *inode) 7852 { 7853 struct btrfs_root *root = BTRFS_I(inode)->root; 7854 7855 if (root == NULL) 7856 return 1; 7857 7858 /* the snap/subvol tree is on deleting */ 7859 if (btrfs_root_refs(&root->root_item) == 0 && 7860 root != root->fs_info->tree_root) 7861 return 1; 7862 else 7863 return generic_drop_inode(inode); 7864 } 7865 7866 static void init_once(void *foo) 7867 { 7868 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 7869 7870 inode_init_once(&ei->vfs_inode); 7871 } 7872 7873 void btrfs_destroy_cachep(void) 7874 { 7875 /* 7876 * Make sure all delayed rcu free inodes are flushed before we 7877 * destroy cache. 
7878 */ 7879 rcu_barrier(); 7880 if (btrfs_inode_cachep) 7881 kmem_cache_destroy(btrfs_inode_cachep); 7882 if (btrfs_trans_handle_cachep) 7883 kmem_cache_destroy(btrfs_trans_handle_cachep); 7884 if (btrfs_transaction_cachep) 7885 kmem_cache_destroy(btrfs_transaction_cachep); 7886 if (btrfs_path_cachep) 7887 kmem_cache_destroy(btrfs_path_cachep); 7888 if (btrfs_free_space_cachep) 7889 kmem_cache_destroy(btrfs_free_space_cachep); 7890 if (btrfs_delalloc_work_cachep) 7891 kmem_cache_destroy(btrfs_delalloc_work_cachep); 7892 } 7893 7894 int btrfs_init_cachep(void) 7895 { 7896 btrfs_inode_cachep = kmem_cache_create("btrfs_inode", 7897 sizeof(struct btrfs_inode), 0, 7898 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 7899 if (!btrfs_inode_cachep) 7900 goto fail; 7901 7902 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", 7903 sizeof(struct btrfs_trans_handle), 0, 7904 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7905 if (!btrfs_trans_handle_cachep) 7906 goto fail; 7907 7908 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", 7909 sizeof(struct btrfs_transaction), 0, 7910 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7911 if (!btrfs_transaction_cachep) 7912 goto fail; 7913 7914 btrfs_path_cachep = kmem_cache_create("btrfs_path", 7915 sizeof(struct btrfs_path), 0, 7916 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7917 if (!btrfs_path_cachep) 7918 goto fail; 7919 7920 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 7921 sizeof(struct btrfs_free_space), 0, 7922 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 7923 if (!btrfs_free_space_cachep) 7924 goto fail; 7925 7926 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", 7927 sizeof(struct btrfs_delalloc_work), 0, 7928 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, 7929 NULL); 7930 if (!btrfs_delalloc_work_cachep) 7931 goto fail; 7932 7933 return 0; 7934 fail: 7935 btrfs_destroy_cachep(); 7936 return -ENOMEM; 7937 } 7938 7939 static int btrfs_getattr(struct 
vfsmount *mnt, 7940 struct dentry *dentry, struct kstat *stat) 7941 { 7942 u64 delalloc_bytes; 7943 struct inode *inode = dentry->d_inode; 7944 u32 blocksize = inode->i_sb->s_blocksize; 7945 7946 generic_fillattr(inode, stat); 7947 stat->dev = BTRFS_I(inode)->root->anon_dev; 7948 stat->blksize = PAGE_CACHE_SIZE; 7949 7950 spin_lock(&BTRFS_I(inode)->lock); 7951 delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; 7952 spin_unlock(&BTRFS_I(inode)->lock); 7953 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + 7954 ALIGN(delalloc_bytes, blocksize)) >> 9; 7955 return 0; 7956 } 7957 7958 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 7959 struct inode *new_dir, struct dentry *new_dentry) 7960 { 7961 struct btrfs_trans_handle *trans; 7962 struct btrfs_root *root = BTRFS_I(old_dir)->root; 7963 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 7964 struct inode *new_inode = new_dentry->d_inode; 7965 struct inode *old_inode = old_dentry->d_inode; 7966 struct timespec ctime = CURRENT_TIME; 7967 u64 index = 0; 7968 u64 root_objectid; 7969 int ret; 7970 u64 old_ino = btrfs_ino(old_inode); 7971 7972 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 7973 return -EPERM; 7974 7975 /* we only allow rename subvolume link between subvolumes */ 7976 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 7977 return -EXDEV; 7978 7979 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 7980 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) 7981 return -ENOTEMPTY; 7982 7983 if (S_ISDIR(old_inode->i_mode) && new_inode && 7984 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7985 return -ENOTEMPTY; 7986 7987 7988 /* check for collisions, even if the name isn't there */ 7989 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, 7990 new_dentry->d_name.name, 7991 new_dentry->d_name.len); 7992 7993 if (ret) { 7994 if (ret == -EEXIST) { 7995 /* we shouldn't get 7996 * eexist without a new_inode */ 7997 if (!new_inode) { 7998 WARN_ON(1); 
7999 return ret; 8000 } 8001 } else { 8002 /* maybe -EOVERFLOW */ 8003 return ret; 8004 } 8005 } 8006 ret = 0; 8007 8008 /* 8009 * we're using rename to replace one file with another. 8010 * and the replacement file is large. Start IO on it now so 8011 * we don't add too much work to the end of the transaction 8012 */ 8013 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8014 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 8015 filemap_flush(old_inode->i_mapping); 8016 8017 /* close the racy window with snapshot create/destroy ioctl */ 8018 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8019 down_read(&root->fs_info->subvol_sem); 8020 /* 8021 * We want to reserve the absolute worst case amount of items. So if 8022 * both inodes are subvols and we need to unlink them then that would 8023 * require 4 item modifications, but if they are both normal inodes it 8024 * would require 5 item modifications, so we'll assume their normal 8025 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 8026 * should cover the worst case number of items we'll modify. 8027 */ 8028 trans = btrfs_start_transaction(root, 11); 8029 if (IS_ERR(trans)) { 8030 ret = PTR_ERR(trans); 8031 goto out_notrans; 8032 } 8033 8034 if (dest != root) 8035 btrfs_record_root_in_trans(trans, dest); 8036 8037 ret = btrfs_set_inode_index(new_dir, &index); 8038 if (ret) 8039 goto out_fail; 8040 8041 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8042 /* force full log commit if subvolume involved. */ 8043 root->fs_info->last_trans_log_full_commit = trans->transid; 8044 } else { 8045 ret = btrfs_insert_inode_ref(trans, dest, 8046 new_dentry->d_name.name, 8047 new_dentry->d_name.len, 8048 old_ino, 8049 btrfs_ino(new_dir), index); 8050 if (ret) 8051 goto out_fail; 8052 /* 8053 * this is an ugly little race, but the rename is required 8054 * to make sure that if we crash, the inode is either at the 8055 * old name or the new one. 
pinning the log transaction lets 8056 * us make sure we don't allow a log commit to come in after 8057 * we unlink the name but before we add the new name back in. 8058 */ 8059 btrfs_pin_log_trans(root); 8060 } 8061 /* 8062 * make sure the inode gets flushed if it is replacing 8063 * something. 8064 */ 8065 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 8066 btrfs_add_ordered_operation(trans, root, old_inode); 8067 8068 inode_inc_iversion(old_dir); 8069 inode_inc_iversion(new_dir); 8070 inode_inc_iversion(old_inode); 8071 old_dir->i_ctime = old_dir->i_mtime = ctime; 8072 new_dir->i_ctime = new_dir->i_mtime = ctime; 8073 old_inode->i_ctime = ctime; 8074 8075 if (old_dentry->d_parent != new_dentry->d_parent) 8076 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 8077 8078 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8079 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 8080 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 8081 old_dentry->d_name.name, 8082 old_dentry->d_name.len); 8083 } else { 8084 ret = __btrfs_unlink_inode(trans, root, old_dir, 8085 old_dentry->d_inode, 8086 old_dentry->d_name.name, 8087 old_dentry->d_name.len); 8088 if (!ret) 8089 ret = btrfs_update_inode(trans, root, old_inode); 8090 } 8091 if (ret) { 8092 btrfs_abort_transaction(trans, root, ret); 8093 goto out_fail; 8094 } 8095 8096 if (new_inode) { 8097 inode_inc_iversion(new_inode); 8098 new_inode->i_ctime = CURRENT_TIME; 8099 if (unlikely(btrfs_ino(new_inode) == 8100 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 8101 root_objectid = BTRFS_I(new_inode)->location.objectid; 8102 ret = btrfs_unlink_subvol(trans, dest, new_dir, 8103 root_objectid, 8104 new_dentry->d_name.name, 8105 new_dentry->d_name.len); 8106 BUG_ON(new_inode->i_nlink == 0); 8107 } else { 8108 ret = btrfs_unlink_inode(trans, dest, new_dir, 8109 new_dentry->d_inode, 8110 new_dentry->d_name.name, 8111 new_dentry->d_name.len); 8112 } 8113 if (!ret && new_inode->i_nlink == 0) 8114 
ret = btrfs_orphan_add(trans, new_dentry->d_inode); 8115 if (ret) { 8116 btrfs_abort_transaction(trans, root, ret); 8117 goto out_fail; 8118 } 8119 } 8120 8121 ret = btrfs_add_link(trans, new_dir, old_inode, 8122 new_dentry->d_name.name, 8123 new_dentry->d_name.len, 0, index); 8124 if (ret) { 8125 btrfs_abort_transaction(trans, root, ret); 8126 goto out_fail; 8127 } 8128 8129 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 8130 struct dentry *parent = new_dentry->d_parent; 8131 btrfs_log_new_name(trans, old_inode, old_dir, parent); 8132 btrfs_end_log_trans(root); 8133 } 8134 out_fail: 8135 btrfs_end_transaction(trans, root); 8136 out_notrans: 8137 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 8138 up_read(&root->fs_info->subvol_sem); 8139 8140 return ret; 8141 } 8142 8143 static void btrfs_run_delalloc_work(struct btrfs_work *work) 8144 { 8145 struct btrfs_delalloc_work *delalloc_work; 8146 8147 delalloc_work = container_of(work, struct btrfs_delalloc_work, 8148 work); 8149 if (delalloc_work->wait) 8150 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); 8151 else 8152 filemap_flush(delalloc_work->inode->i_mapping); 8153 8154 if (delalloc_work->delay_iput) 8155 btrfs_add_delayed_iput(delalloc_work->inode); 8156 else 8157 iput(delalloc_work->inode); 8158 complete(&delalloc_work->completion); 8159 } 8160 8161 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, 8162 int wait, int delay_iput) 8163 { 8164 struct btrfs_delalloc_work *work; 8165 8166 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); 8167 if (!work) 8168 return NULL; 8169 8170 init_completion(&work->completion); 8171 INIT_LIST_HEAD(&work->list); 8172 work->inode = inode; 8173 work->wait = wait; 8174 work->delay_iput = delay_iput; 8175 work->work.func = btrfs_run_delalloc_work; 8176 8177 return work; 8178 } 8179 8180 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) 8181 { 8182 wait_for_completion(&work->completion); 8183 
kmem_cache_free(btrfs_delalloc_work_cachep, work); 8184 } 8185 8186 /* 8187 * some fairly slow code that needs optimization. This walks the list 8188 * of all the inodes with pending delalloc and forces them to disk. 8189 */ 8190 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8191 { 8192 struct btrfs_inode *binode; 8193 struct inode *inode; 8194 struct btrfs_delalloc_work *work, *next; 8195 struct list_head works; 8196 struct list_head splice; 8197 int ret = 0; 8198 8199 INIT_LIST_HEAD(&works); 8200 INIT_LIST_HEAD(&splice); 8201 8202 spin_lock(&root->delalloc_lock); 8203 list_splice_init(&root->delalloc_inodes, &splice); 8204 while (!list_empty(&splice)) { 8205 binode = list_entry(splice.next, struct btrfs_inode, 8206 delalloc_inodes); 8207 8208 list_move_tail(&binode->delalloc_inodes, 8209 &root->delalloc_inodes); 8210 inode = igrab(&binode->vfs_inode); 8211 if (!inode) { 8212 cond_resched_lock(&root->delalloc_lock); 8213 continue; 8214 } 8215 spin_unlock(&root->delalloc_lock); 8216 8217 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8218 if (unlikely(!work)) { 8219 if (delay_iput) 8220 btrfs_add_delayed_iput(inode); 8221 else 8222 iput(inode); 8223 ret = -ENOMEM; 8224 goto out; 8225 } 8226 list_add_tail(&work->list, &works); 8227 btrfs_queue_worker(&root->fs_info->flush_workers, 8228 &work->work); 8229 8230 cond_resched(); 8231 spin_lock(&root->delalloc_lock); 8232 } 8233 spin_unlock(&root->delalloc_lock); 8234 8235 list_for_each_entry_safe(work, next, &works, list) { 8236 list_del_init(&work->list); 8237 btrfs_wait_and_free_delalloc_work(work); 8238 } 8239 return 0; 8240 out: 8241 list_for_each_entry_safe(work, next, &works, list) { 8242 list_del_init(&work->list); 8243 btrfs_wait_and_free_delalloc_work(work); 8244 } 8245 8246 if (!list_empty_careful(&splice)) { 8247 spin_lock(&root->delalloc_lock); 8248 list_splice_tail(&splice, &root->delalloc_inodes); 8249 spin_unlock(&root->delalloc_lock); 8250 } 8251 return ret; 8252 } 
8253 8254 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8255 { 8256 int ret; 8257 8258 if (root->fs_info->sb->s_flags & MS_RDONLY) 8259 return -EROFS; 8260 8261 ret = __start_delalloc_inodes(root, delay_iput); 8262 /* 8263 * the filemap_flush will queue IO into the worker threads, but 8264 * we have to make sure the IO is actually started and that 8265 * ordered extents get created before we return 8266 */ 8267 atomic_inc(&root->fs_info->async_submit_draining); 8268 while (atomic_read(&root->fs_info->nr_async_submits) || 8269 atomic_read(&root->fs_info->async_delalloc_pages)) { 8270 wait_event(root->fs_info->async_submit_wait, 8271 (atomic_read(&root->fs_info->nr_async_submits) == 0 && 8272 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8273 } 8274 atomic_dec(&root->fs_info->async_submit_draining); 8275 return ret; 8276 } 8277 8278 int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, 8279 int delay_iput) 8280 { 8281 struct btrfs_root *root; 8282 struct list_head splice; 8283 int ret; 8284 8285 if (fs_info->sb->s_flags & MS_RDONLY) 8286 return -EROFS; 8287 8288 INIT_LIST_HEAD(&splice); 8289 8290 spin_lock(&fs_info->delalloc_root_lock); 8291 list_splice_init(&fs_info->delalloc_roots, &splice); 8292 while (!list_empty(&splice)) { 8293 root = list_first_entry(&splice, struct btrfs_root, 8294 delalloc_root); 8295 root = btrfs_grab_fs_root(root); 8296 BUG_ON(!root); 8297 list_move_tail(&root->delalloc_root, 8298 &fs_info->delalloc_roots); 8299 spin_unlock(&fs_info->delalloc_root_lock); 8300 8301 ret = __start_delalloc_inodes(root, delay_iput); 8302 btrfs_put_fs_root(root); 8303 if (ret) 8304 goto out; 8305 8306 spin_lock(&fs_info->delalloc_root_lock); 8307 } 8308 spin_unlock(&fs_info->delalloc_root_lock); 8309 8310 atomic_inc(&fs_info->async_submit_draining); 8311 while (atomic_read(&fs_info->nr_async_submits) || 8312 atomic_read(&fs_info->async_delalloc_pages)) { 8313 wait_event(fs_info->async_submit_wait, 8314 
(atomic_read(&fs_info->nr_async_submits) == 0 && 8315 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8316 } 8317 atomic_dec(&fs_info->async_submit_draining); 8318 return 0; 8319 out: 8320 if (!list_empty_careful(&splice)) { 8321 spin_lock(&fs_info->delalloc_root_lock); 8322 list_splice_tail(&splice, &fs_info->delalloc_roots); 8323 spin_unlock(&fs_info->delalloc_root_lock); 8324 } 8325 return ret; 8326 } 8327 8328 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 8329 const char *symname) 8330 { 8331 struct btrfs_trans_handle *trans; 8332 struct btrfs_root *root = BTRFS_I(dir)->root; 8333 struct btrfs_path *path; 8334 struct btrfs_key key; 8335 struct inode *inode = NULL; 8336 int err; 8337 int drop_inode = 0; 8338 u64 objectid; 8339 u64 index = 0 ; 8340 int name_len; 8341 int datasize; 8342 unsigned long ptr; 8343 struct btrfs_file_extent_item *ei; 8344 struct extent_buffer *leaf; 8345 8346 name_len = strlen(symname) + 1; 8347 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 8348 return -ENAMETOOLONG; 8349 8350 /* 8351 * 2 items for inode item and ref 8352 * 2 items for dir items 8353 * 1 item for xattr if selinux is on 8354 */ 8355 trans = btrfs_start_transaction(root, 5); 8356 if (IS_ERR(trans)) 8357 return PTR_ERR(trans); 8358 8359 err = btrfs_find_free_ino(root, &objectid); 8360 if (err) 8361 goto out_unlock; 8362 8363 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 8364 dentry->d_name.len, btrfs_ino(dir), objectid, 8365 S_IFLNK|S_IRWXUGO, &index); 8366 if (IS_ERR(inode)) { 8367 err = PTR_ERR(inode); 8368 goto out_unlock; 8369 } 8370 8371 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 8372 if (err) { 8373 drop_inode = 1; 8374 goto out_unlock; 8375 } 8376 8377 /* 8378 * If the active LSM wants to access the inode during 8379 * d_instantiate it needs these. Smack checks to see 8380 * if the filesystem supports xattrs by looking at the 8381 * ops vector. 
8382 */ 8383 inode->i_fop = &btrfs_file_operations; 8384 inode->i_op = &btrfs_file_inode_operations; 8385 8386 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 8387 if (err) 8388 drop_inode = 1; 8389 else { 8390 inode->i_mapping->a_ops = &btrfs_aops; 8391 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 8392 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 8393 } 8394 if (drop_inode) 8395 goto out_unlock; 8396 8397 path = btrfs_alloc_path(); 8398 if (!path) { 8399 err = -ENOMEM; 8400 drop_inode = 1; 8401 goto out_unlock; 8402 } 8403 key.objectid = btrfs_ino(inode); 8404 key.offset = 0; 8405 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 8406 datasize = btrfs_file_extent_calc_inline_size(name_len); 8407 err = btrfs_insert_empty_item(trans, root, path, &key, 8408 datasize); 8409 if (err) { 8410 drop_inode = 1; 8411 btrfs_free_path(path); 8412 goto out_unlock; 8413 } 8414 leaf = path->nodes[0]; 8415 ei = btrfs_item_ptr(leaf, path->slots[0], 8416 struct btrfs_file_extent_item); 8417 btrfs_set_file_extent_generation(leaf, ei, trans->transid); 8418 btrfs_set_file_extent_type(leaf, ei, 8419 BTRFS_FILE_EXTENT_INLINE); 8420 btrfs_set_file_extent_encryption(leaf, ei, 0); 8421 btrfs_set_file_extent_compression(leaf, ei, 0); 8422 btrfs_set_file_extent_other_encoding(leaf, ei, 0); 8423 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); 8424 8425 ptr = btrfs_file_extent_inline_start(ei); 8426 write_extent_buffer(leaf, symname, ptr, name_len); 8427 btrfs_mark_buffer_dirty(leaf); 8428 btrfs_free_path(path); 8429 8430 inode->i_op = &btrfs_symlink_inode_operations; 8431 inode->i_mapping->a_ops = &btrfs_symlink_aops; 8432 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 8433 inode_set_bytes(inode, name_len); 8434 btrfs_i_size_write(inode, name_len - 1); 8435 err = btrfs_update_inode(trans, root, inode); 8436 if (err) 8437 drop_inode = 1; 8438 8439 out_unlock: 8440 if (!err) 8441 d_instantiate(dentry, inode); 8442 btrfs_end_transaction(trans, 
root); 8443 if (drop_inode) { 8444 inode_dec_link_count(inode); 8445 iput(inode); 8446 } 8447 btrfs_btree_balance_dirty(root); 8448 return err; 8449 } 8450 8451 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 8452 u64 start, u64 num_bytes, u64 min_size, 8453 loff_t actual_len, u64 *alloc_hint, 8454 struct btrfs_trans_handle *trans) 8455 { 8456 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 8457 struct extent_map *em; 8458 struct btrfs_root *root = BTRFS_I(inode)->root; 8459 struct btrfs_key ins; 8460 u64 cur_offset = start; 8461 u64 i_size; 8462 u64 cur_bytes; 8463 int ret = 0; 8464 bool own_trans = true; 8465 8466 if (trans) 8467 own_trans = false; 8468 while (num_bytes > 0) { 8469 if (own_trans) { 8470 trans = btrfs_start_transaction(root, 3); 8471 if (IS_ERR(trans)) { 8472 ret = PTR_ERR(trans); 8473 break; 8474 } 8475 } 8476 8477 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); 8478 cur_bytes = max(cur_bytes, min_size); 8479 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 8480 *alloc_hint, &ins, 1); 8481 if (ret) { 8482 if (own_trans) 8483 btrfs_end_transaction(trans, root); 8484 break; 8485 } 8486 8487 ret = insert_reserved_file_extent(trans, inode, 8488 cur_offset, ins.objectid, 8489 ins.offset, ins.offset, 8490 ins.offset, 0, 0, 0, 8491 BTRFS_FILE_EXTENT_PREALLOC); 8492 if (ret) { 8493 btrfs_abort_transaction(trans, root, ret); 8494 if (own_trans) 8495 btrfs_end_transaction(trans, root); 8496 break; 8497 } 8498 btrfs_drop_extent_cache(inode, cur_offset, 8499 cur_offset + ins.offset -1, 0); 8500 8501 em = alloc_extent_map(); 8502 if (!em) { 8503 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 8504 &BTRFS_I(inode)->runtime_flags); 8505 goto next; 8506 } 8507 8508 em->start = cur_offset; 8509 em->orig_start = cur_offset; 8510 em->len = ins.offset; 8511 em->block_start = ins.objectid; 8512 em->block_len = ins.offset; 8513 em->orig_block_len = ins.offset; 8514 em->ram_bytes = ins.offset; 8515 em->bdev = 
root->fs_info->fs_devices->latest_bdev; 8516 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 8517 em->generation = trans->transid; 8518 8519 while (1) { 8520 write_lock(&em_tree->lock); 8521 ret = add_extent_mapping(em_tree, em, 1); 8522 write_unlock(&em_tree->lock); 8523 if (ret != -EEXIST) 8524 break; 8525 btrfs_drop_extent_cache(inode, cur_offset, 8526 cur_offset + ins.offset - 1, 8527 0); 8528 } 8529 free_extent_map(em); 8530 next: 8531 num_bytes -= ins.offset; 8532 cur_offset += ins.offset; 8533 *alloc_hint = ins.objectid + ins.offset; 8534 8535 inode_inc_iversion(inode); 8536 inode->i_ctime = CURRENT_TIME; 8537 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 8538 if (!(mode & FALLOC_FL_KEEP_SIZE) && 8539 (actual_len > inode->i_size) && 8540 (cur_offset > inode->i_size)) { 8541 if (cur_offset > actual_len) 8542 i_size = actual_len; 8543 else 8544 i_size = cur_offset; 8545 i_size_write(inode, i_size); 8546 btrfs_ordered_update_i_size(inode, i_size, NULL); 8547 } 8548 8549 ret = btrfs_update_inode(trans, root, inode); 8550 8551 if (ret) { 8552 btrfs_abort_transaction(trans, root, ret); 8553 if (own_trans) 8554 btrfs_end_transaction(trans, root); 8555 break; 8556 } 8557 8558 if (own_trans) 8559 btrfs_end_transaction(trans, root); 8560 } 8561 return ret; 8562 } 8563 8564 int btrfs_prealloc_file_range(struct inode *inode, int mode, 8565 u64 start, u64 num_bytes, u64 min_size, 8566 loff_t actual_len, u64 *alloc_hint) 8567 { 8568 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 8569 min_size, actual_len, alloc_hint, 8570 NULL); 8571 } 8572 8573 int btrfs_prealloc_file_range_trans(struct inode *inode, 8574 struct btrfs_trans_handle *trans, int mode, 8575 u64 start, u64 num_bytes, u64 min_size, 8576 loff_t actual_len, u64 *alloc_hint) 8577 { 8578 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 8579 min_size, actual_len, alloc_hint, trans); 8580 } 8581 8582 static int btrfs_set_page_dirty(struct page *page) 8583 { 8584 return 
__set_page_dirty_nobuffers(page); 8585 } 8586 8587 static int btrfs_permission(struct inode *inode, int mask) 8588 { 8589 struct btrfs_root *root = BTRFS_I(inode)->root; 8590 umode_t mode = inode->i_mode; 8591 8592 if (mask & MAY_WRITE && 8593 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { 8594 if (btrfs_root_readonly(root)) 8595 return -EROFS; 8596 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) 8597 return -EACCES; 8598 } 8599 return generic_permission(inode, mask); 8600 } 8601 8602 static const struct inode_operations btrfs_dir_inode_operations = { 8603 .getattr = btrfs_getattr, 8604 .lookup = btrfs_lookup, 8605 .create = btrfs_create, 8606 .unlink = btrfs_unlink, 8607 .link = btrfs_link, 8608 .mkdir = btrfs_mkdir, 8609 .rmdir = btrfs_rmdir, 8610 .rename = btrfs_rename, 8611 .symlink = btrfs_symlink, 8612 .setattr = btrfs_setattr, 8613 .mknod = btrfs_mknod, 8614 .setxattr = btrfs_setxattr, 8615 .getxattr = btrfs_getxattr, 8616 .listxattr = btrfs_listxattr, 8617 .removexattr = btrfs_removexattr, 8618 .permission = btrfs_permission, 8619 .get_acl = btrfs_get_acl, 8620 .update_time = btrfs_update_time, 8621 }; 8622 static const struct inode_operations btrfs_dir_ro_inode_operations = { 8623 .lookup = btrfs_lookup, 8624 .permission = btrfs_permission, 8625 .get_acl = btrfs_get_acl, 8626 .update_time = btrfs_update_time, 8627 }; 8628 8629 static const struct file_operations btrfs_dir_file_operations = { 8630 .llseek = generic_file_llseek, 8631 .read = generic_read_dir, 8632 .iterate = btrfs_real_readdir, 8633 .unlocked_ioctl = btrfs_ioctl, 8634 #ifdef CONFIG_COMPAT 8635 .compat_ioctl = btrfs_ioctl, 8636 #endif 8637 .release = btrfs_release_file, 8638 .fsync = btrfs_sync_file, 8639 }; 8640 8641 static struct extent_io_ops btrfs_extent_io_ops = { 8642 .fill_delalloc = run_delalloc_range, 8643 .submit_bio_hook = btrfs_submit_bio_hook, 8644 .merge_bio_hook = btrfs_merge_bio_hook, 8645 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 8646 .writepage_end_io_hook 
= btrfs_writepage_end_io_hook, 8647 .writepage_start_hook = btrfs_writepage_start_hook, 8648 .set_bit_hook = btrfs_set_bit_hook, 8649 .clear_bit_hook = btrfs_clear_bit_hook, 8650 .merge_extent_hook = btrfs_merge_extent_hook, 8651 .split_extent_hook = btrfs_split_extent_hook, 8652 }; 8653 8654 /* 8655 * btrfs doesn't support the bmap operation because swapfiles 8656 * use bmap to make a mapping of extents in the file. They assume 8657 * these extents won't change over the life of the file and they 8658 * use the bmap result to do IO directly to the drive. 8659 * 8660 * the btrfs bmap call would return logical addresses that aren't 8661 * suitable for IO and they also will change frequently as COW 8662 * operations happen. So, swapfile + btrfs == corruption. 8663 * 8664 * For now we're avoiding this by dropping bmap. 8665 */ 8666 static const struct address_space_operations btrfs_aops = { 8667 .readpage = btrfs_readpage, 8668 .writepage = btrfs_writepage, 8669 .writepages = btrfs_writepages, 8670 .readpages = btrfs_readpages, 8671 .direct_IO = btrfs_direct_IO, 8672 .invalidatepage = btrfs_invalidatepage, 8673 .releasepage = btrfs_releasepage, 8674 .set_page_dirty = btrfs_set_page_dirty, 8675 .error_remove_page = generic_error_remove_page, 8676 }; 8677 8678 static const struct address_space_operations btrfs_symlink_aops = { 8679 .readpage = btrfs_readpage, 8680 .writepage = btrfs_writepage, 8681 .invalidatepage = btrfs_invalidatepage, 8682 .releasepage = btrfs_releasepage, 8683 }; 8684 8685 static const struct inode_operations btrfs_file_inode_operations = { 8686 .getattr = btrfs_getattr, 8687 .setattr = btrfs_setattr, 8688 .setxattr = btrfs_setxattr, 8689 .getxattr = btrfs_getxattr, 8690 .listxattr = btrfs_listxattr, 8691 .removexattr = btrfs_removexattr, 8692 .permission = btrfs_permission, 8693 .fiemap = btrfs_fiemap, 8694 .get_acl = btrfs_get_acl, 8695 .update_time = btrfs_update_time, 8696 }; 8697 static const struct inode_operations 
btrfs_special_inode_operations = { 8698 .getattr = btrfs_getattr, 8699 .setattr = btrfs_setattr, 8700 .permission = btrfs_permission, 8701 .setxattr = btrfs_setxattr, 8702 .getxattr = btrfs_getxattr, 8703 .listxattr = btrfs_listxattr, 8704 .removexattr = btrfs_removexattr, 8705 .get_acl = btrfs_get_acl, 8706 .update_time = btrfs_update_time, 8707 }; 8708 static const struct inode_operations btrfs_symlink_inode_operations = { 8709 .readlink = generic_readlink, 8710 .follow_link = page_follow_link_light, 8711 .put_link = page_put_link, 8712 .getattr = btrfs_getattr, 8713 .setattr = btrfs_setattr, 8714 .permission = btrfs_permission, 8715 .setxattr = btrfs_setxattr, 8716 .getxattr = btrfs_getxattr, 8717 .listxattr = btrfs_listxattr, 8718 .removexattr = btrfs_removexattr, 8719 .get_acl = btrfs_get_acl, 8720 .update_time = btrfs_update_time, 8721 }; 8722 8723 const struct dentry_operations btrfs_dentry_operations = { 8724 .d_delete = btrfs_dentry_delete, 8725 .d_release = btrfs_dentry_release, 8726 }; 8727