/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "volumes.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
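	/*
	 * Apply any inherited ACLs first (this can adjust the new
	 * inode's mode), then let the security module attach its
	 * xattrs against the final attributes.
	 */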
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;

		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage, KM_USER0);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr, KM_USER0);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page, KM_USER0);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr, KM_USER0);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	btrfs_update_inode(trans, root, inode);

	return 0;
fail:
	btrfs_free_path(path);
	return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
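 *
 * To qualify, the data must start at file offset 0, end inside the
 * first page, reach all the way to i_size, and fit under both
 * BTRFS_MAX_INLINE_DATA_SIZE() and the max_inline mount limit;
 * anything else returns 1 and gets written as a regular extent.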
 */
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode, u64 start, u64 end,
				 size_t compressed_size, int compress_type,
				 struct page **compressed_pages)
{
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = (end + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
	u64 hint_byte;
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	     (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
				 &hint_byte, 1);
	BUG_ON(ret);

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	BUG_ON(ret);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
	return 0;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent);
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that pdflush sent them down.
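 *
 * Phase two is submit_compressed_extents() below; it runs from the
 * work item's ordered_func (see async_cow_submit) once this phase
 * has queued its async_extents.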
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
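	 * (A per-inode force_compress, e.g. set by the defrag ioctl,
	 * also enables compression here and selects the algorithm even
	 * when the filesystem wasn't mounted with -o compress.)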
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress) ||
	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		BUG_ON(!pages);

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr, KM_USER0);
			}
			will_compress = 1;
		}
	}
	if (start == 0) {
		trans = btrfs_join_transaction(root, 1);
		BUG_ON(IS_ERR(trans));
		btrfs_set_trans_block_group(trans, inode);
		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

		/* let's try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end, 0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(trans, root, inode,
						    start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret == 0) {
			/*
			 * inline extent creation worked, we don't need
			 * to create any more async work items.  Unlock
			 * and free up our temp pages.
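			 *
			 * (The clear mask below also ends writeback on
			 * the pages right away: the data now lives in
			 * the btree leaf rather than in a disk extent.)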
			 */
			extent_clear_unlock_delalloc(inode,
			     &BTRFS_I(inode)->io_tree,
			     start, end, NULL,
			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
			     EXTENT_CLEAR_DELALLOC |
			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

			btrfs_end_transaction(trans, root);
			goto free_pages_out;
		}
		btrfs_end_transaction(trans, root);
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = (total_compressed + blocksize - 1) &
			~(blocksize - 1);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
			~(PAGE_CACHE_SIZE - 1);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return 0;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
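 *
 * Ranges that the first phase gave up on (async_extent->pages is
 * NULL) fall back to the plain cow_file_range() path in here, so the
 * on-disk ordering is preserved either way.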
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_trans_handle *trans;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;

	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
				    async_extent->start +
				    async_extent->ram_size - 1, GFP_NOFS);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1,
			    GFP_NOFS);

		trans = btrfs_join_transaction(root, 1);
		BUG_ON(IS_ERR(trans));
		ret = btrfs_reserve_extent(trans, root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint,
					   (u64)-1, &ins, 1);
		btrfs_end_transaction(trans, root);

		if (ret) {
			int i;

			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;
			unlock_extent(io_tree, async_extent->start,
				      async_extent->start +
				      async_extent->ram_size - 1, GFP_NOFS);
			goto retry;
		}

		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map(GFP_NOFS);
		BUG_ON(!em);
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
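
		/*
		 * add_extent_mapping() returns -EEXIST while any stale
		 * cached mapping still overlaps this range, so keep
		 * dropping the cache until the new mapping goes in.
		 */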
		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		BUG_ON(ret);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode,
				&BTRFS_I(inode)->io_tree,
				async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK |
				EXTENT_CLEAR_DELALLOC |
				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);

		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);

		BUG_ON(ret);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}

	return 0;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
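 *
 * (The unlock argument picks whether each finished chunk also unlocks
 * its pages: the nocow path passes 1, while the uncompressed fallback
 * in submit_compressed_extents() passes 0 so it can submit the locked
 * pages itself.)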
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(root == root->fs_info->tree_root);
	trans = btrfs_join_transaction(root, 1);
	BUG_ON(IS_ERR(trans));
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
	num_bytes = max(blocksize, num_bytes);
	disk_num_bytes = num_bytes;
	ret = 0;

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(trans, root, inode,
					    start, end, 0, 0, NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode,
				     &BTRFS_I(inode)->io_tree,
				     start, end, NULL,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK |
				     EXTENT_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			ret = 0;
			goto out;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(&root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   (u64)-1, &ins, 1);
		BUG_ON(ret);

		em = alloc_extent_map(GFP_NOFS);
		BUG_ON(!em);
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			BUG_ON(ret);
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
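		 *
		 * (btrfs_writepage_start_hook() later test-and-clears
		 * Private2 to tell properly set up pages apart from
		 * pages dirtied behind the filesystem's back.)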
		 */
		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
			EXTENT_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, start + ram_size - 1,
					     locked_page, op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	ret = 0;
	btrfs_end_transaction(trans, root);

	return ret;
}

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;

	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0)
		async_cow->inode = NULL;
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);

	if (atomic_read(&root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;

	async_cow = container_of(work, struct async_cow, work);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);
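
		/*
		 * Throttle: once more than "limit" delalloc pages are
		 * queued to the async workers, wait here for them to
		 * drain back under the limit before queueing more.
		 */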
		if (atomic_read(&root->fs_info->async_delalloc_pages) >
		    limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		       atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * when nocow writeback calls back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
				       u64 start, u64 end, int *page_started,
				       int force, unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	int extent_type;
	int ret;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock = false;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	if (root == root->fs_info->tree_root) {
		nolock = true;
		trans = btrfs_join_transaction_nolock(root, 1);
	} else {
		trans = btrfs_join_transaction(root, 1);
	}
	BUG_ON(IS_ERR(trans));

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       cur_offset, 0);
		BUG_ON(ret < 0);
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == inode->i_ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				BUG_ON(1);
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > inode->i_ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);
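
		/*
		 * Decide whether this extent can be written in place:
		 * failing any check below lands at out_check with
		 * nocow == 0 and sends the range through the COW path.
		 */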
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if a csum exists in the range.
			 * this ensures that the csums for a given extent
			 * are either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(root, path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page, cow_start,
					     found_key.offset - 1,
					     page_started, nr_written, 1);
			BUG_ON(ret);
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;

			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map(GFP_NOFS);
			BUG_ON(!em);
			em->start = cur_offset;
			em->orig_start = em->start;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret);

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			BUG_ON(ret);
		}

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				cur_offset, cur_offset + num_bytes - 1,
				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
				EXTENT_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(root, path);
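
	/*
	 * COW whatever is left at the tail of the range; everything
	 * before cur_offset has already been handed to an ordered
	 * extent above.
	 */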
	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;
	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		BUG_ON(ret);
	}

	if (nolock) {
		ret = btrfs_end_transaction_nolock(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_end_transaction(trans, root);
		BUG_ON(ret);
	}
	btrfs_free_path(path);
	return 0;
}

/*
 * extent_io.c call back to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	else if (!btrfs_test_opt(root, COMPRESS) &&
		 !(BTRFS_I(inode)->force_compress) &&
		 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
	else
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	return ret;
}

static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return 0;

	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
	return 0;
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static int btrfs_merge_extent_hook(struct inode *inode,
				   struct extent_state *new,
				   struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return 0;

	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
	return 0;
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static int btrfs_set_bit_hook(struct inode *inode,
			      struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		int do_list = (root->root_key.objectid !=
			       BTRFS_ROOT_TREE_OBJECTID);

		if (*bits & EXTENT_FIRST_DELALLOC)
			*bits &= ~EXTENT_FIRST_DELALLOC;
		else
			atomic_inc(&BTRFS_I(inode)->outstanding_extents);

		spin_lock(&root->fs_info->delalloc_lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		root->fs_info->delalloc_bytes += len;
		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
				      &root->fs_info->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static int btrfs_clear_bit_hook(struct inode *inode,
				struct extent_state *state, int *bits)
{
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		int do_list = (root->root_key.objectid !=
			       BTRFS_ROOT_TREE_OBJECTID);

		if (*bits & EXTENT_FIRST_DELALLOC)
			*bits &= ~EXTENT_FIRST_DELALLOC;
		else if (!(*bits & EXTENT_DO_ACCOUNTING))
			atomic_dec(&BTRFS_I(inode)->outstanding_extents);

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list)
			btrfs_free_reserved_data_space(inode, len);

		spin_lock(&root->fs_info->delalloc_lock);
		root->fs_info->delalloc_bytes -= len;
		BTRFS_I(inode)->delalloc_bytes -= len;

		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		}
		spin_unlock(&root->fs_info->delalloc_lock);
	}
	return 0;
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
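 *
 * (btrfs_map_block() reports how many contiguous bytes the chunk
 * mapping provides at "logical"; if the bio's current size plus the
 * new page would cross that boundary, returning 1 makes the caller
 * start a new bio.)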
 */
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_mapping_tree *map_tree;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_tree = &root->fs_info->mapping_tree;
	map_length = length;
	ret = btrfs_map_block(map_tree, READ, logical,
			      &map_length, NULL, 0);

	if (map_length < length + size)
		return 1;
	return ret;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret);
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
				   int mirror_num, unsigned long bio_flags,
				   u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}

/*
 * extent_io.c submission hook.  This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (root == root->fs_info->tree_root)
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
	else
		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
	BUG_ON(ret);

	if (!(rw & REQ_WRITE)) {
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			return btrfs_submit_compressed_read(inode, bio,
						    mirror_num, bio_flags);
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
			if (ret)
				return ret;
		}
		goto mapit;
	} else if (!skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
				   inode, rw, bio, mirror_num,
				   bio_flags, bio_offset,
				   __btrfs_submit_bio_start,
				   __btrfs_submit_bio_done);
	}

mapit:
	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}

/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
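 *
 * (This is the completion half of that pairing: the sums were created
 * by btrfs_csum_one_bio() in __btrfs_submit_bio_start, and
 * btrfs_finish_ordered_io() hands them to us here.)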
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
				      struct inode *inode, u64 file_offset,
				      struct list_head *list)
{
	struct btrfs_ordered_sum *sum;

	btrfs_set_trans_block_group(trans, inode);

	list_for_each_entry(sum, list, list) {
		btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
	}
	return 0;
}

int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
		WARN_ON(1);
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   cached_state, GFP_NOFS);
}

/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct btrfs_work work;
};

static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
		goto out_page;
	}

	inode = page->mapping->host;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
			 &cached_state, GFP_NOFS);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out;

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
	ClearPageChecked(page);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state, GFP_NOFS);
out_page:
	unlock_page(page);
	page_cache_release(page);
	kfree(fixup);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
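 *
 * (Returning -EAGAIN from the hook tells the caller to skip the page
 * for now; once the worker has set the delalloc bits, a later
 * writepage pass can handle it normally.)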
 */
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	SetPageChecked(page);
	page_cache_get(page);
	fixup->work.func = btrfs_writepage_fixup_worker;
	fixup->page = page;
	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
	return -EAGAIN;
}

static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 hint;
	int ret;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	path->leave_spinning = 1;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
				 &hint, 0);
	BUG_ON(ret);

	ins.objectid = inode->i_ino;
	ins.offset = file_pos;
	ins.type = BTRFS_EXTENT_DATA_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
	BUG_ON(ret);
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_unlock_up_safe(path, 1);
	btrfs_set_lock_blocking(leaf);

	btrfs_mark_buffer_dirty(leaf);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       root->root_key.objectid,
					       inode->i_ino, file_pos, &ins);
	BUG_ON(ret);
	btrfs_free_path(path);

	return 0;
}

/*
 * as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	int compress_type = 0;
	int ret;
	bool nolock = false;

	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					     end - start + 1);
	if (!ret)
		return 0;
	BUG_ON(!ordered_extent);

	nolock = (root == root->fs_info->tree_root);

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list));
		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (!ret) {
			if (nolock)
				trans = btrfs_join_transaction_nolock(root, 1);
			else
				trans = btrfs_join_transaction(root, 1);
			BUG_ON(IS_ERR(trans));
			btrfs_set_trans_block_group(trans, inode);
			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
			ret = btrfs_update_inode(trans, root, inode);
			BUG_ON(ret);
		}
		goto out;
	}

	lock_extent_bits(io_tree, ordered_extent->file_offset,
			 ordered_extent->file_offset + ordered_extent->len - 1,
			 0, &cached_state, GFP_NOFS);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root, 1);
	else
		trans = btrfs_join_transaction(root, 1);
	BUG_ON(IS_ERR(trans));
	btrfs_set_trans_block_group(trans, inode);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						ordered_extent->len);
		BUG_ON(ret);
	} else {
		BUG_ON(root == root->fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->start,
						ordered_extent->disk_len,
						ordered_extent->len,
						ordered_extent->len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				   ordered_extent->file_offset,
				   ordered_extent->len);
		BUG_ON(ret);
	}
	unlock_extent_cached(io_tree, ordered_extent->file_offset,
			     ordered_extent->file_offset +
			     ordered_extent->len - 1, &cached_state, GFP_NOFS);

	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);

	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
	if (!ret) {
		ret = btrfs_update_inode(trans, root, inode);
		BUG_ON(ret);
	}
	ret = 0;
out:
	if (nolock) {
		if (trans)
			btrfs_end_transaction_nolock(trans, root);
	} else {
		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
		if (trans)
			btrfs_end_transaction(trans, root);
	}

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return 0;
}
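
/*
 * writeback completion hook: clear the Private2 "properly set up for
 * writepage" marker and let btrfs_finish_ordered_io() decide whether
 * the covering ordered extent is now fully written.
 */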
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				       struct extent_state *state, int uptodate)
{
	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);

	ClearPagePrivate2(page);
	return btrfs_finish_ordered_io(page->mapping->host, start, end);
}

/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int last_mirror;
};

static int btrfs_io_failed_hook(struct bio *failed_bio,
				struct page *page, u64 start, u64 end,
				struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int rw;
	u64 logical;

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->last_mirror = 0;
		failrec->bio_flags = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (em && (em->start > start || em->start + em->len < start)) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em || IS_ERR(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		failrec->logical = logical;
		free_extent_map(em);
		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
				EXTENT_DIRTY, GFP_NOFS);
		set_state_private(failure_tree, start,
				  (u64)(unsigned long)failrec);
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	failrec->last_mirror++;
	if (!state) {
		spin_lock(&BTRFS_I(inode)->io_tree.lock);
		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
						    failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
	}
	if (!state || failrec->last_mirror > num_copies) {
		set_state_private(failure_tree, failrec->start, 0);
		clear_extent_bits(failure_tree, failrec->start,
				  failrec->start + failrec->len - 1,
				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		kfree(failrec);
		return -EIO;
	}
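
	/*
	 * Build a fresh single-page bio aimed at the same logical range
	 * and resubmit it, this time targeting mirror
	 * failrec->last_mirror.
	 */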
bio->bi_sector = failrec->logical >> 9; 1904 bio->bi_bdev = failed_bio->bi_bdev; 1905 bio->bi_size = 0; 1906 1907 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1908 if (failed_bio->bi_rw & REQ_WRITE) 1909 rw = WRITE; 1910 else 1911 rw = READ; 1912 1913 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1914 failrec->last_mirror, 1915 failrec->bio_flags, 0); 1916 return ret; 1917 } 1918 1919 /* 1920 * each time an IO finishes, we do a fast check in the IO failure tree 1921 * to see if we need to process or clean up an io_failure_record 1922 */ 1923 static int btrfs_clean_io_failures(struct inode *inode, u64 start) 1924 { 1925 u64 private; 1926 u64 private_failure; 1927 struct io_failure_record *failure; 1928 int ret; 1929 1930 private = 0; 1931 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1932 (u64)-1, 1, EXTENT_DIRTY, 0)) { 1933 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1934 start, &private_failure); 1935 if (ret == 0) { 1936 failure = (struct io_failure_record *)(unsigned long) 1937 private_failure; 1938 set_state_private(&BTRFS_I(inode)->io_failure_tree, 1939 failure->start, 0); 1940 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, 1941 failure->start, 1942 failure->start + failure->len - 1, 1943 EXTENT_DIRTY | EXTENT_LOCKED, 1944 GFP_NOFS); 1945 kfree(failure); 1946 } 1947 } 1948 return 0; 1949 } 1950 1951 /* 1952 * when reads are done, we need to check csums to verify the data is correct 1953 * if there's a match, we allow the bio to finish. If not, we go through 1954 * the io_failure_record routines to find good copies 1955 */ 1956 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1957 struct extent_state *state) 1958 { 1959 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); 1960 struct inode *inode = page->mapping->host; 1961 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1962 char *kaddr; 1963 u64 private = ~(u32)0; 1964 int ret; 1965 struct btrfs_root *root = BTRFS_I(inode)->root; 1966 u32 csum = ~(u32)0; 1967 1968 if (PageChecked(page)) { 1969 ClearPageChecked(page); 1970 goto good; 1971 } 1972 1973 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 1974 return 0; 1975 1976 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1977 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 1978 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, 1979 GFP_NOFS); 1980 return 0; 1981 } 1982 1983 if (state && state->start == start) { 1984 private = state->private; 1985 ret = 0; 1986 } else { 1987 ret = get_state_private(io_tree, start, &private); 1988 } 1989 kaddr = kmap_atomic(page, KM_USER0); 1990 if (ret) 1991 goto zeroit; 1992 1993 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); 1994 btrfs_csum_final(csum, (char *)&csum); 1995 if (csum != private) 1996 goto zeroit; 1997 1998 kunmap_atomic(kaddr, KM_USER0); 1999 good: 2000 /* if the io failure tree for this inode is non-empty, 2001 * check to see if we've recovered from a failed IO 2002 */ 2003 btrfs_clean_io_failures(inode, start); 2004 return 0; 2005 2006 zeroit: 2007 if (printk_ratelimit()) { 2008 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 2009 "private %llu\n", page->mapping->host->i_ino, 2010 (unsigned long long)start, csum, 2011 (unsigned long long)private); 2012 } 2013 memset(kaddr + offset, 1, end - start + 1); 2014 flush_dcache_page(page); 2015 kunmap_atomic(kaddr, KM_USER0); 2016 if (private == 0) 2017 return 0; 2018 return -EIO; 2019 
} 2020 2021 struct delayed_iput { 2022 struct list_head list; 2023 struct inode *inode; 2024 }; 2025 2026 void btrfs_add_delayed_iput(struct inode *inode) 2027 { 2028 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2029 struct delayed_iput *delayed; 2030 2031 if (atomic_add_unless(&inode->i_count, -1, 1)) 2032 return; 2033 2034 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); 2035 delayed->inode = inode; 2036 2037 spin_lock(&fs_info->delayed_iput_lock); 2038 list_add_tail(&delayed->list, &fs_info->delayed_iputs); 2039 spin_unlock(&fs_info->delayed_iput_lock); 2040 } 2041 2042 void btrfs_run_delayed_iputs(struct btrfs_root *root) 2043 { 2044 LIST_HEAD(list); 2045 struct btrfs_fs_info *fs_info = root->fs_info; 2046 struct delayed_iput *delayed; 2047 int empty; 2048 2049 spin_lock(&fs_info->delayed_iput_lock); 2050 empty = list_empty(&fs_info->delayed_iputs); 2051 spin_unlock(&fs_info->delayed_iput_lock); 2052 if (empty) 2053 return; 2054 2055 down_read(&root->fs_info->cleanup_work_sem); 2056 spin_lock(&fs_info->delayed_iput_lock); 2057 list_splice_init(&fs_info->delayed_iputs, &list); 2058 spin_unlock(&fs_info->delayed_iput_lock); 2059 2060 while (!list_empty(&list)) { 2061 delayed = list_entry(list.next, struct delayed_iput, list); 2062 list_del(&delayed->list); 2063 iput(delayed->inode); 2064 kfree(delayed); 2065 } 2066 up_read(&root->fs_info->cleanup_work_sem); 2067 } 2068 2069 /* 2070 * calculate the extra metadata reservation needed when snapshotting a 2071 * subvolume that contains orphan files. 2072 */ 2073 void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, 2074 struct btrfs_pending_snapshot *pending, 2075 u64 *bytes_to_reserve) 2076 { 2077 struct btrfs_root *root; 2078 struct btrfs_block_rsv *block_rsv; 2079 u64 num_bytes; 2080 int index; 2081 2082 root = pending->root; 2083 if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) 2084 return; 2085 2086 block_rsv = root->orphan_block_rsv; 2087 2088 /* orphan block reservation for the snapshot */ 2089 num_bytes = block_rsv->size; 2090 2091 /* 2092 * after the snapshot is created, COWing tree blocks may use more 2093 * space than it frees. So we should make sure there is enough 2094 * reserved space.
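 *
 * A worked example with illustrative numbers only: if block_rsv->size
 * is 64K while reserved is 16K and freed[index] is 32K, the rsv is
 * 16K short, so the check below adds 64K - (16K + 32K) = 16K on top
 * of the base 64K and asks the caller to reserve 80K in total.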
2095 */ 2096 index = trans->transid & 0x1; 2097 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { 2098 num_bytes += block_rsv->size - 2099 (block_rsv->reserved + block_rsv->freed[index]); 2100 } 2101 2102 *bytes_to_reserve += num_bytes; 2103 } 2104 2105 void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, 2106 struct btrfs_pending_snapshot *pending) 2107 { 2108 struct btrfs_root *root = pending->root; 2109 struct btrfs_root *snap = pending->snap; 2110 struct btrfs_block_rsv *block_rsv; 2111 u64 num_bytes; 2112 int index; 2113 int ret; 2114 2115 if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) 2116 return; 2117 2118 /* refill source subvolume's orphan block reservation */ 2119 block_rsv = root->orphan_block_rsv; 2120 index = trans->transid & 0x1; 2121 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { 2122 num_bytes = block_rsv->size - 2123 (block_rsv->reserved + block_rsv->freed[index]); 2124 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 2125 root->orphan_block_rsv, 2126 num_bytes); 2127 BUG_ON(ret); 2128 } 2129 2130 /* setup orphan block reservation for the snapshot */ 2131 block_rsv = btrfs_alloc_block_rsv(snap); 2132 BUG_ON(!block_rsv); 2133 2134 btrfs_add_durable_block_rsv(root->fs_info, block_rsv); 2135 snap->orphan_block_rsv = block_rsv; 2136 2137 num_bytes = root->orphan_block_rsv->size; 2138 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 2139 block_rsv, num_bytes); 2140 BUG_ON(ret); 2141 2142 #if 0 2143 /* insert orphan item for the snapshot */ 2144 WARN_ON(!root->orphan_item_inserted); 2145 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2146 snap->root_key.objectid); 2147 BUG_ON(ret); 2148 snap->orphan_item_inserted = 1; 2149 #endif 2150 } 2151 2152 enum btrfs_orphan_cleanup_state { 2153 ORPHAN_CLEANUP_STARTED = 1, 2154 ORPHAN_CLEANUP_DONE = 2, 2155 }; 2156 2157 /* 2158 * This is called at transaction commit time. If there are no orphan 2159 * files in the subvolume, it removes the orphan item and frees the 2160 * block_rsv structure. 2161 */ 2162 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2163 struct btrfs_root *root) 2164 { 2165 int ret; 2166 2167 if (!list_empty(&root->orphan_list) || 2168 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2169 return; 2170 2171 if (root->orphan_item_inserted && 2172 btrfs_root_refs(&root->root_item) > 0) { 2173 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 2174 root->root_key.objectid); 2175 BUG_ON(ret); 2176 root->orphan_item_inserted = 0; 2177 } 2178 2179 if (root->orphan_block_rsv) { 2180 WARN_ON(root->orphan_block_rsv->size > 0); 2181 btrfs_free_block_rsv(root, root->orphan_block_rsv); 2182 root->orphan_block_rsv = NULL; 2183 } 2184 } 2185 2186 /* 2187 * This creates an orphan entry for the given inode in case something goes 2188 * wrong in the middle of an unlink/truncate. 2189 * 2190 * NOTE: the caller should reserve 5 units of metadata before calling 2191 * this function.
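 *
 * As an illustrative sketch (not part of the original source), a
 * truncate-style caller brackets its work with the add/del pair, the
 * 5 units already folded into its transaction reservation:
 *
 *	trans = btrfs_start_transaction(root, 5);
 *	ret = btrfs_orphan_add(trans, inode);	<- survive a crash mid-op
 *	... do the truncate work ...
 *	ret = btrfs_orphan_del(trans, inode);	<- finished, drop tracking
 *	btrfs_end_transaction(trans, root);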
2192 */ 2193 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2194 { 2195 struct btrfs_root *root = BTRFS_I(inode)->root; 2196 struct btrfs_block_rsv *block_rsv = NULL; 2197 int reserve = 0; 2198 int insert = 0; 2199 int ret; 2200 2201 if (!root->orphan_block_rsv) { 2202 block_rsv = btrfs_alloc_block_rsv(root); 2203 BUG_ON(!block_rsv); 2204 } 2205 2206 spin_lock(&root->orphan_lock); 2207 if (!root->orphan_block_rsv) { 2208 root->orphan_block_rsv = block_rsv; 2209 } else if (block_rsv) { 2210 btrfs_free_block_rsv(root, block_rsv); 2211 block_rsv = NULL; 2212 } 2213 2214 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2215 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2216 #if 0 2217 /* 2218 * For proper ENOSPC handling, we should do orphan 2219 * cleanup when mounting. But this introduces backward 2220 * compatibility issue. 2221 */ 2222 if (!xchg(&root->orphan_item_inserted, 1)) 2223 insert = 2; 2224 else 2225 insert = 1; 2226 #endif 2227 insert = 1; 2228 } 2229 2230 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2231 BTRFS_I(inode)->orphan_meta_reserved = 1; 2232 reserve = 1; 2233 } 2234 spin_unlock(&root->orphan_lock); 2235 2236 if (block_rsv) 2237 btrfs_add_durable_block_rsv(root->fs_info, block_rsv); 2238 2239 /* grab metadata reservation from transaction handle */ 2240 if (reserve) { 2241 ret = btrfs_orphan_reserve_metadata(trans, inode); 2242 BUG_ON(ret); 2243 } 2244 2245 /* insert an orphan item to track this unlinked/truncated file */ 2246 if (insert >= 1) { 2247 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2248 BUG_ON(ret); 2249 } 2250 2251 /* insert an orphan item to track subvolume contains orphan files */ 2252 if (insert >= 2) { 2253 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, 2254 root->root_key.objectid); 2255 BUG_ON(ret); 2256 } 2257 return 0; 2258 } 2259 2260 /* 2261 * We have done the truncate/delete so we can go ahead and remove the orphan 2262 * item for this particular inode. 2263 */ 2264 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2265 { 2266 struct btrfs_root *root = BTRFS_I(inode)->root; 2267 int delete_item = 0; 2268 int release_rsv = 0; 2269 int ret = 0; 2270 2271 spin_lock(&root->orphan_lock); 2272 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2273 list_del_init(&BTRFS_I(inode)->i_orphan); 2274 delete_item = 1; 2275 } 2276 2277 if (BTRFS_I(inode)->orphan_meta_reserved) { 2278 BTRFS_I(inode)->orphan_meta_reserved = 0; 2279 release_rsv = 1; 2280 } 2281 spin_unlock(&root->orphan_lock); 2282 2283 if (trans && delete_item) { 2284 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2285 BUG_ON(ret); 2286 } 2287 2288 if (release_rsv) 2289 btrfs_orphan_release_metadata(inode); 2290 2291 return 0; 2292 } 2293 2294 /* 2295 * this cleans up any orphans that may be left on the list from the last use 2296 * of this root. 
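 *
 * For reference, the orphan items scanned here are keyed as
 * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, inode number), so
 * the loop below searches back from offset (u64)-1 and stops at the
 * first key whose objectid or type no longer matches.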
2297 */ 2298 int btrfs_orphan_cleanup(struct btrfs_root *root) 2299 { 2300 struct btrfs_path *path; 2301 struct extent_buffer *leaf; 2302 struct btrfs_key key, found_key; 2303 struct btrfs_trans_handle *trans; 2304 struct inode *inode; 2305 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2306 2307 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2308 return 0; 2309 2310 path = btrfs_alloc_path(); 2311 if (!path) { 2312 ret = -ENOMEM; 2313 goto out; 2314 } 2315 path->reada = -1; 2316 2317 key.objectid = BTRFS_ORPHAN_OBJECTID; 2318 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2319 key.offset = (u64)-1; 2320 2321 while (1) { 2322 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2323 if (ret < 0) 2324 goto out; 2325 2326 /* 2327 * if ret == 0 means we found what we were searching for, which 2328 * is weird, but possible, so only screw with path if we didn't 2329 * find the key and see if we have stuff that matches 2330 */ 2331 if (ret > 0) { 2332 ret = 0; 2333 if (path->slots[0] == 0) 2334 break; 2335 path->slots[0]--; 2336 } 2337 2338 /* pull out the item */ 2339 leaf = path->nodes[0]; 2340 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2341 2342 /* make sure the item matches what we want */ 2343 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 2344 break; 2345 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 2346 break; 2347 2348 /* release the path since we're done with it */ 2349 btrfs_release_path(root, path); 2350 2351 /* 2352 * this is where we are basically btrfs_lookup, without the 2353 * crossing root thing. we store the inode number in the 2354 * offset of the orphan item. 2355 */ 2356 found_key.objectid = found_key.offset; 2357 found_key.type = BTRFS_INODE_ITEM_KEY; 2358 found_key.offset = 0; 2359 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2360 if (IS_ERR(inode)) { 2361 ret = PTR_ERR(inode); 2362 goto out; 2363 } 2364 2365 /* 2366 * add this inode to the orphan list so btrfs_orphan_del does 2367 * the proper thing when we hit it 2368 */ 2369 spin_lock(&root->orphan_lock); 2370 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2371 spin_unlock(&root->orphan_lock); 2372 2373 /* 2374 * if this is a bad inode, means we actually succeeded in 2375 * removing the inode, but not the orphan record, which means 2376 * we need to manually delete the orphan since iput will just 2377 * do a destroy_inode 2378 */ 2379 if (is_bad_inode(inode)) { 2380 trans = btrfs_start_transaction(root, 0); 2381 if (IS_ERR(trans)) { 2382 ret = PTR_ERR(trans); 2383 goto out; 2384 } 2385 btrfs_orphan_del(trans, inode); 2386 btrfs_end_transaction(trans, root); 2387 iput(inode); 2388 continue; 2389 } 2390 2391 /* if we have links, this was a truncate, lets do that */ 2392 if (inode->i_nlink) { 2393 if (!S_ISREG(inode->i_mode)) { 2394 WARN_ON(1); 2395 iput(inode); 2396 continue; 2397 } 2398 nr_truncate++; 2399 ret = btrfs_truncate(inode); 2400 } else { 2401 nr_unlink++; 2402 } 2403 2404 /* this will do delete_inode and everything for us */ 2405 iput(inode); 2406 if (ret) 2407 goto out; 2408 } 2409 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2410 2411 if (root->orphan_block_rsv) 2412 btrfs_block_rsv_release(root, root->orphan_block_rsv, 2413 (u64)-1); 2414 2415 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2416 trans = btrfs_join_transaction(root, 1); 2417 if (!IS_ERR(trans)) 2418 btrfs_end_transaction(trans, root); 2419 } 2420 2421 if (nr_unlink) 2422 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2423 if 
(nr_truncate) 2424 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2425 2426 out: 2427 if (ret) 2428 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); 2429 btrfs_free_path(path); 2430 return ret; 2431 } 2432 2433 /* 2434 * very simple check to peek ahead in the leaf looking for xattrs. If we 2435 * don't find any xattrs, we know there can't be any acls. 2436 * 2437 * slot is the slot the inode is in, objectid is the objectid of the inode 2438 */ 2439 static noinline int acls_after_inode_item(struct extent_buffer *leaf, 2440 int slot, u64 objectid) 2441 { 2442 u32 nritems = btrfs_header_nritems(leaf); 2443 struct btrfs_key found_key; 2444 int scanned = 0; 2445 2446 slot++; 2447 while (slot < nritems) { 2448 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2449 2450 /* we found a different objectid, there must not be acls */ 2451 if (found_key.objectid != objectid) 2452 return 0; 2453 2454 /* we found an xattr, assume we've got an acl */ 2455 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 2456 return 1; 2457 2458 /* 2459 * we found a key greater than an xattr key, there can't 2460 * be any acls later on 2461 */ 2462 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 2463 return 0; 2464 2465 slot++; 2466 scanned++; 2467 2468 /* 2469 * it goes inode, inode backrefs, xattrs, extents, 2470 * so if there are a ton of hard links to an inode there can 2471 * be a lot of backrefs. Don't waste time searching too hard, 2472 * this is just an optimization 2473 */ 2474 if (scanned >= 8) 2475 break; 2476 } 2477 /* we hit the end of the leaf before we found an xattr or 2478 * something larger than an xattr. We have to assume the inode 2479 * has acls 2480 */ 2481 return 1; 2482 } 2483 2484 /* 2485 * read an inode from the btree into the in-memory inode 2486 */ 2487 static void btrfs_read_locked_inode(struct inode *inode) 2488 { 2489 struct btrfs_path *path; 2490 struct extent_buffer *leaf; 2491 struct btrfs_inode_item *inode_item; 2492 struct btrfs_timespec *tspec; 2493 struct btrfs_root *root = BTRFS_I(inode)->root; 2494 struct btrfs_key location; 2495 int maybe_acls; 2496 u64 alloc_group_block; 2497 u32 rdev; 2498 int ret; 2499 2500 path = btrfs_alloc_path(); 2501 BUG_ON(!path); 2502 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2503 2504 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2505 if (ret) 2506 goto make_bad; 2507 2508 leaf = path->nodes[0]; 2509 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2510 struct btrfs_inode_item); 2511 2512 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2513 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2514 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2515 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2516 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2517 2518 tspec = btrfs_inode_atime(inode_item); 2519 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2520 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2521 2522 tspec = btrfs_inode_mtime(inode_item); 2523 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2524 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2525 2526 tspec = btrfs_inode_ctime(inode_item); 2527 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 2528 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 2529 2530 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2531 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2532 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, 
inode_item); 2533 inode->i_generation = BTRFS_I(inode)->generation; 2534 inode->i_rdev = 0; 2535 rdev = btrfs_inode_rdev(leaf, inode_item); 2536 2537 BTRFS_I(inode)->index_cnt = (u64)-1; 2538 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2539 2540 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2541 2542 /* 2543 * try to precache a NULL acl entry for files that don't have 2544 * any xattrs or acls 2545 */ 2546 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2547 if (!maybe_acls) 2548 cache_no_acl(inode); 2549 2550 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2551 alloc_group_block, 0); 2552 btrfs_free_path(path); 2553 inode_item = NULL; 2554 2555 switch (inode->i_mode & S_IFMT) { 2556 case S_IFREG: 2557 inode->i_mapping->a_ops = &btrfs_aops; 2558 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2559 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 2560 inode->i_fop = &btrfs_file_operations; 2561 inode->i_op = &btrfs_file_inode_operations; 2562 break; 2563 case S_IFDIR: 2564 inode->i_fop = &btrfs_dir_file_operations; 2565 if (root == root->fs_info->tree_root) 2566 inode->i_op = &btrfs_dir_ro_inode_operations; 2567 else 2568 inode->i_op = &btrfs_dir_inode_operations; 2569 break; 2570 case S_IFLNK: 2571 inode->i_op = &btrfs_symlink_inode_operations; 2572 inode->i_mapping->a_ops = &btrfs_symlink_aops; 2573 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2574 break; 2575 default: 2576 inode->i_op = &btrfs_special_inode_operations; 2577 init_special_inode(inode, inode->i_mode, rdev); 2578 break; 2579 } 2580 2581 btrfs_update_iflags(inode); 2582 return; 2583 2584 make_bad: 2585 btrfs_free_path(path); 2586 make_bad_inode(inode); 2587 } 2588 2589 /* 2590 * given a leaf and an inode, copy the inode fields into the leaf 2591 */ 2592 static void fill_inode_item(struct btrfs_trans_handle *trans, 2593 struct extent_buffer *leaf, 2594 struct btrfs_inode_item *item, 2595 struct inode *inode) 2596 { 2597 if (!leaf->map_token) 2598 map_private_extent_buffer(leaf, (unsigned long)item, 2599 sizeof(struct btrfs_inode_item), 2600 &leaf->map_token, &leaf->kaddr, 2601 &leaf->map_start, &leaf->map_len, 2602 KM_USER1); 2603 2604 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2605 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2606 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2607 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2608 btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 2609 2610 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), 2611 inode->i_atime.tv_sec); 2612 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), 2613 inode->i_atime.tv_nsec); 2614 2615 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), 2616 inode->i_mtime.tv_sec); 2617 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), 2618 inode->i_mtime.tv_nsec); 2619 2620 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), 2621 inode->i_ctime.tv_sec); 2622 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), 2623 inode->i_ctime.tv_nsec); 2624 2625 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2626 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2627 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2628 btrfs_set_inode_transid(leaf, item, trans->transid); 2629 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2630 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2631 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2632 2633 if (leaf->map_token) { 
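/* drop the kmap cached on the leaf for the inode item region */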
2634 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); 2635 leaf->map_token = NULL; 2636 } 2637 } 2638 2639 /* 2640 * copy everything in the in-memory inode into the btree. 2641 */ 2642 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2643 struct btrfs_root *root, struct inode *inode) 2644 { 2645 struct btrfs_inode_item *inode_item; 2646 struct btrfs_path *path; 2647 struct extent_buffer *leaf; 2648 int ret; 2649 2650 path = btrfs_alloc_path(); 2651 BUG_ON(!path); 2652 path->leave_spinning = 1; 2653 ret = btrfs_lookup_inode(trans, root, path, 2654 &BTRFS_I(inode)->location, 1); 2655 if (ret) { 2656 if (ret > 0) 2657 ret = -ENOENT; 2658 goto failed; 2659 } 2660 2661 btrfs_unlock_up_safe(path, 1); 2662 leaf = path->nodes[0]; 2663 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2664 struct btrfs_inode_item); 2665 2666 fill_inode_item(trans, leaf, inode_item, inode); 2667 btrfs_mark_buffer_dirty(leaf); 2668 btrfs_set_inode_last_trans(trans, inode); 2669 ret = 0; 2670 failed: 2671 btrfs_free_path(path); 2672 return ret; 2673 } 2674 2675 2676 /* 2677 * unlink helper that gets used here in inode.c and in the tree logging 2678 * recovery code. It removes a link in a directory with a given name, and 2679 * also drops the back refs in the inode to the directory. 2680 */ 2681 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2682 struct btrfs_root *root, 2683 struct inode *dir, struct inode *inode, 2684 const char *name, int name_len) 2685 { 2686 struct btrfs_path *path; 2687 int ret = 0; 2688 struct extent_buffer *leaf; 2689 struct btrfs_dir_item *di; 2690 struct btrfs_key key; 2691 u64 index; 2692 2693 path = btrfs_alloc_path(); 2694 if (!path) { 2695 ret = -ENOMEM; 2696 goto out; 2697 } 2698 2699 path->leave_spinning = 1; 2700 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2701 name, name_len, -1); 2702 if (IS_ERR(di)) { 2703 ret = PTR_ERR(di); 2704 goto err; 2705 } 2706 if (!di) { 2707 ret = -ENOENT; 2708 goto err; 2709 } 2710 leaf = path->nodes[0]; 2711 btrfs_dir_item_key_to_cpu(leaf, di, &key); 2712 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2713 if (ret) 2714 goto err; 2715 btrfs_release_path(root, path); 2716 2717 ret = btrfs_del_inode_ref(trans, root, name, name_len, 2718 inode->i_ino, 2719 dir->i_ino, &index); 2720 if (ret) { 2721 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2722 "inode %lu parent %lu\n", name_len, name, 2723 inode->i_ino, dir->i_ino); 2724 goto err; 2725 } 2726 2727 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2728 index, name, name_len, -1); 2729 if (IS_ERR(di)) { 2730 ret = PTR_ERR(di); 2731 goto err; 2732 } 2733 if (!di) { 2734 ret = -ENOENT; 2735 goto err; 2736 } 2737 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2738 btrfs_release_path(root, path); 2739 2740 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2741 inode, dir->i_ino); 2742 BUG_ON(ret != 0 && ret != -ENOENT); 2743 2744 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2745 dir, index); 2746 if (ret == -ENOENT) 2747 ret = 0; 2748 err: 2749 btrfs_free_path(path); 2750 if (ret) 2751 goto out; 2752 2753 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2754 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2755 btrfs_update_inode(trans, root, dir); 2756 out: 2757 return ret; 2758 } 2759 2760 int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2761 struct btrfs_root *root, 2762 struct inode *dir, struct inode *inode, 2763 const char *name, int name_len) 2764 {
2765 int ret; 2766 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 2767 if (!ret) { 2768 btrfs_drop_nlink(inode); 2769 ret = btrfs_update_inode(trans, root, inode); 2770 } 2771 return ret; 2772 } 2773 2774 2775 /* helper to check if there is any shared block in the path */ 2776 static int check_path_shared(struct btrfs_root *root, 2777 struct btrfs_path *path) 2778 { 2779 struct extent_buffer *eb; 2780 int level; 2781 u64 refs = 1; 2782 2783 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2784 int ret; 2785 2786 if (!path->nodes[level]) 2787 break; 2788 eb = path->nodes[level]; 2789 if (!btrfs_block_can_be_shared(root, eb)) 2790 continue; 2791 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, 2792 &refs, NULL); 2793 if (refs > 1) 2794 return 1; 2795 } 2796 return 0; 2797 } 2798 2799 /* 2800 * helper to start transaction for unlink and rmdir. 2801 * 2802 * unlink and rmdir are special in btrfs, they do not always free space. 2803 * so in enospc case, we should make sure they will free space before 2804 * allowing them to use the global metadata reservation. 2805 */ 2806 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 2807 struct dentry *dentry) 2808 { 2809 struct btrfs_trans_handle *trans; 2810 struct btrfs_root *root = BTRFS_I(dir)->root; 2811 struct btrfs_path *path; 2812 struct btrfs_inode_ref *ref; 2813 struct btrfs_dir_item *di; 2814 struct inode *inode = dentry->d_inode; 2815 u64 index; 2816 int check_link = 1; 2817 int err = -ENOSPC; 2818 int ret; 2819 2820 trans = btrfs_start_transaction(root, 10); 2821 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2822 return trans; 2823 2824 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2825 return ERR_PTR(-ENOSPC); 2826 2827 /* check if there is someone else holds reference */ 2828 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) 2829 return ERR_PTR(-ENOSPC); 2830 2831 if (atomic_read(&inode->i_count) > 2) 2832 return ERR_PTR(-ENOSPC); 2833 2834 if (xchg(&root->fs_info->enospc_unlink, 1)) 2835 return ERR_PTR(-ENOSPC); 2836 2837 path = btrfs_alloc_path(); 2838 if (!path) { 2839 root->fs_info->enospc_unlink = 0; 2840 return ERR_PTR(-ENOMEM); 2841 } 2842 2843 trans = btrfs_start_transaction(root, 0); 2844 if (IS_ERR(trans)) { 2845 btrfs_free_path(path); 2846 root->fs_info->enospc_unlink = 0; 2847 return trans; 2848 } 2849 2850 path->skip_locking = 1; 2851 path->search_commit_root = 1; 2852 2853 ret = btrfs_lookup_inode(trans, root, path, 2854 &BTRFS_I(dir)->location, 0); 2855 if (ret < 0) { 2856 err = ret; 2857 goto out; 2858 } 2859 if (ret == 0) { 2860 if (check_path_shared(root, path)) 2861 goto out; 2862 } else { 2863 check_link = 0; 2864 } 2865 btrfs_release_path(root, path); 2866 2867 ret = btrfs_lookup_inode(trans, root, path, 2868 &BTRFS_I(inode)->location, 0); 2869 if (ret < 0) { 2870 err = ret; 2871 goto out; 2872 } 2873 if (ret == 0) { 2874 if (check_path_shared(root, path)) 2875 goto out; 2876 } else { 2877 check_link = 0; 2878 } 2879 btrfs_release_path(root, path); 2880 2881 if (ret == 0 && S_ISREG(inode->i_mode)) { 2882 ret = btrfs_lookup_file_extent(trans, root, path, 2883 inode->i_ino, (u64)-1, 0); 2884 if (ret < 0) { 2885 err = ret; 2886 goto out; 2887 } 2888 BUG_ON(ret == 0); 2889 if (check_path_shared(root, path)) 2890 goto out; 2891 btrfs_release_path(root, path); 2892 } 2893 2894 if (!check_link) { 2895 err = 0; 2896 goto out; 2897 } 2898 2899 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2900 dentry->d_name.name, 
dentry->d_name.len, 0); 2901 if (IS_ERR(di)) { 2902 err = PTR_ERR(di); 2903 goto out; 2904 } 2905 if (di) { 2906 if (check_path_shared(root, path)) 2907 goto out; 2908 } else { 2909 err = 0; 2910 goto out; 2911 } 2912 btrfs_release_path(root, path); 2913 2914 ref = btrfs_lookup_inode_ref(trans, root, path, 2915 dentry->d_name.name, dentry->d_name.len, 2916 inode->i_ino, dir->i_ino, 0); 2917 if (IS_ERR(ref)) { 2918 err = PTR_ERR(ref); 2919 goto out; 2920 } 2921 BUG_ON(!ref); 2922 if (check_path_shared(root, path)) 2923 goto out; 2924 index = btrfs_inode_ref_index(path->nodes[0], ref); 2925 btrfs_release_path(root, path); 2926 2927 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, 2928 dentry->d_name.name, dentry->d_name.len, 0); 2929 if (IS_ERR(di)) { 2930 err = PTR_ERR(di); 2931 goto out; 2932 } 2933 BUG_ON(ret == -ENOENT); 2934 if (check_path_shared(root, path)) 2935 goto out; 2936 2937 err = 0; 2938 out: 2939 btrfs_free_path(path); 2940 if (err) { 2941 btrfs_end_transaction(trans, root); 2942 root->fs_info->enospc_unlink = 0; 2943 return ERR_PTR(err); 2944 } 2945 2946 trans->block_rsv = &root->fs_info->global_block_rsv; 2947 return trans; 2948 } 2949 2950 static void __unlink_end_trans(struct btrfs_trans_handle *trans, 2951 struct btrfs_root *root) 2952 { 2953 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2954 BUG_ON(!root->fs_info->enospc_unlink); 2955 root->fs_info->enospc_unlink = 0; 2956 } 2957 btrfs_end_transaction_throttle(trans, root); 2958 } 2959 2960 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2961 { 2962 struct btrfs_root *root = BTRFS_I(dir)->root; 2963 struct btrfs_trans_handle *trans; 2964 struct inode *inode = dentry->d_inode; 2965 int ret; 2966 unsigned long nr = 0; 2967 2968 trans = __unlink_start_trans(dir, dentry); 2969 if (IS_ERR(trans)) 2970 return PTR_ERR(trans); 2971 2972 btrfs_set_trans_block_group(trans, dir); 2973 2974 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 2975 2976 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2977 dentry->d_name.name, dentry->d_name.len); 2978 BUG_ON(ret); 2979 2980 if (inode->i_nlink == 0) { 2981 ret = btrfs_orphan_add(trans, inode); 2982 BUG_ON(ret); 2983 } 2984 2985 nr = trans->blocks_used; 2986 __unlink_end_trans(trans, root); 2987 btrfs_btree_balance_dirty(root, nr); 2988 return ret; 2989 } 2990 2991 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 2992 struct btrfs_root *root, 2993 struct inode *dir, u64 objectid, 2994 const char *name, int name_len) 2995 { 2996 struct btrfs_path *path; 2997 struct extent_buffer *leaf; 2998 struct btrfs_dir_item *di; 2999 struct btrfs_key key; 3000 u64 index; 3001 int ret; 3002 3003 path = btrfs_alloc_path(); 3004 if (!path) 3005 return -ENOMEM; 3006 3007 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 3008 name, name_len, -1); 3009 BUG_ON(!di || IS_ERR(di)); 3010 3011 leaf = path->nodes[0]; 3012 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3013 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3014 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3015 BUG_ON(ret); 3016 btrfs_release_path(root, path); 3017 3018 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3019 objectid, root->root_key.objectid, 3020 dir->i_ino, &index, name, name_len); 3021 if (ret < 0) { 3022 BUG_ON(ret != -ENOENT); 3023 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 3024 name, name_len); 3025 BUG_ON(!di || IS_ERR(di)); 3026 3027 leaf = path->nodes[0]; 3028 
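/*
 * no root ref carried the index, so recover it from the dir index
 * item itself: the offset of that item's key is the directory index
 */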
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3029 btrfs_release_path(root, path); 3030 index = key.offset; 3031 } 3032 3033 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 3034 index, name, name_len, -1); 3035 BUG_ON(!di || IS_ERR(di)); 3036 3037 leaf = path->nodes[0]; 3038 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3039 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3040 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3041 BUG_ON(ret); 3042 btrfs_release_path(root, path); 3043 3044 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3045 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3046 ret = btrfs_update_inode(trans, root, dir); 3047 BUG_ON(ret); 3048 3049 btrfs_free_path(path); 3050 return 0; 3051 } 3052 3053 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 3054 { 3055 struct inode *inode = dentry->d_inode; 3056 int err = 0; 3057 struct btrfs_root *root = BTRFS_I(dir)->root; 3058 struct btrfs_trans_handle *trans; 3059 unsigned long nr = 0; 3060 3061 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3062 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 3063 return -ENOTEMPTY; 3064 3065 trans = __unlink_start_trans(dir, dentry); 3066 if (IS_ERR(trans)) 3067 return PTR_ERR(trans); 3068 3069 btrfs_set_trans_block_group(trans, dir); 3070 3071 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3072 err = btrfs_unlink_subvol(trans, root, dir, 3073 BTRFS_I(inode)->location.objectid, 3074 dentry->d_name.name, 3075 dentry->d_name.len); 3076 goto out; 3077 } 3078 3079 err = btrfs_orphan_add(trans, inode); 3080 if (err) 3081 goto out; 3082 3083 /* now the directory is empty */ 3084 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3085 dentry->d_name.name, dentry->d_name.len); 3086 if (!err) 3087 btrfs_i_size_write(inode, 0); 3088 out: 3089 nr = trans->blocks_used; 3090 __unlink_end_trans(trans, root); 3091 btrfs_btree_balance_dirty(root, nr); 3092 3093 return err; 3094 } 3095 3096 #if 0 3097 /* 3098 * when truncating bytes in a file, it is possible to avoid reading 3099 * the leaves that contain only checksum items. This can be the 3100 * majority of the IO required to delete a large file, but it must 3101 * be done carefully. 3102 * 3103 * The keys in the level just above the leaves are checked to make sure 3104 * the lowest key in a given leaf is a csum key, and starts at an offset 3105 * after the new size. 3106 * 3107 * Then the key for the next leaf is checked to make sure it also has 3108 * a checksum item for the same file. If it does, we know our target leaf 3109 * contains only checksum items, and it can be safely freed without reading 3110 * it. 3111 * 3112 * This is just an optimization targeted at large files. It may do 3113 * nothing. It will return 0 unless things went badly. 
3114 */ 3115 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, 3116 struct btrfs_root *root, 3117 struct btrfs_path *path, 3118 struct inode *inode, u64 new_size) 3119 { 3120 struct btrfs_key key; 3121 int ret; 3122 int nritems; 3123 struct btrfs_key found_key; 3124 struct btrfs_key other_key; 3125 struct btrfs_leaf_ref *ref; 3126 u64 leaf_gen; 3127 u64 leaf_start; 3128 3129 path->lowest_level = 1; 3130 key.objectid = inode->i_ino; 3131 key.type = BTRFS_CSUM_ITEM_KEY; 3132 key.offset = new_size; 3133 again: 3134 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3135 if (ret < 0) 3136 goto out; 3137 3138 if (path->nodes[1] == NULL) { 3139 ret = 0; 3140 goto out; 3141 } 3142 ret = 0; 3143 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); 3144 nritems = btrfs_header_nritems(path->nodes[1]); 3145 3146 if (!nritems) 3147 goto out; 3148 3149 if (path->slots[1] >= nritems) 3150 goto next_node; 3151 3152 /* did we find a key greater than anything we want to delete? */ 3153 if (found_key.objectid > inode->i_ino || 3154 (found_key.objectid == inode->i_ino && found_key.type > key.type)) 3155 goto out; 3156 3157 /* we check the next key in the node to make sure the leaf contains 3158 * only checksum items. This comparison doesn't work if our 3159 * leaf is the last one in the node 3160 */ 3161 if (path->slots[1] + 1 >= nritems) { 3162 next_node: 3163 /* search forward from the last key in the node, this 3164 * will bring us into the next node in the tree 3165 */ 3166 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); 3167 3168 /* unlikely, but we inc below, so check to be safe */ 3169 if (found_key.offset == (u64)-1) 3170 goto out; 3171 3172 /* search_forward needs a path with locks held, do the 3173 * search again for the original key. It is possible 3174 * this will race with a balance and return a path that 3175 * we could modify, but this drop is just an optimization 3176 * and is allowed to miss some leaves. 3177 */ 3178 btrfs_release_path(root, path); 3179 found_key.offset++; 3180 3181 /* setup a max key for search_forward */ 3182 other_key.offset = (u64)-1; 3183 other_key.type = key.type; 3184 other_key.objectid = key.objectid; 3185 3186 path->keep_locks = 1; 3187 ret = btrfs_search_forward(root, &found_key, &other_key, 3188 path, 0, 0); 3189 path->keep_locks = 0; 3190 if (ret || found_key.objectid != key.objectid || 3191 found_key.type != key.type) { 3192 ret = 0; 3193 goto out; 3194 } 3195 3196 key.offset = found_key.offset; 3197 btrfs_release_path(root, path); 3198 cond_resched(); 3199 goto again; 3200 } 3201 3202 /* we know there's one more slot after us in the tree, 3203 * read that key so we can verify it is also a checksum item 3204 */ 3205 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); 3206 3207 if (found_key.objectid < inode->i_ino) 3208 goto next_key; 3209 3210 if (found_key.type != key.type || found_key.offset < new_size) 3211 goto next_key; 3212 3213 /* 3214 * if the key for the next leaf isn't a csum key from this objectid, 3215 * we can't be sure there aren't good items inside this leaf.
3216 * Bail out 3217 */ 3218 if (other_key.objectid != inode->i_ino || other_key.type != key.type) 3219 goto out; 3220 3221 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); 3222 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); 3223 /* 3224 * it is safe to delete this leaf, it contains only 3225 * csum items from this inode at an offset >= new_size 3226 */ 3227 ret = btrfs_del_leaf(trans, root, path, leaf_start); 3228 BUG_ON(ret); 3229 3230 if (root->ref_cows && leaf_gen < trans->transid) { 3231 ref = btrfs_alloc_leaf_ref(root, 0); 3232 if (ref) { 3233 ref->root_gen = root->root_key.offset; 3234 ref->bytenr = leaf_start; 3235 ref->owner = 0; 3236 ref->generation = leaf_gen; 3237 ref->nritems = 0; 3238 3239 btrfs_sort_leaf_ref(ref); 3240 3241 ret = btrfs_add_leaf_ref(root, ref, 0); 3242 WARN_ON(ret); 3243 btrfs_free_leaf_ref(root, ref); 3244 } else { 3245 WARN_ON(1); 3246 } 3247 } 3248 next_key: 3249 btrfs_release_path(root, path); 3250 3251 if (other_key.objectid == inode->i_ino && 3252 other_key.type == key.type && other_key.offset > key.offset) { 3253 key.offset = other_key.offset; 3254 cond_resched(); 3255 goto again; 3256 } 3257 ret = 0; 3258 out: 3259 /* fixup any changes we've made to the path */ 3260 path->lowest_level = 0; 3261 path->keep_locks = 0; 3262 btrfs_release_path(root, path); 3263 return ret; 3264 } 3265 3266 #endif 3267 3268 /* 3269 * this can truncate away extent items, csum items and directory items. 3270 * It starts at a high offset and removes keys until it can't find 3271 * any higher than new_size 3272 * 3273 * csum items that cross the new i_size are truncated to the new size 3274 * as well. 3275 * 3276 * min_type is the minimum key type to truncate down to. If set to 0, this 3277 * will kill all the items on this inode, including the INODE_ITEM_KEY. 
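 *
 * As a rough sketch (pseudocode, not the authoritative flow), the
 * loop below amounts to:
 *
 *	search for (ino, (u8)-1, (u64)-1) and step back one slot
 *	while the key matches ino and its type >= min_type:
 *		decide del_item by comparing new_size with the item's range
 *		trim or drop EXTENT_DATA items, freeing their disk extents
 *		batch contiguous deletions into a single btrfs_del_items()
 *		move to the previous slot, re-searching when the batch ends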
3278 */ 3279 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 3280 struct btrfs_root *root, 3281 struct inode *inode, 3282 u64 new_size, u32 min_type) 3283 { 3284 struct btrfs_path *path; 3285 struct extent_buffer *leaf; 3286 struct btrfs_file_extent_item *fi; 3287 struct btrfs_key key; 3288 struct btrfs_key found_key; 3289 u64 extent_start = 0; 3290 u64 extent_num_bytes = 0; 3291 u64 extent_offset = 0; 3292 u64 item_end = 0; 3293 u64 mask = root->sectorsize - 1; 3294 u32 found_type = (u8)-1; 3295 int found_extent; 3296 int del_item; 3297 int pending_del_nr = 0; 3298 int pending_del_slot = 0; 3299 int extent_type = -1; 3300 int encoding; 3301 int ret; 3302 int err = 0; 3303 3304 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3305 3306 if (root->ref_cows || root == root->fs_info->tree_root) 3307 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3308 3309 path = btrfs_alloc_path(); 3310 BUG_ON(!path); 3311 path->reada = -1; 3312 3313 key.objectid = inode->i_ino; 3314 key.offset = (u64)-1; 3315 key.type = (u8)-1; 3316 3317 search_again: 3318 path->leave_spinning = 1; 3319 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3320 if (ret < 0) { 3321 err = ret; 3322 goto out; 3323 } 3324 3325 if (ret > 0) { 3326 /* there are no items in the tree for us to truncate, we're 3327 * done 3328 */ 3329 if (path->slots[0] == 0) 3330 goto out; 3331 path->slots[0]--; 3332 } 3333 3334 while (1) { 3335 fi = NULL; 3336 leaf = path->nodes[0]; 3337 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3338 found_type = btrfs_key_type(&found_key); 3339 encoding = 0; 3340 3341 if (found_key.objectid != inode->i_ino) 3342 break; 3343 3344 if (found_type < min_type) 3345 break; 3346 3347 item_end = found_key.offset; 3348 if (found_type == BTRFS_EXTENT_DATA_KEY) { 3349 fi = btrfs_item_ptr(leaf, path->slots[0], 3350 struct btrfs_file_extent_item); 3351 extent_type = btrfs_file_extent_type(leaf, fi); 3352 encoding = btrfs_file_extent_compression(leaf, fi); 3353 encoding |= btrfs_file_extent_encryption(leaf, fi); 3354 encoding |= btrfs_file_extent_other_encoding(leaf, fi); 3355 3356 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3357 item_end += 3358 btrfs_file_extent_num_bytes(leaf, fi); 3359 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3360 item_end += btrfs_file_extent_inline_len(leaf, 3361 fi); 3362 } 3363 item_end--; 3364 } 3365 if (found_type > min_type) { 3366 del_item = 1; 3367 } else { 3368 if (item_end < new_size) 3369 break; 3370 if (found_key.offset >= new_size) 3371 del_item = 1; 3372 else 3373 del_item = 0; 3374 } 3375 found_extent = 0; 3376 /* FIXME, shrink the extent if the ref count is only 1 */ 3377 if (found_type != BTRFS_EXTENT_DATA_KEY) 3378 goto delete; 3379 3380 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3381 u64 num_dec; 3382 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3383 if (!del_item && !encoding) { 3384 u64 orig_num_bytes = 3385 btrfs_file_extent_num_bytes(leaf, fi); 3386 extent_num_bytes = new_size - 3387 found_key.offset + root->sectorsize - 1; 3388 extent_num_bytes = extent_num_bytes & 3389 ~((u64)root->sectorsize - 1); 3390 btrfs_set_file_extent_num_bytes(leaf, fi, 3391 extent_num_bytes); 3392 num_dec = (orig_num_bytes - 3393 extent_num_bytes); 3394 if (root->ref_cows && extent_start != 0) 3395 inode_sub_bytes(inode, num_dec); 3396 btrfs_mark_buffer_dirty(leaf); 3397 } else { 3398 extent_num_bytes = 3399 btrfs_file_extent_disk_num_bytes(leaf, 3400 fi); 3401 extent_offset = found_key.offset - 3402 
btrfs_file_extent_offset(leaf, fi); 3403 3404 /* FIXME blocksize != 4096 */ 3405 num_dec = btrfs_file_extent_num_bytes(leaf, fi); 3406 if (extent_start != 0) { 3407 found_extent = 1; 3408 if (root->ref_cows) 3409 inode_sub_bytes(inode, num_dec); 3410 } 3411 } 3412 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 3413 /* 3414 * we can't truncate inline items that have had 3415 * special encodings 3416 */ 3417 if (!del_item && 3418 btrfs_file_extent_compression(leaf, fi) == 0 && 3419 btrfs_file_extent_encryption(leaf, fi) == 0 && 3420 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 3421 u32 size = new_size - found_key.offset; 3422 3423 if (root->ref_cows) { 3424 inode_sub_bytes(inode, item_end + 1 - 3425 new_size); 3426 } 3427 size = 3428 btrfs_file_extent_calc_inline_size(size); 3429 ret = btrfs_truncate_item(trans, root, path, 3430 size, 1); 3431 BUG_ON(ret); 3432 } else if (root->ref_cows) { 3433 inode_sub_bytes(inode, item_end + 1 - 3434 found_key.offset); 3435 } 3436 } 3437 delete: 3438 if (del_item) { 3439 if (!pending_del_nr) { 3440 /* no pending yet, add ourselves */ 3441 pending_del_slot = path->slots[0]; 3442 pending_del_nr = 1; 3443 } else if (pending_del_nr && 3444 path->slots[0] + 1 == pending_del_slot) { 3445 /* hop on the pending chunk */ 3446 pending_del_nr++; 3447 pending_del_slot = path->slots[0]; 3448 } else { 3449 BUG(); 3450 } 3451 } else { 3452 break; 3453 } 3454 if (found_extent && (root->ref_cows || 3455 root == root->fs_info->tree_root)) { 3456 btrfs_set_path_blocking(path); 3457 ret = btrfs_free_extent(trans, root, extent_start, 3458 extent_num_bytes, 0, 3459 btrfs_header_owner(leaf), 3460 inode->i_ino, extent_offset); 3461 BUG_ON(ret); 3462 } 3463 3464 if (found_type == BTRFS_INODE_ITEM_KEY) 3465 break; 3466 3467 if (path->slots[0] == 0 || 3468 path->slots[0] != pending_del_slot) { 3469 if (root->ref_cows) { 3470 err = -EAGAIN; 3471 goto out; 3472 } 3473 if (pending_del_nr) { 3474 ret = btrfs_del_items(trans, root, path, 3475 pending_del_slot, 3476 pending_del_nr); 3477 BUG_ON(ret); 3478 pending_del_nr = 0; 3479 } 3480 btrfs_release_path(root, path); 3481 goto search_again; 3482 } else { 3483 path->slots[0]--; 3484 } 3485 } 3486 out: 3487 if (pending_del_nr) { 3488 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3489 pending_del_nr); 3490 BUG_ON(ret); 3491 } 3492 btrfs_free_path(path); 3493 return err; 3494 } 3495 3496 /* 3497 * taken from block_truncate_page, but does cow as it zeros out 3498 * any bytes left in the last page in the file. 
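 *
 * Example with illustrative numbers: truncating to from == 3000 with
 * 4K pages leaves offset == 3000, so bytes 3000..4095 of the last
 * page are zeroed below after the range is marked delalloc, letting
 * the partial block go through the normal COW write path.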
3499 */ 3500 static int btrfs_truncate_page(struct address_space *mapping, loff_t from) 3501 { 3502 struct inode *inode = mapping->host; 3503 struct btrfs_root *root = BTRFS_I(inode)->root; 3504 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3505 struct btrfs_ordered_extent *ordered; 3506 struct extent_state *cached_state = NULL; 3507 char *kaddr; 3508 u32 blocksize = root->sectorsize; 3509 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3510 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3511 struct page *page; 3512 int ret = 0; 3513 u64 page_start; 3514 u64 page_end; 3515 3516 if ((offset & (blocksize - 1)) == 0) 3517 goto out; 3518 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 3519 if (ret) 3520 goto out; 3521 3522 ret = -ENOMEM; 3523 again: 3524 page = grab_cache_page(mapping, index); 3525 if (!page) { 3526 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3527 goto out; 3528 } 3529 3530 page_start = page_offset(page); 3531 page_end = page_start + PAGE_CACHE_SIZE - 1; 3532 3533 if (!PageUptodate(page)) { 3534 ret = btrfs_readpage(NULL, page); 3535 lock_page(page); 3536 if (page->mapping != mapping) { 3537 unlock_page(page); 3538 page_cache_release(page); 3539 goto again; 3540 } 3541 if (!PageUptodate(page)) { 3542 ret = -EIO; 3543 goto out_unlock; 3544 } 3545 } 3546 wait_on_page_writeback(page); 3547 3548 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, 3549 GFP_NOFS); 3550 set_page_extent_mapped(page); 3551 3552 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3553 if (ordered) { 3554 unlock_extent_cached(io_tree, page_start, page_end, 3555 &cached_state, GFP_NOFS); 3556 unlock_page(page); 3557 page_cache_release(page); 3558 btrfs_start_ordered_extent(inode, ordered, 1); 3559 btrfs_put_ordered_extent(ordered); 3560 goto again; 3561 } 3562 3563 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 3564 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3565 0, 0, &cached_state, GFP_NOFS); 3566 3567 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 3568 &cached_state); 3569 if (ret) { 3570 unlock_extent_cached(io_tree, page_start, page_end, 3571 &cached_state, GFP_NOFS); 3572 goto out_unlock; 3573 } 3574 3575 ret = 0; 3576 if (offset != PAGE_CACHE_SIZE) { 3577 kaddr = kmap(page); 3578 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 3579 flush_dcache_page(page); 3580 kunmap(page); 3581 } 3582 ClearPageChecked(page); 3583 set_page_dirty(page); 3584 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, 3585 GFP_NOFS); 3586 3587 out_unlock: 3588 if (ret) 3589 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3590 unlock_page(page); 3591 page_cache_release(page); 3592 out: 3593 return ret; 3594 } 3595 3596 /* 3597 * This function puts in dummy file extents for the area we're creating a hole 3598 * for. 
So if we are truncating this file to a larger size, we need to insert 3599 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for 3600 * the range between oldsize and size 3601 */ 3602 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 3603 { 3604 struct btrfs_trans_handle *trans; 3605 struct btrfs_root *root = BTRFS_I(inode)->root; 3606 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3607 struct extent_map *em = NULL; 3608 struct extent_state *cached_state = NULL; 3609 u64 mask = root->sectorsize - 1; 3610 u64 hole_start = (oldsize + mask) & ~mask; 3611 u64 block_end = (size + mask) & ~mask; 3612 u64 last_byte; 3613 u64 cur_offset; 3614 u64 hole_size; 3615 int err = 0; 3616 3617 if (size <= hole_start) 3618 return 0; 3619 3620 while (1) { 3621 struct btrfs_ordered_extent *ordered; 3622 btrfs_wait_ordered_range(inode, hole_start, 3623 block_end - hole_start); 3624 lock_extent_bits(io_tree, hole_start, block_end - 1, 0, 3625 &cached_state, GFP_NOFS); 3626 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3627 if (!ordered) 3628 break; 3629 unlock_extent_cached(io_tree, hole_start, block_end - 1, 3630 &cached_state, GFP_NOFS); 3631 btrfs_put_ordered_extent(ordered); 3632 } 3633 3634 cur_offset = hole_start; 3635 while (1) { 3636 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3637 block_end - cur_offset, 0); 3638 BUG_ON(IS_ERR(em) || !em); 3639 last_byte = min(extent_map_end(em), block_end); 3640 last_byte = (last_byte + mask) & ~mask; 3641 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3642 u64 hint_byte = 0; 3643 hole_size = last_byte - cur_offset; 3644 3645 trans = btrfs_start_transaction(root, 2); 3646 if (IS_ERR(trans)) { 3647 err = PTR_ERR(trans); 3648 break; 3649 } 3650 btrfs_set_trans_block_group(trans, inode); 3651 3652 err = btrfs_drop_extents(trans, inode, cur_offset, 3653 cur_offset + hole_size, 3654 &hint_byte, 1); 3655 if (err) 3656 break; 3657 3658 err = btrfs_insert_file_extent(trans, root, 3659 inode->i_ino, cur_offset, 0, 3660 0, hole_size, 0, hole_size, 3661 0, 0, 0); 3662 if (err) 3663 break; 3664 3665 btrfs_drop_extent_cache(inode, hole_start, 3666 last_byte - 1, 0); 3667 3668 btrfs_end_transaction(trans, root); 3669 } 3670 free_extent_map(em); 3671 em = NULL; 3672 cur_offset = last_byte; 3673 if (cur_offset >= block_end) 3674 break; 3675 } 3676 3677 free_extent_map(em); 3678 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3679 GFP_NOFS); 3680 return err; 3681 } 3682 3683 static int btrfs_setsize(struct inode *inode, loff_t newsize) 3684 { 3685 loff_t oldsize = i_size_read(inode); 3686 int ret; 3687 3688 if (newsize == oldsize) 3689 return 0; 3690 3691 if (newsize > oldsize) { 3692 i_size_write(inode, newsize); 3693 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3694 truncate_pagecache(inode, oldsize, newsize); 3695 ret = btrfs_cont_expand(inode, oldsize, newsize); 3696 if (ret) { 3697 btrfs_setsize(inode, oldsize); 3698 return ret; 3699 } 3700 3701 mark_inode_dirty(inode); 3702 } else { 3703 3704 /* 3705 * We're truncating a file that used to have good data down to 3706 * zero. Make sure it gets into the ordered flush list so that 3707 * any new writes get down to disk quickly.
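 *
 * (A side note, not in the original comment: the ordered_data_close
 * flag set below is consumed when the file is released, where it is
 * used to kick off writeback early rather than waiting for the next
 * transaction commit.)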
3708 */ 3709 if (newsize == 0) 3710 BTRFS_I(inode)->ordered_data_close = 1; 3711 3712 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3713 truncate_setsize(inode, newsize); 3714 ret = btrfs_truncate(inode); 3715 } 3716 3717 return ret; 3718 } 3719 3720 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3721 { 3722 struct inode *inode = dentry->d_inode; 3723 struct btrfs_root *root = BTRFS_I(inode)->root; 3724 int err; 3725 3726 if (btrfs_root_readonly(root)) 3727 return -EROFS; 3728 3729 err = inode_change_ok(inode, attr); 3730 if (err) 3731 return err; 3732 3733 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3734 err = btrfs_setsize(inode, attr->ia_size); 3735 if (err) 3736 return err; 3737 } 3738 3739 if (attr->ia_valid) { 3740 setattr_copy(inode, attr); 3741 mark_inode_dirty(inode); 3742 3743 if (attr->ia_valid & ATTR_MODE) 3744 err = btrfs_acl_chmod(inode); 3745 } 3746 3747 return err; 3748 } 3749 3750 void btrfs_evict_inode(struct inode *inode) 3751 { 3752 struct btrfs_trans_handle *trans; 3753 struct btrfs_root *root = BTRFS_I(inode)->root; 3754 unsigned long nr; 3755 int ret; 3756 3757 trace_btrfs_inode_evict(inode); 3758 3759 truncate_inode_pages(&inode->i_data, 0); 3760 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3761 root == root->fs_info->tree_root)) 3762 goto no_delete; 3763 3764 if (is_bad_inode(inode)) { 3765 btrfs_orphan_del(NULL, inode); 3766 goto no_delete; 3767 } 3768 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 3769 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3770 3771 if (root->fs_info->log_root_recovering) { 3772 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3773 goto no_delete; 3774 } 3775 3776 if (inode->i_nlink > 0) { 3777 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3778 goto no_delete; 3779 } 3780 3781 btrfs_i_size_write(inode, 0); 3782 3783 while (1) { 3784 trans = btrfs_start_transaction(root, 0); 3785 BUG_ON(IS_ERR(trans)); 3786 btrfs_set_trans_block_group(trans, inode); 3787 trans->block_rsv = root->orphan_block_rsv; 3788 3789 ret = btrfs_block_rsv_check(trans, root, 3790 root->orphan_block_rsv, 0, 5); 3791 if (ret) { 3792 BUG_ON(ret != -EAGAIN); 3793 ret = btrfs_commit_transaction(trans, root); 3794 BUG_ON(ret); 3795 continue; 3796 } 3797 3798 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3799 if (ret != -EAGAIN) 3800 break; 3801 3802 nr = trans->blocks_used; 3803 btrfs_end_transaction(trans, root); 3804 trans = NULL; 3805 btrfs_btree_balance_dirty(root, nr); 3806 3807 } 3808 3809 if (ret == 0) { 3810 ret = btrfs_orphan_del(trans, inode); 3811 BUG_ON(ret); 3812 } 3813 3814 nr = trans->blocks_used; 3815 btrfs_end_transaction(trans, root); 3816 btrfs_btree_balance_dirty(root, nr); 3817 no_delete: 3818 end_writeback(inode); 3819 return; 3820 } 3821 3822 /* 3823 * this returns the key found in the dir entry in the location pointer. 3824 * If no dir entries were found, location->objectid is 0. 
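 *
 * For reference, the entry consulted is the DIR_ITEM keyed by
 * (dir->i_ino, BTRFS_DIR_ITEM_KEY, hash of the name); the location it
 * stores is an INODE_ITEM key for a plain file or directory, or a
 * ROOT_ITEM key when the name refers to a subvolume root.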
3825 */ 3826 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, 3827 struct btrfs_key *location) 3828 { 3829 const char *name = dentry->d_name.name; 3830 int namelen = dentry->d_name.len; 3831 struct btrfs_dir_item *di; 3832 struct btrfs_path *path; 3833 struct btrfs_root *root = BTRFS_I(dir)->root; 3834 int ret = 0; 3835 3836 path = btrfs_alloc_path(); 3837 BUG_ON(!path); 3838 3839 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3840 namelen, 0); 3841 if (IS_ERR(di)) 3842 ret = PTR_ERR(di); 3843 3844 if (!di || IS_ERR(di)) 3845 goto out_err; 3846 3847 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3848 out: 3849 btrfs_free_path(path); 3850 return ret; 3851 out_err: 3852 location->objectid = 0; 3853 goto out; 3854 } 3855 3856 /* 3857 * when we hit a tree root in a directory, the btrfs part of the inode 3858 * needs to be changed to reflect the root directory of the tree root. This 3859 * is kind of like crossing a mount point. 3860 */ 3861 static int fixup_tree_root_location(struct btrfs_root *root, 3862 struct inode *dir, 3863 struct dentry *dentry, 3864 struct btrfs_key *location, 3865 struct btrfs_root **sub_root) 3866 { 3867 struct btrfs_path *path; 3868 struct btrfs_root *new_root; 3869 struct btrfs_root_ref *ref; 3870 struct extent_buffer *leaf; 3871 int ret; 3872 int err = 0; 3873 3874 path = btrfs_alloc_path(); 3875 if (!path) { 3876 err = -ENOMEM; 3877 goto out; 3878 } 3879 3880 err = -ENOENT; 3881 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 3882 BTRFS_I(dir)->root->root_key.objectid, 3883 location->objectid); 3884 if (ret) { 3885 if (ret < 0) 3886 err = ret; 3887 goto out; 3888 } 3889 3890 leaf = path->nodes[0]; 3891 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3892 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3893 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3894 goto out; 3895 3896 ret = memcmp_extent_buffer(leaf, dentry->d_name.name, 3897 (unsigned long)(ref + 1), 3898 dentry->d_name.len); 3899 if (ret) 3900 goto out; 3901 3902 btrfs_release_path(root->fs_info->tree_root, path); 3903 3904 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3905 if (IS_ERR(new_root)) { 3906 err = PTR_ERR(new_root); 3907 goto out; 3908 } 3909 3910 if (btrfs_root_refs(&new_root->root_item) == 0) { 3911 err = -ENOENT; 3912 goto out; 3913 } 3914 3915 *sub_root = new_root; 3916 location->objectid = btrfs_root_dirid(&new_root->root_item); 3917 location->type = BTRFS_INODE_ITEM_KEY; 3918 location->offset = 0; 3919 err = 0; 3920 out: 3921 btrfs_free_path(path); 3922 return err; 3923 } 3924 3925 static void inode_tree_add(struct inode *inode) 3926 { 3927 struct btrfs_root *root = BTRFS_I(inode)->root; 3928 struct btrfs_inode *entry; 3929 struct rb_node **p; 3930 struct rb_node *parent; 3931 again: 3932 p = &root->inode_tree.rb_node; 3933 parent = NULL; 3934 3935 if (inode_unhashed(inode)) 3936 return; 3937 3938 spin_lock(&root->inode_lock); 3939 while (*p) { 3940 parent = *p; 3941 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3942 3943 if (inode->i_ino < entry->vfs_inode.i_ino) 3944 p = &parent->rb_left; 3945 else if (inode->i_ino > entry->vfs_inode.i_ino) 3946 p = &parent->rb_right; 3947 else { 3948 WARN_ON(!(entry->vfs_inode.i_state & 3949 (I_WILL_FREE | I_FREEING))); 3950 rb_erase(parent, &root->inode_tree); 3951 RB_CLEAR_NODE(parent); 3952 spin_unlock(&root->inode_lock); 3953 goto again; 3954 } 3955 } 3956 rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); 3957 
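/* standard rbtree insert: link the node, then rebalance and recolor */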
rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3958 spin_unlock(&root->inode_lock); 3959 } 3960 3961 static void inode_tree_del(struct inode *inode) 3962 { 3963 struct btrfs_root *root = BTRFS_I(inode)->root; 3964 int empty = 0; 3965 3966 spin_lock(&root->inode_lock); 3967 if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { 3968 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); 3969 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3970 empty = RB_EMPTY_ROOT(&root->inode_tree); 3971 } 3972 spin_unlock(&root->inode_lock); 3973 3974 /* 3975 * Free space cache has inodes in the tree root, but the tree root has a 3976 * root_refs of 0, so this could end up dropping the tree root as a 3977 * snapshot, so we need the extra !root->fs_info->tree_root check to 3978 * make sure we don't drop it. 3979 */ 3980 if (empty && btrfs_root_refs(&root->root_item) == 0 && 3981 root != root->fs_info->tree_root) { 3982 synchronize_srcu(&root->fs_info->subvol_srcu); 3983 spin_lock(&root->inode_lock); 3984 empty = RB_EMPTY_ROOT(&root->inode_tree); 3985 spin_unlock(&root->inode_lock); 3986 if (empty) 3987 btrfs_add_dead_root(root); 3988 } 3989 } 3990 3991 int btrfs_invalidate_inodes(struct btrfs_root *root) 3992 { 3993 struct rb_node *node; 3994 struct rb_node *prev; 3995 struct btrfs_inode *entry; 3996 struct inode *inode; 3997 u64 objectid = 0; 3998 3999 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4000 4001 spin_lock(&root->inode_lock); 4002 again: 4003 node = root->inode_tree.rb_node; 4004 prev = NULL; 4005 while (node) { 4006 prev = node; 4007 entry = rb_entry(node, struct btrfs_inode, rb_node); 4008 4009 if (objectid < entry->vfs_inode.i_ino) 4010 node = node->rb_left; 4011 else if (objectid > entry->vfs_inode.i_ino) 4012 node = node->rb_right; 4013 else 4014 break; 4015 } 4016 if (!node) { 4017 while (prev) { 4018 entry = rb_entry(prev, struct btrfs_inode, rb_node); 4019 if (objectid <= entry->vfs_inode.i_ino) { 4020 node = prev; 4021 break; 4022 } 4023 prev = rb_next(prev); 4024 } 4025 } 4026 while (node) { 4027 entry = rb_entry(node, struct btrfs_inode, rb_node); 4028 objectid = entry->vfs_inode.i_ino + 1; 4029 inode = igrab(&entry->vfs_inode); 4030 if (inode) { 4031 spin_unlock(&root->inode_lock); 4032 if (atomic_read(&inode->i_count) > 1) 4033 d_prune_aliases(inode); 4034 /* 4035 * btrfs_drop_inode will have it removed from 4036 * the inode cache when its usage count 4037 * hits zero. 
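			 *
			 * The reference taken with igrab() above is what kept
			 * the inode alive while root->inode_lock was dropped;
			 * the iput() below releases it, and for the last
			 * holder that kicks off eviction.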
4038 */ 4039 iput(inode); 4040 cond_resched(); 4041 spin_lock(&root->inode_lock); 4042 goto again; 4043 } 4044 4045 if (cond_resched_lock(&root->inode_lock)) 4046 goto again; 4047 4048 node = rb_next(node); 4049 } 4050 spin_unlock(&root->inode_lock); 4051 return 0; 4052 } 4053 4054 static int btrfs_init_locked_inode(struct inode *inode, void *p) 4055 { 4056 struct btrfs_iget_args *args = p; 4057 inode->i_ino = args->ino; 4058 BTRFS_I(inode)->root = args->root; 4059 btrfs_set_inode_space_info(args->root, inode); 4060 return 0; 4061 } 4062 4063 static int btrfs_find_actor(struct inode *inode, void *opaque) 4064 { 4065 struct btrfs_iget_args *args = opaque; 4066 return args->ino == inode->i_ino && 4067 args->root == BTRFS_I(inode)->root; 4068 } 4069 4070 static struct inode *btrfs_iget_locked(struct super_block *s, 4071 u64 objectid, 4072 struct btrfs_root *root) 4073 { 4074 struct inode *inode; 4075 struct btrfs_iget_args args; 4076 args.ino = objectid; 4077 args.root = root; 4078 4079 inode = iget5_locked(s, objectid, btrfs_find_actor, 4080 btrfs_init_locked_inode, 4081 (void *)&args); 4082 return inode; 4083 } 4084 4085 /* Get an inode object given its location and corresponding root. 4086 * Returns in *is_new if the inode was read from disk 4087 */ 4088 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4089 struct btrfs_root *root, int *new) 4090 { 4091 struct inode *inode; 4092 4093 inode = btrfs_iget_locked(s, location->objectid, root); 4094 if (!inode) 4095 return ERR_PTR(-ENOMEM); 4096 4097 if (inode->i_state & I_NEW) { 4098 BTRFS_I(inode)->root = root; 4099 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4100 btrfs_read_locked_inode(inode); 4101 inode_tree_add(inode); 4102 unlock_new_inode(inode); 4103 if (new) 4104 *new = 1; 4105 } 4106 4107 return inode; 4108 } 4109 4110 static struct inode *new_simple_dir(struct super_block *s, 4111 struct btrfs_key *key, 4112 struct btrfs_root *root) 4113 { 4114 struct inode *inode = new_inode(s); 4115 4116 if (!inode) 4117 return ERR_PTR(-ENOMEM); 4118 4119 BTRFS_I(inode)->root = root; 4120 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4121 BTRFS_I(inode)->dummy_inode = 1; 4122 4123 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4124 inode->i_op = &simple_dir_inode_operations; 4125 inode->i_fop = &simple_dir_operations; 4126 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 4127 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4128 4129 return inode; 4130 } 4131 4132 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) 4133 { 4134 struct inode *inode; 4135 struct btrfs_root *root = BTRFS_I(dir)->root; 4136 struct btrfs_root *sub_root = root; 4137 struct btrfs_key location; 4138 int index; 4139 int ret; 4140 4141 if (dentry->d_name.len > BTRFS_NAME_LEN) 4142 return ERR_PTR(-ENAMETOOLONG); 4143 4144 ret = btrfs_inode_by_name(dir, dentry, &location); 4145 4146 if (ret < 0) 4147 return ERR_PTR(ret); 4148 4149 if (location.objectid == 0) 4150 return NULL; 4151 4152 if (location.type == BTRFS_INODE_ITEM_KEY) { 4153 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 4154 return inode; 4155 } 4156 4157 BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); 4158 4159 index = srcu_read_lock(&root->fs_info->subvol_srcu); 4160 ret = fixup_tree_root_location(root, dir, dentry, 4161 &location, &sub_root); 4162 if (ret < 0) { 4163 if (ret != -ENOENT) 4164 inode = ERR_PTR(ret); 4165 else 4166 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4167 } else { 4168 inode = 
btrfs_iget(dir->i_sb, &location, sub_root, NULL); 4169 } 4170 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4171 4172 if (!IS_ERR(inode) && root != sub_root) { 4173 down_read(&root->fs_info->cleanup_work_sem); 4174 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4175 ret = btrfs_orphan_cleanup(sub_root); 4176 up_read(&root->fs_info->cleanup_work_sem); 4177 if (ret) 4178 inode = ERR_PTR(ret); 4179 } 4180 4181 return inode; 4182 } 4183 4184 static int btrfs_dentry_delete(const struct dentry *dentry) 4185 { 4186 struct btrfs_root *root; 4187 4188 if (!dentry->d_inode && !IS_ROOT(dentry)) 4189 dentry = dentry->d_parent; 4190 4191 if (dentry->d_inode) { 4192 root = BTRFS_I(dentry->d_inode)->root; 4193 if (btrfs_root_refs(&root->root_item) == 0) 4194 return 1; 4195 } 4196 return 0; 4197 } 4198 4199 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4200 struct nameidata *nd) 4201 { 4202 struct inode *inode; 4203 4204 inode = btrfs_lookup_dentry(dir, dentry); 4205 if (IS_ERR(inode)) 4206 return ERR_CAST(inode); 4207 4208 return d_splice_alias(inode, dentry); 4209 } 4210 4211 static unsigned char btrfs_filetype_table[] = { 4212 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4213 }; 4214 4215 static int btrfs_real_readdir(struct file *filp, void *dirent, 4216 filldir_t filldir) 4217 { 4218 struct inode *inode = filp->f_dentry->d_inode; 4219 struct btrfs_root *root = BTRFS_I(inode)->root; 4220 struct btrfs_item *item; 4221 struct btrfs_dir_item *di; 4222 struct btrfs_key key; 4223 struct btrfs_key found_key; 4224 struct btrfs_path *path; 4225 int ret; 4226 struct extent_buffer *leaf; 4227 int slot; 4228 unsigned char d_type; 4229 int over = 0; 4230 u32 di_cur; 4231 u32 di_total; 4232 u32 di_len; 4233 int key_type = BTRFS_DIR_INDEX_KEY; 4234 char tmp_name[32]; 4235 char *name_ptr; 4236 int name_len; 4237 4238 /* FIXME, use a real flag for deciding about the key type */ 4239 if (root->fs_info->tree_root == root) 4240 key_type = BTRFS_DIR_ITEM_KEY; 4241 4242 /* special case for "." 
 */
	if (filp->f_pos == 0) {
		over = filldir(dirent, ".", 1,
			       1, inode->i_ino,
			       DT_DIR);
		if (over)
			return 0;
		filp->f_pos = 1;
	}
	/* special case for .., just use the back ref */
	if (filp->f_pos == 1) {
		u64 pino = parent_ino(filp->f_path.dentry);
		over = filldir(dirent, "..", 2,
			       2, pino, DT_DIR);
		if (over)
			return 0;
		filp->f_pos = 2;
	}
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	btrfs_set_key_type(&key, key_type);
	key.offset = filp->f_pos;
	key.objectid = inode->i_ino;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			else if (ret > 0)
				break;
			continue;
		}

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid)
			break;
		if (btrfs_key_type(&found_key) != key_type)
			break;
		if (found_key.offset < filp->f_pos)
			goto next;

		filp->f_pos = found_key.offset;

		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
		di_cur = 0;
		di_total = btrfs_item_size(leaf, item);

		while (di_cur < di_total) {
			struct btrfs_key location;

			if (verify_dir_item(root, leaf, di))
				break;

			name_len = btrfs_dir_name_len(leaf, di);
			if (name_len <= sizeof(tmp_name)) {
				name_ptr = tmp_name;
			} else {
				name_ptr = kmalloc(name_len, GFP_NOFS);
				if (!name_ptr) {
					ret = -ENOMEM;
					goto err;
				}
			}
			read_extent_buffer(leaf, name_ptr,
					   (unsigned long)(di + 1), name_len);

			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
			btrfs_dir_item_key_to_cpu(leaf, di, &location);

			/* is this a reference to our own snapshot? If so
			 * skip it
			 */
			if (location.type == BTRFS_ROOT_ITEM_KEY &&
			    location.objectid == root->root_key.objectid) {
				over = 0;
				goto skip;
			}
			over = filldir(dirent, name_ptr, name_len,
				       found_key.offset, location.objectid,
				       d_type);

skip:
			if (name_ptr != tmp_name)
				kfree(name_ptr);

			if (over)
				goto nopos;
			di_len = btrfs_dir_name_len(leaf, di) +
				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
			di_cur += di_len;
			di = (struct btrfs_dir_item *)((char *)di + di_len);
		}
next:
		path->slots[0]++;
	}

	/* Reached end of directory/root. Bump pos past the last item. */
	if (key_type == BTRFS_DIR_INDEX_KEY)
		/*
		 * 32-bit glibc will use getdents64, but then strtol -
		 * so the last number we can serve is this.
4353 */ 4354 filp->f_pos = 0x7fffffff; 4355 else 4356 filp->f_pos++; 4357 nopos: 4358 ret = 0; 4359 err: 4360 btrfs_free_path(path); 4361 return ret; 4362 } 4363 4364 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) 4365 { 4366 struct btrfs_root *root = BTRFS_I(inode)->root; 4367 struct btrfs_trans_handle *trans; 4368 int ret = 0; 4369 bool nolock = false; 4370 4371 if (BTRFS_I(inode)->dummy_inode) 4372 return 0; 4373 4374 smp_mb(); 4375 nolock = (root->fs_info->closing && root == root->fs_info->tree_root); 4376 4377 if (wbc->sync_mode == WB_SYNC_ALL) { 4378 if (nolock) 4379 trans = btrfs_join_transaction_nolock(root, 1); 4380 else 4381 trans = btrfs_join_transaction(root, 1); 4382 if (IS_ERR(trans)) 4383 return PTR_ERR(trans); 4384 btrfs_set_trans_block_group(trans, inode); 4385 if (nolock) 4386 ret = btrfs_end_transaction_nolock(trans, root); 4387 else 4388 ret = btrfs_commit_transaction(trans, root); 4389 } 4390 return ret; 4391 } 4392 4393 /* 4394 * This is somewhat expensive, updating the tree every time the 4395 * inode changes. But, it is most likely to find the inode in cache. 4396 * FIXME, needs more benchmarking...there are no reasons other than performance 4397 * to keep or drop this code. 4398 */ 4399 void btrfs_dirty_inode(struct inode *inode, int flags) 4400 { 4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4402 struct btrfs_trans_handle *trans; 4403 int ret; 4404 4405 if (BTRFS_I(inode)->dummy_inode) 4406 return; 4407 4408 trans = btrfs_join_transaction(root, 1); 4409 BUG_ON(IS_ERR(trans)); 4410 btrfs_set_trans_block_group(trans, inode); 4411 4412 ret = btrfs_update_inode(trans, root, inode); 4413 if (ret && ret == -ENOSPC) { 4414 /* whoops, lets try again with the full transaction */ 4415 btrfs_end_transaction(trans, root); 4416 trans = btrfs_start_transaction(root, 1); 4417 if (IS_ERR(trans)) { 4418 if (printk_ratelimit()) { 4419 printk(KERN_ERR "btrfs: fail to " 4420 "dirty inode %lu error %ld\n", 4421 inode->i_ino, PTR_ERR(trans)); 4422 } 4423 return; 4424 } 4425 btrfs_set_trans_block_group(trans, inode); 4426 4427 ret = btrfs_update_inode(trans, root, inode); 4428 if (ret) { 4429 if (printk_ratelimit()) { 4430 printk(KERN_ERR "btrfs: fail to " 4431 "dirty inode %lu error %d\n", 4432 inode->i_ino, ret); 4433 } 4434 } 4435 } 4436 btrfs_end_transaction(trans, root); 4437 } 4438 4439 /* 4440 * find the highest existing sequence number in a directory 4441 * and then set the in-memory index_cnt variable to reflect 4442 * free sequence numbers 4443 */ 4444 static int btrfs_set_inode_index_count(struct inode *inode) 4445 { 4446 struct btrfs_root *root = BTRFS_I(inode)->root; 4447 struct btrfs_key key, found_key; 4448 struct btrfs_path *path; 4449 struct extent_buffer *leaf; 4450 int ret; 4451 4452 key.objectid = inode->i_ino; 4453 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4454 key.offset = (u64)-1; 4455 4456 path = btrfs_alloc_path(); 4457 if (!path) 4458 return -ENOMEM; 4459 4460 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4461 if (ret < 0) 4462 goto out; 4463 /* FIXME: we should be able to handle this */ 4464 if (ret == 0) 4465 goto out; 4466 ret = 0; 4467 4468 /* 4469 * MAGIC NUMBER EXPLANATION: 4470 * since we search a directory based on f_pos we have to start at 2 4471 * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody 4472 * else has to start at 2 4473 */ 4474 if (path->slots[0] == 0) { 4475 BTRFS_I(inode)->index_cnt = 2; 4476 goto out; 4477 } 4478 4479 path->slots[0]--; 4480 4481 leaf = path->nodes[0]; 4482 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4483 4484 if (found_key.objectid != inode->i_ino || 4485 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4486 BTRFS_I(inode)->index_cnt = 2; 4487 goto out; 4488 } 4489 4490 BTRFS_I(inode)->index_cnt = found_key.offset + 1; 4491 out: 4492 btrfs_free_path(path); 4493 return ret; 4494 } 4495 4496 /* 4497 * helper to find a free sequence number in a given directory. This current 4498 * code is very simple, later versions will do smarter things in the btree 4499 */ 4500 int btrfs_set_inode_index(struct inode *dir, u64 *index) 4501 { 4502 int ret = 0; 4503 4504 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4505 ret = btrfs_set_inode_index_count(dir); 4506 if (ret) 4507 return ret; 4508 } 4509 4510 *index = BTRFS_I(dir)->index_cnt; 4511 BTRFS_I(dir)->index_cnt++; 4512 4513 return ret; 4514 } 4515 4516 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 4517 struct btrfs_root *root, 4518 struct inode *dir, 4519 const char *name, int name_len, 4520 u64 ref_objectid, u64 objectid, 4521 u64 alloc_hint, int mode, u64 *index) 4522 { 4523 struct inode *inode; 4524 struct btrfs_inode_item *inode_item; 4525 struct btrfs_key *location; 4526 struct btrfs_path *path; 4527 struct btrfs_inode_ref *ref; 4528 struct btrfs_key key[2]; 4529 u32 sizes[2]; 4530 unsigned long ptr; 4531 int ret; 4532 int owner; 4533 4534 path = btrfs_alloc_path(); 4535 BUG_ON(!path); 4536 4537 inode = new_inode(root->fs_info->sb); 4538 if (!inode) { 4539 btrfs_free_path(path); 4540 return ERR_PTR(-ENOMEM); 4541 } 4542 4543 if (dir) { 4544 trace_btrfs_inode_request(dir); 4545 4546 ret = btrfs_set_inode_index(dir, index); 4547 if (ret) { 4548 btrfs_free_path(path); 4549 iput(inode); 4550 return ERR_PTR(ret); 4551 } 4552 } 4553 /* 4554 * index_cnt is ignored for everything but a dir, 4555 * btrfs_get_inode_index_count has an explanation for the magic 4556 * number 4557 */ 4558 BTRFS_I(inode)->index_cnt = 2; 4559 BTRFS_I(inode)->root = root; 4560 BTRFS_I(inode)->generation = trans->transid; 4561 inode->i_generation = BTRFS_I(inode)->generation; 4562 btrfs_set_inode_space_info(root, inode); 4563 4564 if (mode & S_IFDIR) 4565 owner = 0; 4566 else 4567 owner = 1; 4568 BTRFS_I(inode)->block_group = 4569 btrfs_find_block_group(root, 0, alloc_hint, owner); 4570 4571 key[0].objectid = objectid; 4572 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4573 key[0].offset = 0; 4574 4575 key[1].objectid = objectid; 4576 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 4577 key[1].offset = ref_objectid; 4578 4579 sizes[0] = sizeof(struct btrfs_inode_item); 4580 sizes[1] = name_len + sizeof(*ref); 4581 4582 path->leave_spinning = 1; 4583 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); 4584 if (ret != 0) 4585 goto fail; 4586 4587 inode_init_owner(inode, dir, mode); 4588 inode->i_ino = objectid; 4589 inode_set_bytes(inode, 0); 4590 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4591 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4592 struct btrfs_inode_item); 4593 fill_inode_item(trans, path->nodes[0], inode_item, inode); 4594 4595 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 4596 struct btrfs_inode_ref); 4597 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 4598 
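	/*
	 * The INODE_REF item lives in the slot right after the inode item;
	 * both were reserved by the single btrfs_insert_empty_items() call
	 * above, and the backref name bytes are written just past the
	 * fixed-size ref header below.
	 */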
btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 4599 ptr = (unsigned long)(ref + 1); 4600 write_extent_buffer(path->nodes[0], name, ptr, name_len); 4601 4602 btrfs_mark_buffer_dirty(path->nodes[0]); 4603 btrfs_free_path(path); 4604 4605 location = &BTRFS_I(inode)->location; 4606 location->objectid = objectid; 4607 location->offset = 0; 4608 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 4609 4610 btrfs_inherit_iflags(inode, dir); 4611 4612 if ((mode & S_IFREG)) { 4613 if (btrfs_test_opt(root, NODATASUM)) 4614 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4615 if (btrfs_test_opt(root, NODATACOW) || 4616 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) 4617 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4618 } 4619 4620 insert_inode_hash(inode); 4621 inode_tree_add(inode); 4622 4623 trace_btrfs_inode_new(inode); 4624 4625 return inode; 4626 fail: 4627 if (dir) 4628 BTRFS_I(dir)->index_cnt--; 4629 btrfs_free_path(path); 4630 iput(inode); 4631 return ERR_PTR(ret); 4632 } 4633 4634 static inline u8 btrfs_inode_type(struct inode *inode) 4635 { 4636 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 4637 } 4638 4639 /* 4640 * utility function to add 'inode' into 'parent_inode' with 4641 * a give name and a given sequence number. 4642 * if 'add_backref' is true, also insert a backref from the 4643 * inode to the parent directory. 4644 */ 4645 int btrfs_add_link(struct btrfs_trans_handle *trans, 4646 struct inode *parent_inode, struct inode *inode, 4647 const char *name, int name_len, int add_backref, u64 index) 4648 { 4649 int ret = 0; 4650 struct btrfs_key key; 4651 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4652 4653 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4654 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4655 } else { 4656 key.objectid = inode->i_ino; 4657 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4658 key.offset = 0; 4659 } 4660 4661 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4662 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4663 key.objectid, root->root_key.objectid, 4664 parent_inode->i_ino, 4665 index, name, name_len); 4666 } else if (add_backref) { 4667 ret = btrfs_insert_inode_ref(trans, root, 4668 name, name_len, inode->i_ino, 4669 parent_inode->i_ino, index); 4670 } 4671 4672 if (ret == 0) { 4673 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4674 parent_inode->i_ino, &key, 4675 btrfs_inode_type(inode), index); 4676 BUG_ON(ret); 4677 4678 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4679 name_len * 2); 4680 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4681 ret = btrfs_update_inode(trans, root, parent_inode); 4682 } 4683 return ret; 4684 } 4685 4686 static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4687 struct inode *dir, struct dentry *dentry, 4688 struct inode *inode, int backref, u64 index) 4689 { 4690 int err = btrfs_add_link(trans, dir, inode, 4691 dentry->d_name.name, dentry->d_name.len, 4692 backref, index); 4693 if (!err) { 4694 d_instantiate(dentry, inode); 4695 return 0; 4696 } 4697 if (err > 0) 4698 err = -EEXIST; 4699 return err; 4700 } 4701 4702 static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4703 int mode, dev_t rdev) 4704 { 4705 struct btrfs_trans_handle *trans; 4706 struct btrfs_root *root = BTRFS_I(dir)->root; 4707 struct inode *inode = NULL; 4708 int err; 4709 int drop_inode = 0; 4710 u64 objectid; 4711 unsigned long nr = 0; 4712 u64 index = 0; 4713 4714 if (!new_valid_dev(rdev)) 4715 return 
-EINVAL; 4716 4717 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4718 if (err) 4719 return err; 4720 4721 /* 4722 * 2 for inode item and ref 4723 * 2 for dir items 4724 * 1 for xattr if selinux is on 4725 */ 4726 trans = btrfs_start_transaction(root, 5); 4727 if (IS_ERR(trans)) 4728 return PTR_ERR(trans); 4729 4730 btrfs_set_trans_block_group(trans, dir); 4731 4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4733 dentry->d_name.len, dir->i_ino, objectid, 4734 BTRFS_I(dir)->block_group, mode, &index); 4735 if (IS_ERR(inode)) { 4736 err = PTR_ERR(inode); 4737 goto out_unlock; 4738 } 4739 4740 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4741 if (err) { 4742 drop_inode = 1; 4743 goto out_unlock; 4744 } 4745 4746 btrfs_set_trans_block_group(trans, inode); 4747 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4748 if (err) 4749 drop_inode = 1; 4750 else { 4751 inode->i_op = &btrfs_special_inode_operations; 4752 init_special_inode(inode, inode->i_mode, rdev); 4753 btrfs_update_inode(trans, root, inode); 4754 } 4755 btrfs_update_inode_block_group(trans, inode); 4756 btrfs_update_inode_block_group(trans, dir); 4757 out_unlock: 4758 nr = trans->blocks_used; 4759 btrfs_end_transaction_throttle(trans, root); 4760 btrfs_btree_balance_dirty(root, nr); 4761 if (drop_inode) { 4762 inode_dec_link_count(inode); 4763 iput(inode); 4764 } 4765 return err; 4766 } 4767 4768 static int btrfs_create(struct inode *dir, struct dentry *dentry, 4769 int mode, struct nameidata *nd) 4770 { 4771 struct btrfs_trans_handle *trans; 4772 struct btrfs_root *root = BTRFS_I(dir)->root; 4773 struct inode *inode = NULL; 4774 int drop_inode = 0; 4775 int err; 4776 unsigned long nr = 0; 4777 u64 objectid; 4778 u64 index = 0; 4779 4780 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4781 if (err) 4782 return err; 4783 /* 4784 * 2 for inode item and ref 4785 * 2 for dir items 4786 * 1 for xattr if selinux is on 4787 */ 4788 trans = btrfs_start_transaction(root, 5); 4789 if (IS_ERR(trans)) 4790 return PTR_ERR(trans); 4791 4792 btrfs_set_trans_block_group(trans, dir); 4793 4794 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4795 dentry->d_name.len, dir->i_ino, objectid, 4796 BTRFS_I(dir)->block_group, mode, &index); 4797 if (IS_ERR(inode)) { 4798 err = PTR_ERR(inode); 4799 goto out_unlock; 4800 } 4801 4802 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4803 if (err) { 4804 drop_inode = 1; 4805 goto out_unlock; 4806 } 4807 4808 btrfs_set_trans_block_group(trans, inode); 4809 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4810 if (err) 4811 drop_inode = 1; 4812 else { 4813 inode->i_mapping->a_ops = &btrfs_aops; 4814 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 4815 inode->i_fop = &btrfs_file_operations; 4816 inode->i_op = &btrfs_file_inode_operations; 4817 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4818 } 4819 btrfs_update_inode_block_group(trans, inode); 4820 btrfs_update_inode_block_group(trans, dir); 4821 out_unlock: 4822 nr = trans->blocks_used; 4823 btrfs_end_transaction_throttle(trans, root); 4824 if (drop_inode) { 4825 inode_dec_link_count(inode); 4826 iput(inode); 4827 } 4828 btrfs_btree_balance_dirty(root, nr); 4829 return err; 4830 } 4831 4832 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 4833 struct dentry *dentry) 4834 { 4835 struct btrfs_trans_handle *trans; 4836 struct btrfs_root *root = BTRFS_I(dir)->root; 4837 struct inode *inode = 
old_dentry->d_inode; 4838 u64 index; 4839 unsigned long nr = 0; 4840 int err; 4841 int drop_inode = 0; 4842 4843 /* do not allow sys_link's with other subvols of the same device */ 4844 if (root->objectid != BTRFS_I(inode)->root->objectid) 4845 return -EXDEV; 4846 4847 if (inode->i_nlink == ~0U) 4848 return -EMLINK; 4849 4850 err = btrfs_set_inode_index(dir, &index); 4851 if (err) 4852 goto fail; 4853 4854 /* 4855 * 2 items for inode and inode ref 4856 * 2 items for dir items 4857 * 1 item for parent inode 4858 */ 4859 trans = btrfs_start_transaction(root, 5); 4860 if (IS_ERR(trans)) { 4861 err = PTR_ERR(trans); 4862 goto fail; 4863 } 4864 4865 btrfs_inc_nlink(inode); 4866 inode->i_ctime = CURRENT_TIME; 4867 4868 btrfs_set_trans_block_group(trans, dir); 4869 ihold(inode); 4870 4871 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4872 4873 if (err) { 4874 drop_inode = 1; 4875 } else { 4876 struct dentry *parent = dget_parent(dentry); 4877 btrfs_update_inode_block_group(trans, dir); 4878 err = btrfs_update_inode(trans, root, inode); 4879 BUG_ON(err); 4880 btrfs_log_new_name(trans, inode, NULL, parent); 4881 dput(parent); 4882 } 4883 4884 nr = trans->blocks_used; 4885 btrfs_end_transaction_throttle(trans, root); 4886 fail: 4887 if (drop_inode) { 4888 inode_dec_link_count(inode); 4889 iput(inode); 4890 } 4891 btrfs_btree_balance_dirty(root, nr); 4892 return err; 4893 } 4894 4895 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 4896 { 4897 struct inode *inode = NULL; 4898 struct btrfs_trans_handle *trans; 4899 struct btrfs_root *root = BTRFS_I(dir)->root; 4900 int err = 0; 4901 int drop_on_err = 0; 4902 u64 objectid = 0; 4903 u64 index = 0; 4904 unsigned long nr = 1; 4905 4906 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); 4907 if (err) 4908 return err; 4909 4910 /* 4911 * 2 items for inode and ref 4912 * 2 items for dir items 4913 * 1 for xattr if selinux is on 4914 */ 4915 trans = btrfs_start_transaction(root, 5); 4916 if (IS_ERR(trans)) 4917 return PTR_ERR(trans); 4918 btrfs_set_trans_block_group(trans, dir); 4919 4920 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4921 dentry->d_name.len, dir->i_ino, objectid, 4922 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4923 &index); 4924 if (IS_ERR(inode)) { 4925 err = PTR_ERR(inode); 4926 goto out_fail; 4927 } 4928 4929 drop_on_err = 1; 4930 4931 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 4932 if (err) 4933 goto out_fail; 4934 4935 inode->i_op = &btrfs_dir_inode_operations; 4936 inode->i_fop = &btrfs_dir_file_operations; 4937 btrfs_set_trans_block_group(trans, inode); 4938 4939 btrfs_i_size_write(inode, 0); 4940 err = btrfs_update_inode(trans, root, inode); 4941 if (err) 4942 goto out_fail; 4943 4944 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 4945 dentry->d_name.len, 0, index); 4946 if (err) 4947 goto out_fail; 4948 4949 d_instantiate(dentry, inode); 4950 drop_on_err = 0; 4951 btrfs_update_inode_block_group(trans, inode); 4952 btrfs_update_inode_block_group(trans, dir); 4953 4954 out_fail: 4955 nr = trans->blocks_used; 4956 btrfs_end_transaction_throttle(trans, root); 4957 if (drop_on_err) 4958 iput(inode); 4959 btrfs_btree_balance_dirty(root, nr); 4960 return err; 4961 } 4962 4963 /* helper for btfs_get_extent. Given an existing extent in the tree, 4964 * and an extent that you want to insert, deal with overlap and insert 4965 * the new extent into the tree. 
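 *
 * Only the piece inside [map_start, map_start + map_len) is kept: em's
 * start and len are clipped to that window, and for uncompressed
 * extents the on-disk block offset is shifted by the same amount.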
4966 */ 4967 static int merge_extent_mapping(struct extent_map_tree *em_tree, 4968 struct extent_map *existing, 4969 struct extent_map *em, 4970 u64 map_start, u64 map_len) 4971 { 4972 u64 start_diff; 4973 4974 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 4975 start_diff = map_start - em->start; 4976 em->start = map_start; 4977 em->len = map_len; 4978 if (em->block_start < EXTENT_MAP_LAST_BYTE && 4979 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 4980 em->block_start += start_diff; 4981 em->block_len -= start_diff; 4982 } 4983 return add_extent_mapping(em_tree, em); 4984 } 4985 4986 static noinline int uncompress_inline(struct btrfs_path *path, 4987 struct inode *inode, struct page *page, 4988 size_t pg_offset, u64 extent_offset, 4989 struct btrfs_file_extent_item *item) 4990 { 4991 int ret; 4992 struct extent_buffer *leaf = path->nodes[0]; 4993 char *tmp; 4994 size_t max_size; 4995 unsigned long inline_size; 4996 unsigned long ptr; 4997 int compress_type; 4998 4999 WARN_ON(pg_offset != 0); 5000 compress_type = btrfs_file_extent_compression(leaf, item); 5001 max_size = btrfs_file_extent_ram_bytes(leaf, item); 5002 inline_size = btrfs_file_extent_inline_item_len(leaf, 5003 btrfs_item_nr(leaf, path->slots[0])); 5004 tmp = kmalloc(inline_size, GFP_NOFS); 5005 if (!tmp) 5006 return -ENOMEM; 5007 ptr = btrfs_file_extent_inline_start(item); 5008 5009 read_extent_buffer(leaf, tmp, ptr, inline_size); 5010 5011 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 5012 ret = btrfs_decompress(compress_type, tmp, page, 5013 extent_offset, inline_size, max_size); 5014 if (ret) { 5015 char *kaddr = kmap_atomic(page, KM_USER0); 5016 unsigned long copy_size = min_t(u64, 5017 PAGE_CACHE_SIZE - pg_offset, 5018 max_size - extent_offset); 5019 memset(kaddr + pg_offset, 0, copy_size); 5020 kunmap_atomic(kaddr, KM_USER0); 5021 } 5022 kfree(tmp); 5023 return 0; 5024 } 5025 5026 /* 5027 * a bit scary, this does extent mapping from logical file offset to the disk. 5028 * the ugly parts come from merging extents from the disk with the in-ram 5029 * representation. This gets more complex because of the data=ordered code, 5030 * where the in-ram extents might be locked pending data=ordered completion. 5031 * 5032 * This also copies inline extents directly into the page. 
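 *
 * Read-side callers pass create == 0; with create != 0 the caller
 * intends to write, which changes how cached inline extents and holes
 * are treated below. A minimal read-side use, mirroring
 * btrfs_get_blocks_direct() later in this file:
 *
 *	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);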
5033 */ 5034 5035 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 5036 size_t pg_offset, u64 start, u64 len, 5037 int create) 5038 { 5039 int ret; 5040 int err = 0; 5041 u64 bytenr; 5042 u64 extent_start = 0; 5043 u64 extent_end = 0; 5044 u64 objectid = inode->i_ino; 5045 u32 found_type; 5046 struct btrfs_path *path = NULL; 5047 struct btrfs_root *root = BTRFS_I(inode)->root; 5048 struct btrfs_file_extent_item *item; 5049 struct extent_buffer *leaf; 5050 struct btrfs_key found_key; 5051 struct extent_map *em = NULL; 5052 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5053 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5054 struct btrfs_trans_handle *trans = NULL; 5055 int compress_type; 5056 5057 again: 5058 read_lock(&em_tree->lock); 5059 em = lookup_extent_mapping(em_tree, start, len); 5060 if (em) 5061 em->bdev = root->fs_info->fs_devices->latest_bdev; 5062 read_unlock(&em_tree->lock); 5063 5064 if (em) { 5065 if (em->start > start || em->start + em->len <= start) 5066 free_extent_map(em); 5067 else if (em->block_start == EXTENT_MAP_INLINE && page) 5068 free_extent_map(em); 5069 else 5070 goto out; 5071 } 5072 em = alloc_extent_map(GFP_NOFS); 5073 if (!em) { 5074 err = -ENOMEM; 5075 goto out; 5076 } 5077 em->bdev = root->fs_info->fs_devices->latest_bdev; 5078 em->start = EXTENT_MAP_HOLE; 5079 em->orig_start = EXTENT_MAP_HOLE; 5080 em->len = (u64)-1; 5081 em->block_len = (u64)-1; 5082 5083 if (!path) { 5084 path = btrfs_alloc_path(); 5085 BUG_ON(!path); 5086 } 5087 5088 ret = btrfs_lookup_file_extent(trans, root, path, 5089 objectid, start, trans != NULL); 5090 if (ret < 0) { 5091 err = ret; 5092 goto out; 5093 } 5094 5095 if (ret != 0) { 5096 if (path->slots[0] == 0) 5097 goto not_found; 5098 path->slots[0]--; 5099 } 5100 5101 leaf = path->nodes[0]; 5102 item = btrfs_item_ptr(leaf, path->slots[0], 5103 struct btrfs_file_extent_item); 5104 /* are we inside the extent that was found? 
*/ 5105 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5106 found_type = btrfs_key_type(&found_key); 5107 if (found_key.objectid != objectid || 5108 found_type != BTRFS_EXTENT_DATA_KEY) { 5109 goto not_found; 5110 } 5111 5112 found_type = btrfs_file_extent_type(leaf, item); 5113 extent_start = found_key.offset; 5114 compress_type = btrfs_file_extent_compression(leaf, item); 5115 if (found_type == BTRFS_FILE_EXTENT_REG || 5116 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5117 extent_end = extent_start + 5118 btrfs_file_extent_num_bytes(leaf, item); 5119 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5120 size_t size; 5121 size = btrfs_file_extent_inline_len(leaf, item); 5122 extent_end = (extent_start + size + root->sectorsize - 1) & 5123 ~((u64)root->sectorsize - 1); 5124 } 5125 5126 if (start >= extent_end) { 5127 path->slots[0]++; 5128 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 5129 ret = btrfs_next_leaf(root, path); 5130 if (ret < 0) { 5131 err = ret; 5132 goto out; 5133 } 5134 if (ret > 0) 5135 goto not_found; 5136 leaf = path->nodes[0]; 5137 } 5138 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5139 if (found_key.objectid != objectid || 5140 found_key.type != BTRFS_EXTENT_DATA_KEY) 5141 goto not_found; 5142 if (start + len <= found_key.offset) 5143 goto not_found; 5144 em->start = start; 5145 em->len = found_key.offset - start; 5146 goto not_found_em; 5147 } 5148 5149 if (found_type == BTRFS_FILE_EXTENT_REG || 5150 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5151 em->start = extent_start; 5152 em->len = extent_end - extent_start; 5153 em->orig_start = extent_start - 5154 btrfs_file_extent_offset(leaf, item); 5155 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5156 if (bytenr == 0) { 5157 em->block_start = EXTENT_MAP_HOLE; 5158 goto insert; 5159 } 5160 if (compress_type != BTRFS_COMPRESS_NONE) { 5161 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5162 em->compress_type = compress_type; 5163 em->block_start = bytenr; 5164 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5165 item); 5166 } else { 5167 bytenr += btrfs_file_extent_offset(leaf, item); 5168 em->block_start = bytenr; 5169 em->block_len = em->len; 5170 if (found_type == BTRFS_FILE_EXTENT_PREALLOC) 5171 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5172 } 5173 goto insert; 5174 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 5175 unsigned long ptr; 5176 char *map; 5177 size_t size; 5178 size_t extent_offset; 5179 size_t copy_size; 5180 5181 em->block_start = EXTENT_MAP_INLINE; 5182 if (!page || create) { 5183 em->start = extent_start; 5184 em->len = extent_end - extent_start; 5185 goto out; 5186 } 5187 5188 size = btrfs_file_extent_inline_len(leaf, item); 5189 extent_offset = page_offset(page) + pg_offset - extent_start; 5190 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 5191 size - extent_offset); 5192 em->start = extent_start + extent_offset; 5193 em->len = (copy_size + root->sectorsize - 1) & 5194 ~((u64)root->sectorsize - 1); 5195 em->orig_start = EXTENT_MAP_INLINE; 5196 if (compress_type) { 5197 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5198 em->compress_type = compress_type; 5199 } 5200 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5201 if (create == 0 && !PageUptodate(page)) { 5202 if (btrfs_file_extent_compression(leaf, item) != 5203 BTRFS_COMPRESS_NONE) { 5204 ret = uncompress_inline(path, inode, page, 5205 pg_offset, 5206 extent_offset, item); 5207 BUG_ON(ret); 5208 } else { 5209 map = kmap(page); 5210 read_extent_buffer(leaf, map + pg_offset, ptr, 
5211 copy_size); 5212 if (pg_offset + copy_size < PAGE_CACHE_SIZE) { 5213 memset(map + pg_offset + copy_size, 0, 5214 PAGE_CACHE_SIZE - pg_offset - 5215 copy_size); 5216 } 5217 kunmap(page); 5218 } 5219 flush_dcache_page(page); 5220 } else if (create && PageUptodate(page)) { 5221 WARN_ON(1); 5222 if (!trans) { 5223 kunmap(page); 5224 free_extent_map(em); 5225 em = NULL; 5226 btrfs_release_path(root, path); 5227 trans = btrfs_join_transaction(root, 1); 5228 if (IS_ERR(trans)) 5229 return ERR_CAST(trans); 5230 goto again; 5231 } 5232 map = kmap(page); 5233 write_extent_buffer(leaf, map + pg_offset, ptr, 5234 copy_size); 5235 kunmap(page); 5236 btrfs_mark_buffer_dirty(leaf); 5237 } 5238 set_extent_uptodate(io_tree, em->start, 5239 extent_map_end(em) - 1, NULL, GFP_NOFS); 5240 goto insert; 5241 } else { 5242 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5243 WARN_ON(1); 5244 } 5245 not_found: 5246 em->start = start; 5247 em->len = len; 5248 not_found_em: 5249 em->block_start = EXTENT_MAP_HOLE; 5250 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5251 insert: 5252 btrfs_release_path(root, path); 5253 if (em->start > start || extent_map_end(em) <= start) { 5254 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5255 "[%llu %llu]\n", (unsigned long long)em->start, 5256 (unsigned long long)em->len, 5257 (unsigned long long)start, 5258 (unsigned long long)len); 5259 err = -EIO; 5260 goto out; 5261 } 5262 5263 err = 0; 5264 write_lock(&em_tree->lock); 5265 ret = add_extent_mapping(em_tree, em); 5266 /* it is possible that someone inserted the extent into the tree 5267 * while we had the lock dropped. It is also possible that 5268 * an overlapping map exists in the tree 5269 */ 5270 if (ret == -EEXIST) { 5271 struct extent_map *existing; 5272 5273 ret = 0; 5274 5275 existing = lookup_extent_mapping(em_tree, start, len); 5276 if (existing && (existing->start > start || 5277 existing->start + existing->len <= start)) { 5278 free_extent_map(existing); 5279 existing = NULL; 5280 } 5281 if (!existing) { 5282 existing = lookup_extent_mapping(em_tree, em->start, 5283 em->len); 5284 if (existing) { 5285 err = merge_extent_mapping(em_tree, existing, 5286 em, start, 5287 root->sectorsize); 5288 free_extent_map(existing); 5289 if (err) { 5290 free_extent_map(em); 5291 em = NULL; 5292 } 5293 } else { 5294 err = -EIO; 5295 free_extent_map(em); 5296 em = NULL; 5297 } 5298 } else { 5299 free_extent_map(em); 5300 em = existing; 5301 err = 0; 5302 } 5303 } 5304 write_unlock(&em_tree->lock); 5305 out: 5306 5307 trace_btrfs_get_extent(root, em); 5308 5309 if (path) 5310 btrfs_free_path(path); 5311 if (trans) { 5312 ret = btrfs_end_transaction(trans, root); 5313 if (!err) 5314 err = ret; 5315 } 5316 if (err) { 5317 free_extent_map(em); 5318 return ERR_PTR(err); 5319 } 5320 return em; 5321 } 5322 5323 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 5324 size_t pg_offset, u64 start, u64 len, 5325 int create) 5326 { 5327 struct extent_map *em; 5328 struct extent_map *hole_em = NULL; 5329 u64 range_start = start; 5330 u64 end; 5331 u64 found; 5332 u64 found_end; 5333 int err = 0; 5334 5335 em = btrfs_get_extent(inode, page, pg_offset, start, len, create); 5336 if (IS_ERR(em)) 5337 return em; 5338 if (em) { 5339 /* 5340 * if our em maps to a hole, there might 5341 * actually be delalloc bytes behind it 5342 */ 5343 if (em->block_start != EXTENT_MAP_HOLE) 5344 return em; 5345 else 5346 hole_em = em; 5347 } 5348 5349 /* check to see if we've wrapped (len == -1 or similar) 
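	 * e.g. a fiemap over the whole file passes len == (u64)-1, making
	 * start + len wrap; end is clamped to (u64)-1 for that case below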
*/ 5350 end = start + len; 5351 if (end < start) 5352 end = (u64)-1; 5353 else 5354 end -= 1; 5355 5356 em = NULL; 5357 5358 /* ok, we didn't find anything, lets look for delalloc */ 5359 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, 5360 end, len, EXTENT_DELALLOC, 1); 5361 found_end = range_start + found; 5362 if (found_end < range_start) 5363 found_end = (u64)-1; 5364 5365 /* 5366 * we didn't find anything useful, return 5367 * the original results from get_extent() 5368 */ 5369 if (range_start > end || found_end <= start) { 5370 em = hole_em; 5371 hole_em = NULL; 5372 goto out; 5373 } 5374 5375 /* adjust the range_start to make sure it doesn't 5376 * go backwards from the start they passed in 5377 */ 5378 range_start = max(start,range_start); 5379 found = found_end - range_start; 5380 5381 if (found > 0) { 5382 u64 hole_start = start; 5383 u64 hole_len = len; 5384 5385 em = alloc_extent_map(GFP_NOFS); 5386 if (!em) { 5387 err = -ENOMEM; 5388 goto out; 5389 } 5390 /* 5391 * when btrfs_get_extent can't find anything it 5392 * returns one huge hole 5393 * 5394 * make sure what it found really fits our range, and 5395 * adjust to make sure it is based on the start from 5396 * the caller 5397 */ 5398 if (hole_em) { 5399 u64 calc_end = extent_map_end(hole_em); 5400 5401 if (calc_end <= start || (hole_em->start > end)) { 5402 free_extent_map(hole_em); 5403 hole_em = NULL; 5404 } else { 5405 hole_start = max(hole_em->start, start); 5406 hole_len = calc_end - hole_start; 5407 } 5408 } 5409 em->bdev = NULL; 5410 if (hole_em && range_start > hole_start) { 5411 /* our hole starts before our delalloc, so we 5412 * have to return just the parts of the hole 5413 * that go until the delalloc starts 5414 */ 5415 em->len = min(hole_len, 5416 range_start - hole_start); 5417 em->start = hole_start; 5418 em->orig_start = hole_start; 5419 /* 5420 * don't adjust block start at all, 5421 * it is fixed at EXTENT_MAP_HOLE 5422 */ 5423 em->block_start = hole_em->block_start; 5424 em->block_len = hole_len; 5425 } else { 5426 em->start = range_start; 5427 em->len = found; 5428 em->orig_start = range_start; 5429 em->block_start = EXTENT_MAP_DELALLOC; 5430 em->block_len = found; 5431 } 5432 } else if (hole_em) { 5433 return hole_em; 5434 } 5435 out: 5436 5437 free_extent_map(hole_em); 5438 if (err) { 5439 free_extent_map(em); 5440 return ERR_PTR(err); 5441 } 5442 return em; 5443 } 5444 5445 static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5446 struct extent_map *em, 5447 u64 start, u64 len) 5448 { 5449 struct btrfs_root *root = BTRFS_I(inode)->root; 5450 struct btrfs_trans_handle *trans; 5451 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5452 struct btrfs_key ins; 5453 u64 alloc_hint; 5454 int ret; 5455 bool insert = false; 5456 5457 /* 5458 * Ok if the extent map we looked up is a hole and is for the exact 5459 * range we want, there is no reason to allocate a new one, however if 5460 * it is not right then we need to free this one and drop the cache for 5461 * our range. 
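	 * Dropping the cache also keeps the add_extent_mapping() retry
	 * loop further down from spinning on stale overlapping entries
	 * when it sees -EEXIST.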
5462 */ 5463 if (em->block_start != EXTENT_MAP_HOLE || em->start != start || 5464 em->len != len) { 5465 free_extent_map(em); 5466 em = NULL; 5467 insert = true; 5468 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5469 } 5470 5471 trans = btrfs_join_transaction(root, 0); 5472 if (IS_ERR(trans)) 5473 return ERR_CAST(trans); 5474 5475 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5476 5477 alloc_hint = get_extent_allocation_hint(inode, start, len); 5478 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, 5479 alloc_hint, (u64)-1, &ins, 1); 5480 if (ret) { 5481 em = ERR_PTR(ret); 5482 goto out; 5483 } 5484 5485 if (!em) { 5486 em = alloc_extent_map(GFP_NOFS); 5487 if (!em) { 5488 em = ERR_PTR(-ENOMEM); 5489 goto out; 5490 } 5491 } 5492 5493 em->start = start; 5494 em->orig_start = em->start; 5495 em->len = ins.offset; 5496 5497 em->block_start = ins.objectid; 5498 em->block_len = ins.offset; 5499 em->bdev = root->fs_info->fs_devices->latest_bdev; 5500 5501 /* 5502 * We need to do this because if we're using the original em we searched 5503 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 5504 */ 5505 em->flags = 0; 5506 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5507 5508 while (insert) { 5509 write_lock(&em_tree->lock); 5510 ret = add_extent_mapping(em_tree, em); 5511 write_unlock(&em_tree->lock); 5512 if (ret != -EEXIST) 5513 break; 5514 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); 5515 } 5516 5517 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5518 ins.offset, ins.offset, 0); 5519 if (ret) { 5520 btrfs_free_reserved_extent(root, ins.objectid, ins.offset); 5521 em = ERR_PTR(ret); 5522 } 5523 out: 5524 btrfs_end_transaction(trans, root); 5525 return em; 5526 } 5527 5528 /* 5529 * returns 1 when the nocow is safe, < 1 on error, 0 if the 5530 * block must be cow'd 5531 */ 5532 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 5533 struct inode *inode, u64 offset, u64 len) 5534 { 5535 struct btrfs_path *path; 5536 int ret; 5537 struct extent_buffer *leaf; 5538 struct btrfs_root *root = BTRFS_I(inode)->root; 5539 struct btrfs_file_extent_item *fi; 5540 struct btrfs_key key; 5541 u64 disk_bytenr; 5542 u64 backref_offset; 5543 u64 extent_end; 5544 u64 num_bytes; 5545 int slot; 5546 int found_type; 5547 5548 path = btrfs_alloc_path(); 5549 if (!path) 5550 return -ENOMEM; 5551 5552 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 5553 offset, 0); 5554 if (ret < 0) 5555 goto out; 5556 5557 slot = path->slots[0]; 5558 if (ret == 1) { 5559 if (slot == 0) { 5560 /* can't find the item, must cow */ 5561 ret = 0; 5562 goto out; 5563 } 5564 slot--; 5565 } 5566 ret = 0; 5567 leaf = path->nodes[0]; 5568 btrfs_item_key_to_cpu(leaf, &key, slot); 5569 if (key.objectid != inode->i_ino || 5570 key.type != BTRFS_EXTENT_DATA_KEY) { 5571 /* not our file or wrong item type, must cow */ 5572 goto out; 5573 } 5574 5575 if (key.offset > offset) { 5576 /* Wrong offset, must cow */ 5577 goto out; 5578 } 5579 5580 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5581 found_type = btrfs_file_extent_type(leaf, fi); 5582 if (found_type != BTRFS_FILE_EXTENT_REG && 5583 found_type != BTRFS_FILE_EXTENT_PREALLOC) { 5584 /* not a regular extent, must cow */ 5585 goto out; 5586 } 5587 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 5588 backref_offset = btrfs_file_extent_offset(leaf, fi); 5589 5590 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 5591 if (extent_end < 
offset + len) { 5592 /* extent doesn't include our full range, must cow */ 5593 goto out; 5594 } 5595 5596 if (btrfs_extent_readonly(root, disk_bytenr)) 5597 goto out; 5598 5599 /* 5600 * look for other files referencing this extent, if we 5601 * find any we must cow 5602 */ 5603 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 5604 key.offset - backref_offset, disk_bytenr)) 5605 goto out; 5606 5607 /* 5608 * adjust disk_bytenr and num_bytes to cover just the bytes 5609 * in this extent we are about to write. If there 5610 * are any csums in that range we have to cow in order 5611 * to keep the csums correct 5612 */ 5613 disk_bytenr += backref_offset; 5614 disk_bytenr += offset - key.offset; 5615 num_bytes = min(offset + len, extent_end) - offset; 5616 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 5617 goto out; 5618 /* 5619 * all of the above have passed, it is safe to overwrite this extent 5620 * without cow 5621 */ 5622 ret = 1; 5623 out: 5624 btrfs_free_path(path); 5625 return ret; 5626 } 5627 5628 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 5629 struct buffer_head *bh_result, int create) 5630 { 5631 struct extent_map *em; 5632 struct btrfs_root *root = BTRFS_I(inode)->root; 5633 u64 start = iblock << inode->i_blkbits; 5634 u64 len = bh_result->b_size; 5635 struct btrfs_trans_handle *trans; 5636 5637 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 5638 if (IS_ERR(em)) 5639 return PTR_ERR(em); 5640 5641 /* 5642 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered 5643 * io. INLINE is special, and we could probably kludge it in here, but 5644 * it's still buffered so for safety lets just fall back to the generic 5645 * buffered path. 5646 * 5647 * For COMPRESSED we _have_ to read the entire extent in so we can 5648 * decompress it, so there will be buffering required no matter what we 5649 * do, so go ahead and fallback to buffered. 5650 * 5651 * We return -ENOTBLK because thats what makes DIO go ahead and go back 5652 * to buffered IO. Don't blame me, this is the price we pay for using 5653 * the generic code. 5654 */ 5655 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || 5656 em->block_start == EXTENT_MAP_INLINE) { 5657 free_extent_map(em); 5658 return -ENOTBLK; 5659 } 5660 5661 /* Just a good old fashioned hole, return */ 5662 if (!create && (em->block_start == EXTENT_MAP_HOLE || 5663 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5664 free_extent_map(em); 5665 /* DIO will do one hole at a time, so just unlock a sector */ 5666 unlock_extent(&BTRFS_I(inode)->io_tree, start, 5667 start + root->sectorsize - 1, GFP_NOFS); 5668 return 0; 5669 } 5670 5671 /* 5672 * We don't allocate a new extent in the following cases 5673 * 5674 * 1) The inode is marked as NODATACOW. In this case we'll just use the 5675 * existing extent. 5676 * 2) The extent is marked as PREALLOC. We're good to go here and can 5677 * just use the extent. 
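	 * In both cases can_nocow_odirect() (called below) still has to
	 * prove the write can really go in place: the extent must not be
	 * read-only, shared with another file, or covered by existing
	 * checksums.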
5678 * 5679 */ 5680 if (!create) { 5681 len = em->len - (start - em->start); 5682 goto map; 5683 } 5684 5685 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 5686 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 5687 em->block_start != EXTENT_MAP_HOLE)) { 5688 int type; 5689 int ret; 5690 u64 block_start; 5691 5692 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5693 type = BTRFS_ORDERED_PREALLOC; 5694 else 5695 type = BTRFS_ORDERED_NOCOW; 5696 len = min(len, em->len - (start - em->start)); 5697 block_start = em->block_start + (start - em->start); 5698 5699 /* 5700 * we're not going to log anything, but we do need 5701 * to make sure the current transaction stays open 5702 * while we look for nocow cross refs 5703 */ 5704 trans = btrfs_join_transaction(root, 0); 5705 if (IS_ERR(trans)) 5706 goto must_cow; 5707 5708 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5709 ret = btrfs_add_ordered_extent_dio(inode, start, 5710 block_start, len, len, type); 5711 btrfs_end_transaction(trans, root); 5712 if (ret) { 5713 free_extent_map(em); 5714 return ret; 5715 } 5716 goto unlock; 5717 } 5718 btrfs_end_transaction(trans, root); 5719 } 5720 must_cow: 5721 /* 5722 * this will cow the extent, reset the len in case we changed 5723 * it above 5724 */ 5725 len = bh_result->b_size; 5726 em = btrfs_new_extent_direct(inode, em, start, len); 5727 if (IS_ERR(em)) 5728 return PTR_ERR(em); 5729 len = min(len, em->len - (start - em->start)); 5730 unlock: 5731 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, 5732 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, 5733 0, NULL, GFP_NOFS); 5734 map: 5735 bh_result->b_blocknr = (em->block_start + (start - em->start)) >> 5736 inode->i_blkbits; 5737 bh_result->b_size = len; 5738 bh_result->b_bdev = em->bdev; 5739 set_buffer_mapped(bh_result); 5740 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5741 set_buffer_new(bh_result); 5742 5743 free_extent_map(em); 5744 5745 return 0; 5746 } 5747 5748 struct btrfs_dio_private { 5749 struct inode *inode; 5750 u64 logical_offset; 5751 u64 disk_bytenr; 5752 u64 bytes; 5753 u32 *csums; 5754 void *private; 5755 5756 /* number of bios pending for this dio */ 5757 atomic_t pending_bios; 5758 5759 /* IO errors */ 5760 int errors; 5761 5762 struct bio *orig_bio; 5763 }; 5764 5765 static void btrfs_endio_direct_read(struct bio *bio, int err) 5766 { 5767 struct btrfs_dio_private *dip = bio->bi_private; 5768 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5769 struct bio_vec *bvec = bio->bi_io_vec; 5770 struct inode *inode = dip->inode; 5771 struct btrfs_root *root = BTRFS_I(inode)->root; 5772 u64 start; 5773 u32 *private = dip->csums; 5774 5775 start = dip->logical_offset; 5776 do { 5777 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 5778 struct page *page = bvec->bv_page; 5779 char *kaddr; 5780 u32 csum = ~(u32)0; 5781 unsigned long flags; 5782 5783 local_irq_save(flags); 5784 kaddr = kmap_atomic(page, KM_IRQ0); 5785 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, 5786 csum, bvec->bv_len); 5787 btrfs_csum_final(csum, (char *)&csum); 5788 kunmap_atomic(kaddr, KM_IRQ0); 5789 local_irq_restore(flags); 5790 5791 flush_dcache_page(bvec->bv_page); 5792 if (csum != *private) { 5793 printk(KERN_ERR "btrfs csum failed ino %lu off" 5794 " %llu csum %u private %u\n", 5795 inode->i_ino, (unsigned long long)start, 5796 csum, *private); 5797 err = -EIO; 5798 } 5799 } 5800 5801 start += bvec->bv_len; 5802 private++; 5803 bvec++; 5804 } while (bvec <= bvec_end); 5805 5806 
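	/*
	 * Every page of the read has been checked against dip->csums at
	 * this point; drop the extent range lock held for the DIO and
	 * hand completion back to the generic direct-io code.
	 */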
	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
	bio->bi_private = dip->private;

	kfree(dip->csums);
	kfree(dip);

	/* If we had a csum failure make sure to clear the uptodate flag */
	if (err)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	dio_end_io(bio, err);
}

static void btrfs_endio_direct_write(struct bio *bio, int err)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_ordered_extent *ordered = NULL;
	struct extent_state *cached_state = NULL;
	u64 ordered_offset = dip->logical_offset;
	u64 ordered_bytes = dip->bytes;
	int ret;

	if (err)
		goto out_done;
again:
	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
						   &ordered_offset,
						   ordered_bytes);
	if (!ret)
		goto out_test;

	BUG_ON(!ordered);

	trans = btrfs_join_transaction(root, 1);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
		if (!ret)
			ret = btrfs_update_inode(trans, root, inode);
		err = ret;
		goto out;
	}

	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
			 ordered->file_offset + ordered->len - 1, 0,
			 &cached_state, GFP_NOFS);

	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
		ret = btrfs_mark_extent_written(trans, inode,
						ordered->file_offset,
						ordered->file_offset +
						ordered->len);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else {
		ret = insert_reserved_file_extent(trans, inode,
						  ordered->file_offset,
						  ordered->start,
						  ordered->disk_len,
						  ordered->len,
						  ordered->len,
						  0, 0, 0,
						  BTRFS_FILE_EXTENT_REG);
		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
				   ordered->file_offset, ordered->len);
		if (ret) {
			err = ret;
			WARN_ON(1);
			goto out_unlock;
		}
	}

	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
	ret = btrfs_ordered_update_i_size(inode, 0, ordered);
	if (!ret)
		btrfs_update_inode(trans, root, inode);
	ret = 0;
out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
			     ordered->file_offset + ordered->len - 1,
			     &cached_state, GFP_NOFS);
out:
	btrfs_delalloc_release_metadata(inode, ordered->len);
	if (trans)
		btrfs_end_transaction(trans, root);
	ordered_offset = ordered->file_offset + ordered->len;
	/*
	 * drop both refs on the ordered extent: the one taken by
	 * btrfs_dec_test_first_ordered_pending() above and the base ref,
	 * so the ordered extent is freed here
	 */
	btrfs_put_ordered_extent(ordered);
	btrfs_put_ordered_extent(ordered);

out_test:
	/*
	 * our bio might span multiple ordered extents.
If we haven't 5907 * completed the accounting for the whole dio, go back and try again 5908 */ 5909 if (ordered_offset < dip->logical_offset + dip->bytes) { 5910 ordered_bytes = dip->logical_offset + dip->bytes - 5911 ordered_offset; 5912 goto again; 5913 } 5914 out_done: 5915 bio->bi_private = dip->private; 5916 5917 kfree(dip->csums); 5918 kfree(dip); 5919 5920 /* If we had an error make sure to clear the uptodate flag */ 5921 if (err) 5922 clear_bit(BIO_UPTODATE, &bio->bi_flags); 5923 dio_end_io(bio, err); 5924 } 5925 5926 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, 5927 struct bio *bio, int mirror_num, 5928 unsigned long bio_flags, u64 offset) 5929 { 5930 int ret; 5931 struct btrfs_root *root = BTRFS_I(inode)->root; 5932 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); 5933 BUG_ON(ret); 5934 return 0; 5935 } 5936 5937 static void btrfs_end_dio_bio(struct bio *bio, int err) 5938 { 5939 struct btrfs_dio_private *dip = bio->bi_private; 5940 5941 if (err) { 5942 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " 5943 "sector %#Lx len %u err no %d\n", 5944 dip->inode->i_ino, bio->bi_rw, 5945 (unsigned long long)bio->bi_sector, bio->bi_size, err); 5946 dip->errors = 1; 5947 5948 /* 5949 * before atomic variable goto zero, we must make sure 5950 * dip->errors is perceived to be set. 5951 */ 5952 smp_mb__before_atomic_dec(); 5953 } 5954 5955 /* if there are more bios still pending for this dio, just exit */ 5956 if (!atomic_dec_and_test(&dip->pending_bios)) 5957 goto out; 5958 5959 if (dip->errors) 5960 bio_io_error(dip->orig_bio); 5961 else { 5962 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); 5963 bio_endio(dip->orig_bio, 0); 5964 } 5965 out: 5966 bio_put(bio); 5967 } 5968 5969 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, 5970 u64 first_sector, gfp_t gfp_flags) 5971 { 5972 int nr_vecs = bio_get_nr_vecs(bdev); 5973 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 5974 } 5975 5976 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5977 int rw, u64 file_offset, int skip_sum, 5978 u32 *csums, int async_submit) 5979 { 5980 int write = rw & REQ_WRITE; 5981 struct btrfs_root *root = BTRFS_I(inode)->root; 5982 int ret; 5983 5984 bio_get(bio); 5985 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5986 if (ret) 5987 goto err; 5988 5989 if (skip_sum) 5990 goto map; 5991 5992 if (write && async_submit) { 5993 ret = btrfs_wq_submit_bio(root->fs_info, 5994 inode, rw, bio, 0, 0, 5995 file_offset, 5996 __btrfs_submit_bio_start_direct_io, 5997 __btrfs_submit_bio_done); 5998 goto err; 5999 } else if (write) { 6000 /* 6001 * If we aren't doing async submit, calculate the csum of the 6002 * bio now. 
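	 * Reads take the opposite path below: their checksums are looked
	 * up from the csum tree so completion can verify each page.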
6003		 */
6004		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6005		if (ret)
6006			goto err;
6007	} else if (!skip_sum) {
6008		ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
6009						file_offset, csums);
6010		if (ret)
6011			goto err;
6012	}
6013
6014 map:
6015	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
6016 err:
6017	bio_put(bio);
6018	return ret;
6019 }
6020
6021 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6022				    int skip_sum)
6023 {
6024	struct inode *inode = dip->inode;
6025	struct btrfs_root *root = BTRFS_I(inode)->root;
6026	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6027	struct bio *bio;
6028	struct bio *orig_bio = dip->orig_bio;
6029	struct bio_vec *bvec = orig_bio->bi_io_vec;
6030	u64 start_sector = orig_bio->bi_sector;
6031	u64 file_offset = dip->logical_offset;
6032	u64 submit_len = 0;
6033	u64 map_length;
6034	int nr_pages = 0;
6035	u32 *csums = dip->csums;
6036	int ret = 0;
6037	int async_submit = 0;
6038	int write = rw & REQ_WRITE;
6039
6040	map_length = orig_bio->bi_size;
6041	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
6042			      &map_length, NULL, 0);
6043	if (ret) {
6044		bio_put(orig_bio);
6045		return -EIO;
6046	}
6047
6048	if (map_length >= orig_bio->bi_size) {
6049		bio = orig_bio;
6050		goto submit;
6051	}
6052
6053	async_submit = 1;
6054	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6055	if (!bio)
6056		return -ENOMEM;
6057	bio->bi_private = dip;
6058	bio->bi_end_io = btrfs_end_dio_bio;
6059	atomic_inc(&dip->pending_bios);
6060
6061	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
6062		if (unlikely(map_length < submit_len + bvec->bv_len ||
6063		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
6064				 bvec->bv_offset) < bvec->bv_len)) {
6065			/*
6066			 * take a ref on pending_bios before we submit the
6067			 * bio, so the end IO handler can't drop the count
6068			 * to zero and free the dip while we're still
6069			 * setting it up
6070			 */
6071			atomic_inc(&dip->pending_bios);
6072			ret = __btrfs_submit_dio_bio(bio, inode, rw,
6073						     file_offset, skip_sum,
6074						     csums, async_submit);
6075			if (ret) {
6076				bio_put(bio);
6077				atomic_dec(&dip->pending_bios);
6078				goto out_err;
6079			}
6080
6081			/* Writes use the ordered csums */
6082			if (!write && !skip_sum)
6083				csums = csums + nr_pages;
6084			start_sector += submit_len >> 9;
6085			file_offset += submit_len;
6086
6087			submit_len = 0;
6088			nr_pages = 0;
6089
6090			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
6091						  start_sector, GFP_NOFS);
6092			if (!bio)
6093				goto out_err;
6094			bio->bi_private = dip;
6095			bio->bi_end_io = btrfs_end_dio_bio;
6096
6097			map_length = orig_bio->bi_size;
6098			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
6099					      &map_length, NULL, 0);
6100			if (ret) {
6101				bio_put(bio);
6102				goto out_err;
6103			}
6104		} else {
6105			submit_len += bvec->bv_len;
6106			nr_pages++;
6107			bvec++;
6108		}
6109	}
6110
6111 submit:
6112	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6113				     csums, async_submit);
6114	if (!ret)
6115		return 0;
6116
6117	bio_put(bio);
6118 out_err:
6119	dip->errors = 1;
6120	/*
6121	 * make sure dip->errors is seen as set before the pending_bios
6122	 * count can drop to zero.
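	 *
	 * This pairs with the atomic_dec_and_test() in btrfs_end_dio_bio():
	 * whichever context drops pending_bios to zero must also observe
	 * dip->errors, or it could complete dip->orig_bio with success
	 * even though part of the dio failed.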
6123	 */
6124	smp_mb__before_atomic_dec();
6125	if (atomic_dec_and_test(&dip->pending_bios))
6126		bio_io_error(dip->orig_bio);
6127
6128	/* bio_end_io() will handle the error, so we needn't return it */
6129	return 0;
6130 }
6131
6132 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
6133				loff_t file_offset)
6134 {
6135	struct btrfs_root *root = BTRFS_I(inode)->root;
6136	struct btrfs_dio_private *dip;
6137	struct bio_vec *bvec = bio->bi_io_vec;
6138	int skip_sum;
6139	int write = rw & REQ_WRITE;
6140	int ret = 0;
6141
6142	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
6143
6144	dip = kmalloc(sizeof(*dip), GFP_NOFS);
6145	if (!dip) {
6146		ret = -ENOMEM;
6147		goto free_ordered;
6148	}
6149	dip->csums = NULL;
6150
6151	/* Writes use the ordered csum stuff, so we don't need dip->csums */
6152	if (!write && !skip_sum) {
6153		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6154		if (!dip->csums) {
6155			kfree(dip);
6156			ret = -ENOMEM;
6157			goto free_ordered;
6158		}
6159	}
6160
6161	dip->private = bio->bi_private;
6162	dip->inode = inode;
6163	dip->logical_offset = file_offset;
6164
6165	dip->bytes = 0;
6166	do {
6167		dip->bytes += bvec->bv_len;
6168		bvec++;
6169	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
6170
6171	dip->disk_bytenr = (u64)bio->bi_sector << 9;
6172	bio->bi_private = dip;
6173	dip->errors = 0;
6174	dip->orig_bio = bio;
6175	atomic_set(&dip->pending_bios, 0);
6176
6177	if (write)
6178		bio->bi_end_io = btrfs_endio_direct_write;
6179	else
6180		bio->bi_end_io = btrfs_endio_direct_read;
6181
6182	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
6183	if (!ret)
6184		return;
6185 free_ordered:
6186	/*
6187	 * If this is a write, we need to clean up the reserved space and kill
6188	 * the ordered extent.
6189	 */
6190	if (write) {
6191		struct btrfs_ordered_extent *ordered;
6192		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
6193		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
6194		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
6195			btrfs_free_reserved_extent(root, ordered->start,
6196						   ordered->disk_len);
6197		btrfs_put_ordered_extent(ordered);
6198		btrfs_put_ordered_extent(ordered);
6199	}
6200	bio_endio(bio, ret);
6201 }
6202
6203 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
6204			const struct iovec *iov, loff_t offset,
6205			unsigned long nr_segs)
6206 {
6207	int seg;
6208	int i;
6209	size_t size;
6210	unsigned long addr;
6211	unsigned blocksize_mask = root->sectorsize - 1;
6212	ssize_t retval = -EINVAL;
6213	loff_t end = offset;
6214
6215	if (offset & blocksize_mask)
6216		goto out;
6217
6218	/* Check the memory alignment.  Blocks cannot straddle pages */
6219	for (seg = 0; seg < nr_segs; seg++) {
6220		addr = (unsigned long)iov[seg].iov_base;
6221		size = iov[seg].iov_len;
6222		end += size;
6223		if ((addr & blocksize_mask) || (size & blocksize_mask))
6224			goto out;
6225
6226		/* If this is a write we don't need to check anymore */
6227		if (rw & WRITE)
6228			continue;
6229
6230		/*
6231		 * Check that we don't have duplicate iov_base pointers in
6232		 * this iovec; if we do, return EINVAL, otherwise we'll get
6233		 * csum errors when reading back.
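		 *
		 * (Best guess at the mechanism: on reads the csums in
		 * dip->csums are paired up with bio_vecs one page at a
		 * time, and aliased user pages make that pairing
		 * ambiguous.)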
6234		 */
6235		for (i = seg + 1; i < nr_segs; i++) {
6236			if (iov[seg].iov_base == iov[i].iov_base)
6237				goto out;
6238		}
6239	}
6240	retval = 0;
6241 out:
6242	return retval;
6243 }

6244 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6245			const struct iovec *iov, loff_t offset,
6246			unsigned long nr_segs)
6247 {
6248	struct file *file = iocb->ki_filp;
6249	struct inode *inode = file->f_mapping->host;
6250	struct btrfs_ordered_extent *ordered;
6251	struct extent_state *cached_state = NULL;
6252	u64 lockstart, lockend;
6253	ssize_t ret;
6254	int writing = rw & WRITE;
6255	int write_bits = 0;
6256	size_t count = iov_length(iov, nr_segs);
6257
6258	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6259			    offset, nr_segs)) {
6260		return 0;
6261	}
6262
6263	lockstart = offset;
6264	lockend = offset + count - 1;
6265
6266	if (writing) {
6267		ret = btrfs_delalloc_reserve_space(inode, count);
6268		if (ret)
6269			goto out;
6270	}
6271
6272	while (1) {
6273		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6274				 0, &cached_state, GFP_NOFS);
6275		/*
6276		 * We're concerned with the entire range that we're going to be
6277		 * doing DIO to, so we need to make sure there are no ordered
6278		 * extents in this range.
6279		 */
6280		ordered = btrfs_lookup_ordered_range(inode, lockstart,
6281						     lockend - lockstart + 1);
6282		if (!ordered)
6283			break;
6284		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6285				     &cached_state, GFP_NOFS);
6286		btrfs_start_ordered_extent(inode, ordered, 1);
6287		btrfs_put_ordered_extent(ordered);
6288		cond_resched();
6289	}
6290
6291	/*
6292	 * we don't use btrfs_set_extent_delalloc because we don't want
6293	 * the dirty or uptodate bits
6294	 */
6295	if (writing) {
6296		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
6297		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6298				     EXTENT_DELALLOC, 0, NULL, &cached_state,
6299				     GFP_NOFS);
6300		if (ret) {
6301			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6302					 lockend, EXTENT_LOCKED | write_bits,
6303					 1, 0, &cached_state, GFP_NOFS);
6304			goto out;
6305		}
6306	}
6307
6308	free_extent_state(cached_state);
6309	cached_state = NULL;
6310
6311	ret = __blockdev_direct_IO(rw, iocb, inode,
6312		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6313		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6314		   btrfs_submit_direct, 0);
6315
6316	if (ret < 0 && ret != -EIOCBQUEUED) {
6317		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
6318				 offset + iov_length(iov, nr_segs) - 1,
6319				 EXTENT_LOCKED | write_bits, 1, 0,
6320				 &cached_state, GFP_NOFS);
6321	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
6322		/*
6323		 * We're falling back to buffered, so unlock the section we
6324		 * didn't do IO on.
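		 *
		 * The caller will retry that remainder through the
		 * buffered path, so only [offset + ret, end] still
		 * carries our EXTENT_LOCKED and write bits.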
6325 */ 6326 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, 6327 offset + iov_length(iov, nr_segs) - 1, 6328 EXTENT_LOCKED | write_bits, 1, 0, 6329 &cached_state, GFP_NOFS); 6330 } 6331 out: 6332 free_extent_state(cached_state); 6333 return ret; 6334 } 6335 6336 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6337 __u64 start, __u64 len) 6338 { 6339 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6340 } 6341 6342 int btrfs_readpage(struct file *file, struct page *page) 6343 { 6344 struct extent_io_tree *tree; 6345 tree = &BTRFS_I(page->mapping->host)->io_tree; 6346 return extent_read_full_page(tree, page, btrfs_get_extent); 6347 } 6348 6349 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6350 { 6351 struct extent_io_tree *tree; 6352 6353 6354 if (current->flags & PF_MEMALLOC) { 6355 redirty_page_for_writepage(wbc, page); 6356 unlock_page(page); 6357 return 0; 6358 } 6359 tree = &BTRFS_I(page->mapping->host)->io_tree; 6360 return extent_write_full_page(tree, page, btrfs_get_extent, wbc); 6361 } 6362 6363 int btrfs_writepages(struct address_space *mapping, 6364 struct writeback_control *wbc) 6365 { 6366 struct extent_io_tree *tree; 6367 6368 tree = &BTRFS_I(mapping->host)->io_tree; 6369 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 6370 } 6371 6372 static int 6373 btrfs_readpages(struct file *file, struct address_space *mapping, 6374 struct list_head *pages, unsigned nr_pages) 6375 { 6376 struct extent_io_tree *tree; 6377 tree = &BTRFS_I(mapping->host)->io_tree; 6378 return extent_readpages(tree, mapping, pages, nr_pages, 6379 btrfs_get_extent); 6380 } 6381 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6382 { 6383 struct extent_io_tree *tree; 6384 struct extent_map_tree *map; 6385 int ret; 6386 6387 tree = &BTRFS_I(page->mapping->host)->io_tree; 6388 map = &BTRFS_I(page->mapping->host)->extent_tree; 6389 ret = try_release_extent_mapping(map, tree, page, gfp_flags); 6390 if (ret == 1) { 6391 ClearPagePrivate(page); 6392 set_page_private(page, 0); 6393 page_cache_release(page); 6394 } 6395 return ret; 6396 } 6397 6398 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) 6399 { 6400 if (PageWriteback(page) || PageDirty(page)) 6401 return 0; 6402 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); 6403 } 6404 6405 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6406 { 6407 struct extent_io_tree *tree; 6408 struct btrfs_ordered_extent *ordered; 6409 struct extent_state *cached_state = NULL; 6410 u64 page_start = page_offset(page); 6411 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6412 6413 6414 /* 6415 * we have the page locked, so new writeback can't start, 6416 * and the dirty bit won't be cleared while we are here. 
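	 *
	 * PagePrivate2 is btrfs' marker that the page still owes an update
	 * to its ordered extent; whoever clears it must do the
	 * finish_ordered_io accounting (see the TestClearPagePrivate2()
	 * below).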
6417 * 6418 * Wait for IO on this page so that we can safely clear 6419 * the PagePrivate2 bit and do ordered accounting 6420 */ 6421 wait_on_page_writeback(page); 6422 6423 tree = &BTRFS_I(page->mapping->host)->io_tree; 6424 if (offset) { 6425 btrfs_releasepage(page, GFP_NOFS); 6426 return; 6427 } 6428 lock_extent_bits(tree, page_start, page_end, 0, &cached_state, 6429 GFP_NOFS); 6430 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6431 page_offset(page)); 6432 if (ordered) { 6433 /* 6434 * IO on this page will never be started, so we need 6435 * to account for any ordered extents now 6436 */ 6437 clear_extent_bit(tree, page_start, page_end, 6438 EXTENT_DIRTY | EXTENT_DELALLOC | 6439 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6440 &cached_state, GFP_NOFS); 6441 /* 6442 * whoever cleared the private bit is responsible 6443 * for the finish_ordered_io 6444 */ 6445 if (TestClearPagePrivate2(page)) { 6446 btrfs_finish_ordered_io(page->mapping->host, 6447 page_start, page_end); 6448 } 6449 btrfs_put_ordered_extent(ordered); 6450 cached_state = NULL; 6451 lock_extent_bits(tree, page_start, page_end, 0, &cached_state, 6452 GFP_NOFS); 6453 } 6454 clear_extent_bit(tree, page_start, page_end, 6455 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6456 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); 6457 __btrfs_releasepage(page, GFP_NOFS); 6458 6459 ClearPageChecked(page); 6460 if (PagePrivate(page)) { 6461 ClearPagePrivate(page); 6462 set_page_private(page, 0); 6463 page_cache_release(page); 6464 } 6465 } 6466 6467 /* 6468 * btrfs_page_mkwrite() is not allowed to change the file size as it gets 6469 * called from a page fault handler when a page is first dirtied. Hence we must 6470 * be careful to check for EOF conditions here. We set the page up correctly 6471 * for a written page which means we get ENOSPC checking when writing into 6472 * holes and correct delalloc and unwritten extent mapping on filesystems that 6473 * support these features. 6474 * 6475 * We are not allowed to take the i_mutex here so we have to play games to 6476 * protect against truncate races as the page could now be beyond EOF. Because 6477 * vmtruncate() writes the inode size before removing pages, once we have the 6478 * page lock we can determine safely if the page is beyond EOF. If it is not 6479 * beyond EOF, then the page is guaranteed safe against truncation until we 6480 * unlock the page. 
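 *
 * Losing that race is harmless here: ret is primed with
 * VM_FAULT_NOPAGE, so bailing out simply makes the VM retry the fault.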
6481 */ 6482 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 6483 { 6484 struct page *page = vmf->page; 6485 struct inode *inode = fdentry(vma->vm_file)->d_inode; 6486 struct btrfs_root *root = BTRFS_I(inode)->root; 6487 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6488 struct btrfs_ordered_extent *ordered; 6489 struct extent_state *cached_state = NULL; 6490 char *kaddr; 6491 unsigned long zero_start; 6492 loff_t size; 6493 int ret; 6494 u64 page_start; 6495 u64 page_end; 6496 6497 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6498 if (ret) { 6499 if (ret == -ENOMEM) 6500 ret = VM_FAULT_OOM; 6501 else /* -ENOSPC, -EIO, etc */ 6502 ret = VM_FAULT_SIGBUS; 6503 goto out; 6504 } 6505 6506 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6507 again: 6508 lock_page(page); 6509 size = i_size_read(inode); 6510 page_start = page_offset(page); 6511 page_end = page_start + PAGE_CACHE_SIZE - 1; 6512 6513 if ((page->mapping != inode->i_mapping) || 6514 (page_start >= size)) { 6515 /* page got truncated out from underneath us */ 6516 goto out_unlock; 6517 } 6518 wait_on_page_writeback(page); 6519 6520 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, 6521 GFP_NOFS); 6522 set_page_extent_mapped(page); 6523 6524 /* 6525 * we can't set the delalloc bits if there are pending ordered 6526 * extents. Drop our locks and wait for them to finish 6527 */ 6528 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6529 if (ordered) { 6530 unlock_extent_cached(io_tree, page_start, page_end, 6531 &cached_state, GFP_NOFS); 6532 unlock_page(page); 6533 btrfs_start_ordered_extent(inode, ordered, 1); 6534 btrfs_put_ordered_extent(ordered); 6535 goto again; 6536 } 6537 6538 /* 6539 * XXX - page_mkwrite gets called every time the page is dirtied, even 6540 * if it was already dirty, so for space accounting reasons we need to 6541 * clear any delalloc bits for the range we are fixing to save. There 6542 * is probably a better way to do this, but for now keep consistent with 6543 * prepare_pages in the normal write path. 
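	 *
	 * (We reserved space for this page again at the top of this
	 * function, so if the range was already delalloc, the clear below
	 * releases the old accounting first; otherwise the same range
	 * would be counted twice.)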
6544 */ 6545 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, 6546 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6547 0, 0, &cached_state, GFP_NOFS); 6548 6549 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 6550 &cached_state); 6551 if (ret) { 6552 unlock_extent_cached(io_tree, page_start, page_end, 6553 &cached_state, GFP_NOFS); 6554 ret = VM_FAULT_SIGBUS; 6555 goto out_unlock; 6556 } 6557 ret = 0; 6558 6559 /* page is wholly or partially inside EOF */ 6560 if (page_start + PAGE_CACHE_SIZE > size) 6561 zero_start = size & ~PAGE_CACHE_MASK; 6562 else 6563 zero_start = PAGE_CACHE_SIZE; 6564 6565 if (zero_start != PAGE_CACHE_SIZE) { 6566 kaddr = kmap(page); 6567 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); 6568 flush_dcache_page(page); 6569 kunmap(page); 6570 } 6571 ClearPageChecked(page); 6572 set_page_dirty(page); 6573 SetPageUptodate(page); 6574 6575 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6576 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6577 6578 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6579 6580 out_unlock: 6581 if (!ret) 6582 return VM_FAULT_LOCKED; 6583 unlock_page(page); 6584 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 6585 out: 6586 return ret; 6587 } 6588 6589 static int btrfs_truncate(struct inode *inode) 6590 { 6591 struct btrfs_root *root = BTRFS_I(inode)->root; 6592 int ret; 6593 int err = 0; 6594 struct btrfs_trans_handle *trans; 6595 unsigned long nr; 6596 u64 mask = root->sectorsize - 1; 6597 6598 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6599 if (ret) 6600 return ret; 6601 6602 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6603 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6604 6605 trans = btrfs_start_transaction(root, 5); 6606 if (IS_ERR(trans)) 6607 return PTR_ERR(trans); 6608 6609 btrfs_set_trans_block_group(trans, inode); 6610 6611 ret = btrfs_orphan_add(trans, inode); 6612 if (ret) { 6613 btrfs_end_transaction(trans, root); 6614 return ret; 6615 } 6616 6617 nr = trans->blocks_used; 6618 btrfs_end_transaction(trans, root); 6619 btrfs_btree_balance_dirty(root, nr); 6620 6621 /* Now start a transaction for the truncate */ 6622 trans = btrfs_start_transaction(root, 0); 6623 if (IS_ERR(trans)) 6624 return PTR_ERR(trans); 6625 btrfs_set_trans_block_group(trans, inode); 6626 trans->block_rsv = root->orphan_block_rsv; 6627 6628 /* 6629 * setattr is responsible for setting the ordered_data_close flag, 6630 * but that is only tested during the last file release. That 6631 * could happen well after the next commit, leaving a great big 6632 * window where new writes may get lost if someone chooses to write 6633 * to this file after truncating to zero 6634 * 6635 * The inode doesn't have any dirty data here, and so if we commit 6636 * this is a noop. If someone immediately starts writing to the inode 6637 * it is very likely we'll catch some of their writes in this 6638 * transaction, and the commit will find this file on the ordered 6639 * data list with good things to send down. 6640 * 6641 * This is a best effort solution, there is still a window where 6642 * using truncate to replace the contents of the file will 6643 * end up with a zero length file after a crash. 
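	 *
	 * (That is what the btrfs_add_ordered_operation() call below is
	 * for: it puts the inode on the list the next commit scans, so
	 * those early writes are flushed with the commit instead of
	 * waiting for a later one.)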
6644 */ 6645 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 6646 btrfs_add_ordered_operation(trans, root, inode); 6647 6648 while (1) { 6649 if (!trans) { 6650 trans = btrfs_start_transaction(root, 0); 6651 if (IS_ERR(trans)) 6652 return PTR_ERR(trans); 6653 btrfs_set_trans_block_group(trans, inode); 6654 trans->block_rsv = root->orphan_block_rsv; 6655 } 6656 6657 ret = btrfs_block_rsv_check(trans, root, 6658 root->orphan_block_rsv, 0, 5); 6659 if (ret == -EAGAIN) { 6660 ret = btrfs_commit_transaction(trans, root); 6661 if (ret) 6662 return ret; 6663 trans = NULL; 6664 continue; 6665 } else if (ret) { 6666 err = ret; 6667 break; 6668 } 6669 6670 ret = btrfs_truncate_inode_items(trans, root, inode, 6671 inode->i_size, 6672 BTRFS_EXTENT_DATA_KEY); 6673 if (ret != -EAGAIN) { 6674 err = ret; 6675 break; 6676 } 6677 6678 ret = btrfs_update_inode(trans, root, inode); 6679 if (ret) { 6680 err = ret; 6681 break; 6682 } 6683 6684 nr = trans->blocks_used; 6685 btrfs_end_transaction(trans, root); 6686 trans = NULL; 6687 btrfs_btree_balance_dirty(root, nr); 6688 } 6689 6690 if (ret == 0 && inode->i_nlink > 0) { 6691 ret = btrfs_orphan_del(trans, inode); 6692 if (ret) 6693 err = ret; 6694 } else if (ret && inode->i_nlink > 0) { 6695 /* 6696 * Failed to do the truncate, remove us from the in memory 6697 * orphan list. 6698 */ 6699 ret = btrfs_orphan_del(NULL, inode); 6700 } 6701 6702 ret = btrfs_update_inode(trans, root, inode); 6703 if (ret && !err) 6704 err = ret; 6705 6706 nr = trans->blocks_used; 6707 ret = btrfs_end_transaction_throttle(trans, root); 6708 if (ret && !err) 6709 err = ret; 6710 btrfs_btree_balance_dirty(root, nr); 6711 6712 return err; 6713 } 6714 6715 /* 6716 * create a new subvolume directory/inode (helper for the ioctl). 6717 */ 6718 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6719 struct btrfs_root *new_root, 6720 u64 new_dirid, u64 alloc_hint) 6721 { 6722 struct inode *inode; 6723 int err; 6724 u64 index = 0; 6725 6726 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 6727 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 6728 if (IS_ERR(inode)) 6729 return PTR_ERR(inode); 6730 inode->i_op = &btrfs_dir_inode_operations; 6731 inode->i_fop = &btrfs_dir_file_operations; 6732 6733 inode->i_nlink = 1; 6734 btrfs_i_size_write(inode, 0); 6735 6736 err = btrfs_update_inode(trans, new_root, inode); 6737 BUG_ON(err); 6738 6739 iput(inode); 6740 return 0; 6741 } 6742 6743 /* helper function for file defrag and space balancing. 
This 6744 * forces readahead on a given range of bytes in an inode 6745 */ 6746 unsigned long btrfs_force_ra(struct address_space *mapping, 6747 struct file_ra_state *ra, struct file *file, 6748 pgoff_t offset, pgoff_t last_index) 6749 { 6750 pgoff_t req_size = last_index - offset + 1; 6751 6752 page_cache_sync_readahead(mapping, ra, file, offset, req_size); 6753 return offset + req_size; 6754 } 6755 6756 struct inode *btrfs_alloc_inode(struct super_block *sb) 6757 { 6758 struct btrfs_inode *ei; 6759 struct inode *inode; 6760 6761 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6762 if (!ei) 6763 return NULL; 6764 6765 ei->root = NULL; 6766 ei->space_info = NULL; 6767 ei->generation = 0; 6768 ei->sequence = 0; 6769 ei->last_trans = 0; 6770 ei->last_sub_trans = 0; 6771 ei->logged_trans = 0; 6772 ei->delalloc_bytes = 0; 6773 ei->reserved_bytes = 0; 6774 ei->disk_i_size = 0; 6775 ei->flags = 0; 6776 ei->index_cnt = (u64)-1; 6777 ei->last_unlink_trans = 0; 6778 6779 atomic_set(&ei->outstanding_extents, 0); 6780 atomic_set(&ei->reserved_extents, 0); 6781 6782 ei->ordered_data_close = 0; 6783 ei->orphan_meta_reserved = 0; 6784 ei->dummy_inode = 0; 6785 ei->force_compress = BTRFS_COMPRESS_NONE; 6786 6787 inode = &ei->vfs_inode; 6788 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6789 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); 6790 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); 6791 mutex_init(&ei->log_mutex); 6792 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6793 INIT_LIST_HEAD(&ei->i_orphan); 6794 INIT_LIST_HEAD(&ei->delalloc_inodes); 6795 INIT_LIST_HEAD(&ei->ordered_operations); 6796 RB_CLEAR_NODE(&ei->rb_node); 6797 6798 return inode; 6799 } 6800 6801 static void btrfs_i_callback(struct rcu_head *head) 6802 { 6803 struct inode *inode = container_of(head, struct inode, i_rcu); 6804 INIT_LIST_HEAD(&inode->i_dentry); 6805 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6806 } 6807 6808 void btrfs_destroy_inode(struct inode *inode) 6809 { 6810 struct btrfs_ordered_extent *ordered; 6811 struct btrfs_root *root = BTRFS_I(inode)->root; 6812 6813 WARN_ON(!list_empty(&inode->i_dentry)); 6814 WARN_ON(inode->i_data.nrpages); 6815 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6816 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); 6817 6818 /* 6819 * This can happen where we create an inode, but somebody else also 6820 * created the same inode and we need to destroy the one we already 6821 * created. 6822 */ 6823 if (!root) 6824 goto free; 6825 6826 /* 6827 * Make sure we're properly removed from the ordered operation 6828 * lists. 
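	 *
	 * The smp_mb() below makes sure we observe a list_add done on
	 * another CPU before this inode reached destroy; the unlocked
	 * list_empty() check is only an optimization, the real removal
	 * happens under ordered_extent_lock.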
6829 */ 6830 smp_mb(); 6831 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 6832 spin_lock(&root->fs_info->ordered_extent_lock); 6833 list_del_init(&BTRFS_I(inode)->ordered_operations); 6834 spin_unlock(&root->fs_info->ordered_extent_lock); 6835 } 6836 6837 if (root == root->fs_info->tree_root) { 6838 struct btrfs_block_group_cache *block_group; 6839 6840 block_group = btrfs_lookup_block_group(root->fs_info, 6841 BTRFS_I(inode)->block_group); 6842 if (block_group && block_group->inode == inode) { 6843 spin_lock(&block_group->lock); 6844 block_group->inode = NULL; 6845 spin_unlock(&block_group->lock); 6846 btrfs_put_block_group(block_group); 6847 } else if (block_group) { 6848 btrfs_put_block_group(block_group); 6849 } 6850 } 6851 6852 spin_lock(&root->orphan_lock); 6853 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6854 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6855 inode->i_ino); 6856 list_del_init(&BTRFS_I(inode)->i_orphan); 6857 } 6858 spin_unlock(&root->orphan_lock); 6859 6860 while (1) { 6861 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6862 if (!ordered) 6863 break; 6864 else { 6865 printk(KERN_ERR "btrfs found ordered " 6866 "extent %llu %llu on inode cleanup\n", 6867 (unsigned long long)ordered->file_offset, 6868 (unsigned long long)ordered->len); 6869 btrfs_remove_ordered_extent(inode, ordered); 6870 btrfs_put_ordered_extent(ordered); 6871 btrfs_put_ordered_extent(ordered); 6872 } 6873 } 6874 inode_tree_del(inode); 6875 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6876 free: 6877 call_rcu(&inode->i_rcu, btrfs_i_callback); 6878 } 6879 6880 int btrfs_drop_inode(struct inode *inode) 6881 { 6882 struct btrfs_root *root = BTRFS_I(inode)->root; 6883 6884 if (btrfs_root_refs(&root->root_item) == 0 && 6885 root != root->fs_info->tree_root) 6886 return 1; 6887 else 6888 return generic_drop_inode(inode); 6889 } 6890 6891 static void init_once(void *foo) 6892 { 6893 struct btrfs_inode *ei = (struct btrfs_inode *) foo; 6894 6895 inode_init_once(&ei->vfs_inode); 6896 } 6897 6898 void btrfs_destroy_cachep(void) 6899 { 6900 if (btrfs_inode_cachep) 6901 kmem_cache_destroy(btrfs_inode_cachep); 6902 if (btrfs_trans_handle_cachep) 6903 kmem_cache_destroy(btrfs_trans_handle_cachep); 6904 if (btrfs_transaction_cachep) 6905 kmem_cache_destroy(btrfs_transaction_cachep); 6906 if (btrfs_path_cachep) 6907 kmem_cache_destroy(btrfs_path_cachep); 6908 if (btrfs_free_space_cachep) 6909 kmem_cache_destroy(btrfs_free_space_cachep); 6910 } 6911 6912 int btrfs_init_cachep(void) 6913 { 6914 btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", 6915 sizeof(struct btrfs_inode), 0, 6916 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); 6917 if (!btrfs_inode_cachep) 6918 goto fail; 6919 6920 btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", 6921 sizeof(struct btrfs_trans_handle), 0, 6922 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 6923 if (!btrfs_trans_handle_cachep) 6924 goto fail; 6925 6926 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", 6927 sizeof(struct btrfs_transaction), 0, 6928 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 6929 if (!btrfs_transaction_cachep) 6930 goto fail; 6931 6932 btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", 6933 sizeof(struct btrfs_path), 0, 6934 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 6935 if (!btrfs_path_cachep) 6936 goto fail; 6937 6938 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", 6939 sizeof(struct btrfs_free_space), 0, 6940 
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6941	if (!btrfs_free_space_cachep)
6942		goto fail;
6943
6944	return 0;
6945 fail:
6946	btrfs_destroy_cachep();
6947	return -ENOMEM;
6948 }
6949
6950 static int btrfs_getattr(struct vfsmount *mnt,
6951			 struct dentry *dentry, struct kstat *stat)
6952 {
6953	struct inode *inode = dentry->d_inode;
6954	generic_fillattr(inode, stat);
6955	stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
6956	stat->blksize = PAGE_CACHE_SIZE;
6957	stat->blocks = (inode_get_bytes(inode) +
6958			BTRFS_I(inode)->delalloc_bytes) >> 9;
6959	return 0;
6960 }
6961
6962 /*
6963  * If a file is moved, it will inherit the COW and compression flags of the
6964  * new directory.
6965  */
6966 static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6967 {
6968	struct btrfs_inode *b_dir = BTRFS_I(dir);
6969	struct btrfs_inode *b_inode = BTRFS_I(inode);
6970
6971	if (b_dir->flags & BTRFS_INODE_NODATACOW)
6972		b_inode->flags |= BTRFS_INODE_NODATACOW;
6973	else
6974		b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6975
6976	if (b_dir->flags & BTRFS_INODE_COMPRESS)
6977		b_inode->flags |= BTRFS_INODE_COMPRESS;
6978	else
6979		b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6980 }
6981
6982 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6983			struct inode *new_dir, struct dentry *new_dentry)
6984 {
6985	struct btrfs_trans_handle *trans;
6986	struct btrfs_root *root = BTRFS_I(old_dir)->root;
6987	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
6988	struct inode *new_inode = new_dentry->d_inode;
6989	struct inode *old_inode = old_dentry->d_inode;
6990	struct timespec ctime = CURRENT_TIME;
6991	u64 index = 0;
6992	u64 root_objectid;
6993	int ret;
6994
6995	if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
6996		return -EPERM;
6997
6998	/* we only allow renaming subvolume links between subvolumes */
6999	if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
7000		return -EXDEV;
7001
7002	if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
7003	    (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
7004		return -ENOTEMPTY;
7005
7006	if (S_ISDIR(old_inode->i_mode) && new_inode &&
7007	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7008		return -ENOTEMPTY;
7009	/*
7010	 * we're using rename to replace one file with another, and the
7011	 * replacement file is large.  Start IO on it now so
7012	 * we don't add too much work to the end of the transaction
7013	 */
7014	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
7015	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
7016		filemap_flush(old_inode->i_mapping);
7017
7018	/* close the racy window with snapshot create/destroy ioctl */
7019	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
7020		down_read(&root->fs_info->subvol_sem);
7021	/*
7022	 * We want to reserve the absolute worst case amount of items. So if
7023	 * both inodes are subvols and we need to unlink them then that would
7024	 * require 4 item modifications, but if they are both normal inodes it
7025	 * would require 5 item modifications, so we'll assume they're normal
7026	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
7027	 * should cover the worst case number of items we'll modify.
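	 *
	 * (We actually reserve 20 below, i.e. the 11 from this estimate
	 * with a safety margin on top.)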
7028 */ 7029 trans = btrfs_start_transaction(root, 20); 7030 if (IS_ERR(trans)) { 7031 ret = PTR_ERR(trans); 7032 goto out_notrans; 7033 } 7034 7035 btrfs_set_trans_block_group(trans, new_dir); 7036 7037 if (dest != root) 7038 btrfs_record_root_in_trans(trans, dest); 7039 7040 ret = btrfs_set_inode_index(new_dir, &index); 7041 if (ret) 7042 goto out_fail; 7043 7044 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7045 /* force full log commit if subvolume involved. */ 7046 root->fs_info->last_trans_log_full_commit = trans->transid; 7047 } else { 7048 ret = btrfs_insert_inode_ref(trans, dest, 7049 new_dentry->d_name.name, 7050 new_dentry->d_name.len, 7051 old_inode->i_ino, 7052 new_dir->i_ino, index); 7053 if (ret) 7054 goto out_fail; 7055 /* 7056 * this is an ugly little race, but the rename is required 7057 * to make sure that if we crash, the inode is either at the 7058 * old name or the new one. pinning the log transaction lets 7059 * us make sure we don't allow a log commit to come in after 7060 * we unlink the name but before we add the new name back in. 7061 */ 7062 btrfs_pin_log_trans(root); 7063 } 7064 /* 7065 * make sure the inode gets flushed if it is replacing 7066 * something. 7067 */ 7068 if (new_inode && new_inode->i_size && 7069 old_inode && S_ISREG(old_inode->i_mode)) { 7070 btrfs_add_ordered_operation(trans, root, old_inode); 7071 } 7072 7073 old_dir->i_ctime = old_dir->i_mtime = ctime; 7074 new_dir->i_ctime = new_dir->i_mtime = ctime; 7075 old_inode->i_ctime = ctime; 7076 7077 if (old_dentry->d_parent != new_dentry->d_parent) 7078 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7079 7080 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7081 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7082 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7083 old_dentry->d_name.name, 7084 old_dentry->d_name.len); 7085 } else { 7086 ret = __btrfs_unlink_inode(trans, root, old_dir, 7087 old_dentry->d_inode, 7088 old_dentry->d_name.name, 7089 old_dentry->d_name.len); 7090 if (!ret) 7091 ret = btrfs_update_inode(trans, root, old_inode); 7092 } 7093 BUG_ON(ret); 7094 7095 if (new_inode) { 7096 new_inode->i_ctime = CURRENT_TIME; 7097 if (unlikely(new_inode->i_ino == 7098 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7099 root_objectid = BTRFS_I(new_inode)->location.objectid; 7100 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7101 root_objectid, 7102 new_dentry->d_name.name, 7103 new_dentry->d_name.len); 7104 BUG_ON(new_inode->i_nlink == 0); 7105 } else { 7106 ret = btrfs_unlink_inode(trans, dest, new_dir, 7107 new_dentry->d_inode, 7108 new_dentry->d_name.name, 7109 new_dentry->d_name.len); 7110 } 7111 BUG_ON(ret); 7112 if (new_inode->i_nlink == 0) { 7113 ret = btrfs_orphan_add(trans, new_dentry->d_inode); 7114 BUG_ON(ret); 7115 } 7116 } 7117 7118 fixup_inode_flags(new_dir, old_inode); 7119 7120 ret = btrfs_add_link(trans, new_dir, old_inode, 7121 new_dentry->d_name.name, 7122 new_dentry->d_name.len, 0, index); 7123 BUG_ON(ret); 7124 7125 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 7126 struct dentry *parent = dget_parent(new_dentry); 7127 btrfs_log_new_name(trans, old_inode, old_dir, parent); 7128 dput(parent); 7129 btrfs_end_log_trans(root); 7130 } 7131 out_fail: 7132 btrfs_end_transaction_throttle(trans, root); 7133 out_notrans: 7134 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7135 up_read(&root->fs_info->subvol_sem); 7136 7137 return ret; 7138 } 7139 7140 /* 7141 * some fairly slow code that needs optimization. 
This walks the list
7142  * of all the inodes with pending delalloc and forces them to disk.
7143  */
7144 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7145 {
7146	struct list_head *head = &root->fs_info->delalloc_inodes;
7147	struct btrfs_inode *binode;
7148	struct inode *inode;
7149
7150	if (root->fs_info->sb->s_flags & MS_RDONLY)
7151		return -EROFS;
7152
7153	spin_lock(&root->fs_info->delalloc_lock);
7154	while (!list_empty(head)) {
7155		binode = list_entry(head->next, struct btrfs_inode,
7156				    delalloc_inodes);
7157		inode = igrab(&binode->vfs_inode);
7158		if (!inode)
7159			list_del_init(&binode->delalloc_inodes);
7160		spin_unlock(&root->fs_info->delalloc_lock);
7161		if (inode) {
7162			filemap_flush(inode->i_mapping);
7163			if (delay_iput)
7164				btrfs_add_delayed_iput(inode);
7165			else
7166				iput(inode);
7167		}
7168		cond_resched();
7169		spin_lock(&root->fs_info->delalloc_lock);
7170	}
7171	spin_unlock(&root->fs_info->delalloc_lock);
7172
7173	/* the filemap_flush will queue IO into the worker threads, but
7174	 * we have to make sure the IO is actually started and that
7175	 * ordered extents get created before we return
7176	 */
7177	atomic_inc(&root->fs_info->async_submit_draining);
7178	while (atomic_read(&root->fs_info->nr_async_submits) ||
7179	       atomic_read(&root->fs_info->async_delalloc_pages)) {
7180		wait_event(root->fs_info->async_submit_wait,
7181			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
7182			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7183	}
7184	atomic_dec(&root->fs_info->async_submit_draining);
7185	return 0;
7186 }
7187
7188 int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
7189				   int sync)
7190 {
7191	struct btrfs_inode *binode;
7192	struct inode *inode = NULL;
7193
7194	spin_lock(&root->fs_info->delalloc_lock);
7195	while (!list_empty(&root->fs_info->delalloc_inodes)) {
7196		binode = list_entry(root->fs_info->delalloc_inodes.next,
7197				    struct btrfs_inode, delalloc_inodes);
7198		inode = igrab(&binode->vfs_inode);
7199		if (inode) {
7200			list_move_tail(&binode->delalloc_inodes,
7201				       &root->fs_info->delalloc_inodes);
7202			break;
7203		}
7204
7205		list_del_init(&binode->delalloc_inodes);
7206		cond_resched_lock(&root->fs_info->delalloc_lock);
7207	}
7208	spin_unlock(&root->fs_info->delalloc_lock);
7209
7210	if (inode) {
7211		if (sync) {
7212			filemap_write_and_wait(inode->i_mapping);
7213			/*
7214			 * We have to do this because compression doesn't
7215			 * actually set PG_writeback until it submits the pages
7216			 * for IO, which happens in an async thread, so we could
7217			 * race and not actually wait for any writeback pages
7218			 * because they've not been submitted yet.  Technically
7219			 * this could still be the case for the ordered stuff
7220			 * since the async thread may not have started to do its
7221			 * work yet.  If this becomes the case then we need to
7222			 * figure out a way to make sure that in writepage we
7223			 * wait for any async pages to be submitted before
7224			 * returning so that fdatawait does what it's supposed to
7225			 * do.
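			 *
			 * Until then, the btrfs_wait_ordered_range() call
			 * below narrows that window as much as we can from
			 * this side.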
7226			 */
7227			btrfs_wait_ordered_range(inode, 0, (u64)-1);
7228		} else {
7229			filemap_flush(inode->i_mapping);
7230		}
7231		if (delay_iput)
7232			btrfs_add_delayed_iput(inode);
7233		else
7234			iput(inode);
7235		return 1;
7236	}
7237	return 0;
7238 }
7239
7240 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7241			 const char *symname)
7242 {
7243	struct btrfs_trans_handle *trans;
7244	struct btrfs_root *root = BTRFS_I(dir)->root;
7245	struct btrfs_path *path;
7246	struct btrfs_key key;
7247	struct inode *inode = NULL;
7248	int err;
7249	int drop_inode = 0;
7250	u64 objectid;
7251	u64 index = 0;
7252	int name_len;
7253	int datasize;
7254	unsigned long ptr;
7255	struct btrfs_file_extent_item *ei;
7256	struct extent_buffer *leaf;
7257	unsigned long nr = 0;
7258
7259	name_len = strlen(symname) + 1;
7260	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
7261		return -ENAMETOOLONG;
7262
7263	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
7264	if (err)
7265		return err;
7266	/*
7267	 * 2 items for inode item and ref
7268	 * 2 items for dir items
7269	 * 1 item for xattr if selinux is on
7270	 */
7271	trans = btrfs_start_transaction(root, 5);
7272	if (IS_ERR(trans))
7273		return PTR_ERR(trans);
7274
7275	btrfs_set_trans_block_group(trans, dir);
7276
7277	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
7278				dentry->d_name.len, dir->i_ino, objectid,
7279				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
7280				&index);
7281	if (IS_ERR(inode)) {
7282		err = PTR_ERR(inode);
7283		goto out_unlock;
7284	}
7285
7286	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
7287	if (err) {
7288		drop_inode = 1;
7289		goto out_unlock;
7290	}
7291
7292	btrfs_set_trans_block_group(trans, inode);
7293	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7294	if (err) {
7295		drop_inode = 1;
7296	} else {
7297		inode->i_mapping->a_ops = &btrfs_aops;
7298		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7299		inode->i_fop = &btrfs_file_operations;
7300		inode->i_op = &btrfs_file_inode_operations;
7301		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7302	}
7303	btrfs_update_inode_block_group(trans, inode);
7304	btrfs_update_inode_block_group(trans, dir);
7305	if (drop_inode)
7306		goto out_unlock;
7307
7308	path = btrfs_alloc_path();
7309	if (!path) {
		err = -ENOMEM;
		drop_inode = 1;
		goto out_unlock;
	}
7310	key.objectid = inode->i_ino;
7311	key.offset = 0;
7312	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
7313	datasize = btrfs_file_extent_calc_inline_size(name_len);
7314	err = btrfs_insert_empty_item(trans, root, path, &key,
7315				      datasize);
7316	if (err) {
7317		drop_inode = 1;
		btrfs_free_path(path);
7318		goto out_unlock;
7319	}
7320	leaf = path->nodes[0];
7321	ei = btrfs_item_ptr(leaf, path->slots[0],
7322			    struct btrfs_file_extent_item);
7323	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
7324	btrfs_set_file_extent_type(leaf, ei,
7325				   BTRFS_FILE_EXTENT_INLINE);
7326	btrfs_set_file_extent_encryption(leaf, ei, 0);
7327	btrfs_set_file_extent_compression(leaf, ei, 0);
7328	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
7329	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
7330
7331	ptr = btrfs_file_extent_inline_start(ei);
7332	write_extent_buffer(leaf, symname, ptr, name_len);
7333	btrfs_mark_buffer_dirty(leaf);
7334	btrfs_free_path(path);
7335
7336	inode->i_op = &btrfs_symlink_inode_operations;
7337	inode->i_mapping->a_ops = &btrfs_symlink_aops;
7338	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7339	inode_set_bytes(inode, name_len);
7340	btrfs_i_size_write(inode, name_len - 1);
7341	err =
btrfs_update_inode(trans, root, inode); 7342 if (err) 7343 drop_inode = 1; 7344 7345 out_unlock: 7346 nr = trans->blocks_used; 7347 btrfs_end_transaction_throttle(trans, root); 7348 if (drop_inode) { 7349 inode_dec_link_count(inode); 7350 iput(inode); 7351 } 7352 btrfs_btree_balance_dirty(root, nr); 7353 return err; 7354 } 7355 7356 static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 7357 u64 start, u64 num_bytes, u64 min_size, 7358 loff_t actual_len, u64 *alloc_hint, 7359 struct btrfs_trans_handle *trans) 7360 { 7361 struct btrfs_root *root = BTRFS_I(inode)->root; 7362 struct btrfs_key ins; 7363 u64 cur_offset = start; 7364 u64 i_size; 7365 int ret = 0; 7366 bool own_trans = true; 7367 7368 if (trans) 7369 own_trans = false; 7370 while (num_bytes > 0) { 7371 if (own_trans) { 7372 trans = btrfs_start_transaction(root, 3); 7373 if (IS_ERR(trans)) { 7374 ret = PTR_ERR(trans); 7375 break; 7376 } 7377 } 7378 7379 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7380 0, *alloc_hint, (u64)-1, &ins, 1); 7381 if (ret) { 7382 if (own_trans) 7383 btrfs_end_transaction(trans, root); 7384 break; 7385 } 7386 7387 ret = insert_reserved_file_extent(trans, inode, 7388 cur_offset, ins.objectid, 7389 ins.offset, ins.offset, 7390 ins.offset, 0, 0, 0, 7391 BTRFS_FILE_EXTENT_PREALLOC); 7392 BUG_ON(ret); 7393 btrfs_drop_extent_cache(inode, cur_offset, 7394 cur_offset + ins.offset -1, 0); 7395 7396 num_bytes -= ins.offset; 7397 cur_offset += ins.offset; 7398 *alloc_hint = ins.objectid + ins.offset; 7399 7400 inode->i_ctime = CURRENT_TIME; 7401 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7402 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7403 (actual_len > inode->i_size) && 7404 (cur_offset > inode->i_size)) { 7405 if (cur_offset > actual_len) 7406 i_size = actual_len; 7407 else 7408 i_size = cur_offset; 7409 i_size_write(inode, i_size); 7410 btrfs_ordered_update_i_size(inode, i_size, NULL); 7411 } 7412 7413 ret = btrfs_update_inode(trans, root, inode); 7414 BUG_ON(ret); 7415 7416 if (own_trans) 7417 btrfs_end_transaction(trans, root); 7418 } 7419 return ret; 7420 } 7421 7422 int btrfs_prealloc_file_range(struct inode *inode, int mode, 7423 u64 start, u64 num_bytes, u64 min_size, 7424 loff_t actual_len, u64 *alloc_hint) 7425 { 7426 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7427 min_size, actual_len, alloc_hint, 7428 NULL); 7429 } 7430 7431 int btrfs_prealloc_file_range_trans(struct inode *inode, 7432 struct btrfs_trans_handle *trans, int mode, 7433 u64 start, u64 num_bytes, u64 min_size, 7434 loff_t actual_len, u64 *alloc_hint) 7435 { 7436 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, 7437 min_size, actual_len, alloc_hint, trans); 7438 } 7439 7440 static int btrfs_set_page_dirty(struct page *page) 7441 { 7442 return __set_page_dirty_nobuffers(page); 7443 } 7444 7445 static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) 7446 { 7447 struct btrfs_root *root = BTRFS_I(inode)->root; 7448 7449 if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) 7450 return -EROFS; 7451 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7452 return -EACCES; 7453 return generic_permission(inode, mask, flags, btrfs_check_acl); 7454 } 7455 7456 static const struct inode_operations btrfs_dir_inode_operations = { 7457 .getattr = btrfs_getattr, 7458 .lookup = btrfs_lookup, 7459 .create = btrfs_create, 7460 .unlink = btrfs_unlink, 7461 .link = btrfs_link, 7462 .mkdir = btrfs_mkdir, 7463 .rmdir = btrfs_rmdir, 7464 .rename = 
btrfs_rename, 7465 .symlink = btrfs_symlink, 7466 .setattr = btrfs_setattr, 7467 .mknod = btrfs_mknod, 7468 .setxattr = btrfs_setxattr, 7469 .getxattr = btrfs_getxattr, 7470 .listxattr = btrfs_listxattr, 7471 .removexattr = btrfs_removexattr, 7472 .permission = btrfs_permission, 7473 }; 7474 static const struct inode_operations btrfs_dir_ro_inode_operations = { 7475 .lookup = btrfs_lookup, 7476 .permission = btrfs_permission, 7477 }; 7478 7479 static const struct file_operations btrfs_dir_file_operations = { 7480 .llseek = generic_file_llseek, 7481 .read = generic_read_dir, 7482 .readdir = btrfs_real_readdir, 7483 .unlocked_ioctl = btrfs_ioctl, 7484 #ifdef CONFIG_COMPAT 7485 .compat_ioctl = btrfs_ioctl, 7486 #endif 7487 .release = btrfs_release_file, 7488 .fsync = btrfs_sync_file, 7489 }; 7490 7491 static struct extent_io_ops btrfs_extent_io_ops = { 7492 .fill_delalloc = run_delalloc_range, 7493 .submit_bio_hook = btrfs_submit_bio_hook, 7494 .merge_bio_hook = btrfs_merge_bio_hook, 7495 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7496 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7497 .writepage_start_hook = btrfs_writepage_start_hook, 7498 .readpage_io_failed_hook = btrfs_io_failed_hook, 7499 .set_bit_hook = btrfs_set_bit_hook, 7500 .clear_bit_hook = btrfs_clear_bit_hook, 7501 .merge_extent_hook = btrfs_merge_extent_hook, 7502 .split_extent_hook = btrfs_split_extent_hook, 7503 }; 7504 7505 /* 7506 * btrfs doesn't support the bmap operation because swapfiles 7507 * use bmap to make a mapping of extents in the file. They assume 7508 * these extents won't change over the life of the file and they 7509 * use the bmap result to do IO directly to the drive. 7510 * 7511 * the btrfs bmap call would return logical addresses that aren't 7512 * suitable for IO and they also will change frequently as COW 7513 * operations happen. So, swapfile + btrfs == corruption. 7514 * 7515 * For now we're avoiding this by dropping bmap. 
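 *
 * (That's why btrfs_aops below has no .bmap entry: FIBMAP simply
 * returns nothing and swapon refuses the file up front, instead of
 * corrupting data later.)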
7516 */ 7517 static const struct address_space_operations btrfs_aops = { 7518 .readpage = btrfs_readpage, 7519 .writepage = btrfs_writepage, 7520 .writepages = btrfs_writepages, 7521 .readpages = btrfs_readpages, 7522 .direct_IO = btrfs_direct_IO, 7523 .invalidatepage = btrfs_invalidatepage, 7524 .releasepage = btrfs_releasepage, 7525 .set_page_dirty = btrfs_set_page_dirty, 7526 .error_remove_page = generic_error_remove_page, 7527 }; 7528 7529 static const struct address_space_operations btrfs_symlink_aops = { 7530 .readpage = btrfs_readpage, 7531 .writepage = btrfs_writepage, 7532 .invalidatepage = btrfs_invalidatepage, 7533 .releasepage = btrfs_releasepage, 7534 }; 7535 7536 static const struct inode_operations btrfs_file_inode_operations = { 7537 .getattr = btrfs_getattr, 7538 .setattr = btrfs_setattr, 7539 .setxattr = btrfs_setxattr, 7540 .getxattr = btrfs_getxattr, 7541 .listxattr = btrfs_listxattr, 7542 .removexattr = btrfs_removexattr, 7543 .permission = btrfs_permission, 7544 .fiemap = btrfs_fiemap, 7545 }; 7546 static const struct inode_operations btrfs_special_inode_operations = { 7547 .getattr = btrfs_getattr, 7548 .setattr = btrfs_setattr, 7549 .permission = btrfs_permission, 7550 .setxattr = btrfs_setxattr, 7551 .getxattr = btrfs_getxattr, 7552 .listxattr = btrfs_listxattr, 7553 .removexattr = btrfs_removexattr, 7554 }; 7555 static const struct inode_operations btrfs_symlink_inode_operations = { 7556 .readlink = generic_readlink, 7557 .follow_link = page_follow_link_light, 7558 .put_link = page_put_link, 7559 .getattr = btrfs_getattr, 7560 .permission = btrfs_permission, 7561 .setxattr = btrfs_setxattr, 7562 .getxattr = btrfs_getxattr, 7563 .listxattr = btrfs_listxattr, 7564 .removexattr = btrfs_removexattr, 7565 }; 7566 7567 const struct dentry_operations btrfs_dentry_operations = { 7568 .d_delete = btrfs_dentry_delete, 7569 }; 7570