1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/fs.h> 20 #include <linux/blkdev.h> 21 #include <linux/scatterlist.h> 22 #include <linux/swap.h> 23 #include <linux/radix-tree.h> 24 #include <linux/writeback.h> 25 #include <linux/buffer_head.h> // for block_sync_page 26 #include <linux/workqueue.h> 27 #include "crc32c.h" 28 #include "ctree.h" 29 #include "disk-io.h" 30 #include "transaction.h" 31 #include "btrfs_inode.h" 32 #include "volumes.h" 33 #include "print-tree.h" 34 #include "async-thread.h" 35 #include "locking.h" 36 37 #if 0 38 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) 39 { 40 if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) { 41 printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n", 42 (unsigned long long)extent_buffer_blocknr(buf), 43 (unsigned long long)btrfs_header_blocknr(buf)); 44 return 1; 45 } 46 return 0; 47 } 48 #endif 49 50 static struct extent_io_ops btree_extent_io_ops; 51 static void end_workqueue_fn(struct btrfs_work *work); 52 53 struct end_io_wq { 54 struct bio *bio; 55 bio_end_io_t *end_io; 56 void *private; 57 struct btrfs_fs_info *info; 58 int error; 59 int metadata; 60 struct list_head list; 61 struct btrfs_work work; 62 }; 63 64 struct async_submit_bio { 65 struct inode *inode; 66 struct bio *bio; 67 struct list_head list; 68 extent_submit_bio_hook_t *submit_bio_hook; 69 int rw; 70 int mirror_num; 71 struct btrfs_work work; 72 }; 73 74 struct extent_map *btree_get_extent(struct inode *inode, struct page *page, 75 size_t page_offset, u64 start, u64 len, 76 int create) 77 { 78 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 79 struct extent_map *em; 80 int ret; 81 82 spin_lock(&em_tree->lock); 83 em = lookup_extent_mapping(em_tree, start, len); 84 if (em) { 85 em->bdev = 86 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 87 spin_unlock(&em_tree->lock); 88 goto out; 89 } 90 spin_unlock(&em_tree->lock); 91 92 em = alloc_extent_map(GFP_NOFS); 93 if (!em) { 94 em = ERR_PTR(-ENOMEM); 95 goto out; 96 } 97 em->start = 0; 98 em->len = (u64)-1; 99 em->block_start = 0; 100 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 101 102 spin_lock(&em_tree->lock); 103 ret = add_extent_mapping(em_tree, em); 104 if (ret == -EEXIST) { 105 u64 failed_start = em->start; 106 u64 failed_len = em->len; 107 108 printk("failed to insert %Lu %Lu -> %Lu into tree\n", 109 em->start, em->len, em->block_start); 110 free_extent_map(em); 111 em = lookup_extent_mapping(em_tree, start, len); 112 if (em) { 113 printk("after failing, found %Lu %Lu %Lu\n", 114 em->start, em->len, em->block_start); 115 ret = 0; 116 } else { 117 em = lookup_extent_mapping(em_tree, failed_start, 118 failed_len); 119 if (em) { 120 printk("double failure lookup gives us " 121 "%Lu %Lu -> %Lu\n", em->start, 122 em->len, em->block_start); 123 free_extent_map(em); 124 } 125 ret = -EIO; 126 } 127 } else if (ret) { 128 free_extent_map(em); 129 em = NULL; 130 } 131 spin_unlock(&em_tree->lock); 132 133 if (ret) 134 em = ERR_PTR(ret); 135 out: 136 return em; 137 } 138 139 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) 140 { 141 return btrfs_crc32c(seed, data, len); 142 } 143 144 void btrfs_csum_final(u32 crc, char *result) 145 { 146 *(__le32 *)result = ~cpu_to_le32(crc); 147 } 148 149 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 150 int verify) 151 { 152 char result[BTRFS_CRC32_SIZE]; 153 unsigned long len; 154 unsigned long cur_len; 155 unsigned long offset = BTRFS_CSUM_SIZE; 156 char *map_token = NULL; 157 char *kaddr; 158 unsigned long map_start; 159 unsigned long map_len; 160 int err; 161 u32 crc = ~(u32)0; 162 163 len = buf->len - offset; 164 while(len > 0) { 165 err = map_private_extent_buffer(buf, offset, 32, 166 &map_token, &kaddr, 167 &map_start, &map_len, KM_USER0); 168 if (err) { 169 printk("failed to map extent buffer! %lu\n", 170 offset); 171 return 1; 172 } 173 cur_len = min(len, map_len - (offset - map_start)); 174 crc = btrfs_csum_data(root, kaddr + offset - map_start, 175 crc, cur_len); 176 len -= cur_len; 177 offset += cur_len; 178 unmap_extent_buffer(buf, map_token, KM_USER0); 179 } 180 btrfs_csum_final(crc, result); 181 182 if (verify) { 183 int from_this_trans = 0; 184 185 if (root->fs_info->running_transaction && 186 btrfs_header_generation(buf) == 187 root->fs_info->running_transaction->transid) 188 from_this_trans = 1; 189 190 /* FIXME, this is not good */ 191 if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) { 192 u32 val; 193 u32 found = 0; 194 memcpy(&found, result, BTRFS_CRC32_SIZE); 195 196 read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE); 197 printk("btrfs: %s checksum verify failed on %llu " 198 "wanted %X found %X from_this_trans %d " 199 "level %d\n", 200 root->fs_info->sb->s_id, 201 buf->start, val, found, from_this_trans, 202 btrfs_header_level(buf)); 203 return 1; 204 } 205 } else { 206 write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE); 207 } 208 return 0; 209 } 210 211 static int verify_parent_transid(struct extent_io_tree *io_tree, 212 struct extent_buffer *eb, u64 parent_transid) 213 { 214 int ret; 215 216 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 217 return 0; 218 219 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); 220 if (extent_buffer_uptodate(io_tree, eb) && 221 btrfs_header_generation(eb) == parent_transid) { 222 ret = 0; 223 goto out; 224 } 225 printk("parent transid verify failed on %llu wanted %llu found %llu\n", 226 (unsigned long long)eb->start, 227 (unsigned long long)parent_transid, 228 (unsigned long long)btrfs_header_generation(eb)); 229 ret = 1; 230 out: 231 clear_extent_buffer_uptodate(io_tree, eb); 232 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, 233 GFP_NOFS); 234 return ret; 235 236 } 237 238 static int btree_read_extent_buffer_pages(struct btrfs_root *root, 239 struct extent_buffer *eb, 240 u64 start, u64 parent_transid) 241 { 242 struct extent_io_tree *io_tree; 243 int ret; 244 int num_copies = 0; 245 int mirror_num = 0; 246 247 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 248 while (1) { 249 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 250 btree_get_extent, mirror_num); 251 if (!ret && 252 !verify_parent_transid(io_tree, eb, parent_transid)) 253 return ret; 254 255 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 256 eb->start, eb->len); 257 if (num_copies == 1) 258 return ret; 259 260 mirror_num++; 261 if (mirror_num > num_copies) 262 return ret; 263 } 264 return -EIO; 265 } 266 267 int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 268 { 269 struct extent_io_tree *tree; 270 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 271 u64 found_start; 272 int found_level; 273 unsigned long len; 274 struct extent_buffer *eb; 275 int ret; 276 277 tree = &BTRFS_I(page->mapping->host)->io_tree; 278 279 if (page->private == EXTENT_PAGE_PRIVATE) 280 goto out; 281 if (!page->private) 282 goto out; 283 len = page->private >> 2; 284 if (len == 0) { 285 WARN_ON(1); 286 } 287 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 288 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 289 btrfs_header_generation(eb)); 290 BUG_ON(ret); 291 btrfs_clear_buffer_defrag(eb); 292 found_start = btrfs_header_bytenr(eb); 293 if (found_start != start) { 294 printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n", 295 start, found_start, len); 296 WARN_ON(1); 297 goto err; 298 } 299 if (eb->first_page != page) { 300 printk("bad first page %lu %lu\n", eb->first_page->index, 301 page->index); 302 WARN_ON(1); 303 goto err; 304 } 305 if (!PageUptodate(page)) { 306 printk("csum not up to date page %lu\n", page->index); 307 WARN_ON(1); 308 goto err; 309 } 310 found_level = btrfs_header_level(eb); 311 spin_lock(&root->fs_info->hash_lock); 312 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 313 spin_unlock(&root->fs_info->hash_lock); 314 csum_tree_block(root, eb, 0); 315 err: 316 free_extent_buffer(eb); 317 out: 318 return 0; 319 } 320 321 static int btree_writepage_io_hook(struct page *page, u64 start, u64 end) 322 { 323 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 324 325 csum_dirty_buffer(root, page); 326 return 0; 327 } 328 329 int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 330 struct extent_state *state) 331 { 332 struct extent_io_tree *tree; 333 u64 found_start; 334 int found_level; 335 unsigned long len; 336 struct extent_buffer *eb; 337 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 338 int ret = 0; 339 340 tree = &BTRFS_I(page->mapping->host)->io_tree; 341 if (page->private == EXTENT_PAGE_PRIVATE) 342 goto out; 343 if (!page->private) 344 goto out; 345 len = page->private >> 2; 346 if (len == 0) { 347 WARN_ON(1); 348 } 349 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 350 351 btrfs_clear_buffer_defrag(eb); 352 found_start = btrfs_header_bytenr(eb); 353 if (found_start != start) { 354 ret = -EIO; 355 goto err; 356 } 357 if (eb->first_page != page) { 358 printk("bad first page %lu %lu\n", eb->first_page->index, 359 page->index); 360 WARN_ON(1); 361 ret = -EIO; 362 goto err; 363 } 364 if (memcmp_extent_buffer(eb, root->fs_info->fsid, 365 (unsigned long)btrfs_header_fsid(eb), 366 BTRFS_FSID_SIZE)) { 367 printk("bad fsid on block %Lu\n", eb->start); 368 ret = -EIO; 369 goto err; 370 } 371 found_level = btrfs_header_level(eb); 372 373 ret = csum_tree_block(root, eb, 1); 374 if (ret) 375 ret = -EIO; 376 377 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 378 end = eb->start + end - 1; 379 release_extent_buffer_tail_pages(eb); 380 err: 381 free_extent_buffer(eb); 382 out: 383 return ret; 384 } 385 386 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 387 static void end_workqueue_bio(struct bio *bio, int err) 388 #else 389 static int end_workqueue_bio(struct bio *bio, 390 unsigned int bytes_done, int err) 391 #endif 392 { 393 struct end_io_wq *end_io_wq = bio->bi_private; 394 struct btrfs_fs_info *fs_info; 395 396 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 397 if (bio->bi_size) 398 return 1; 399 #endif 400 401 fs_info = end_io_wq->info; 402 end_io_wq->error = err; 403 end_io_wq->work.func = end_workqueue_fn; 404 end_io_wq->work.flags = 0; 405 btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); 406 407 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 408 return 0; 409 #endif 410 } 411 412 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 413 int metadata) 414 { 415 struct end_io_wq *end_io_wq; 416 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); 417 if (!end_io_wq) 418 return -ENOMEM; 419 420 end_io_wq->private = bio->bi_private; 421 end_io_wq->end_io = bio->bi_end_io; 422 end_io_wq->info = info; 423 end_io_wq->error = 0; 424 end_io_wq->bio = bio; 425 end_io_wq->metadata = metadata; 426 427 bio->bi_private = end_io_wq; 428 bio->bi_end_io = end_workqueue_bio; 429 return 0; 430 } 431 432 static void run_one_async_submit(struct btrfs_work *work) 433 { 434 struct btrfs_fs_info *fs_info; 435 struct async_submit_bio *async; 436 437 async = container_of(work, struct async_submit_bio, work); 438 fs_info = BTRFS_I(async->inode)->root->fs_info; 439 atomic_dec(&fs_info->nr_async_submits); 440 async->submit_bio_hook(async->inode, async->rw, async->bio, 441 async->mirror_num); 442 kfree(async); 443 } 444 445 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 446 int rw, struct bio *bio, int mirror_num, 447 extent_submit_bio_hook_t *submit_bio_hook) 448 { 449 struct async_submit_bio *async; 450 451 async = kmalloc(sizeof(*async), GFP_NOFS); 452 if (!async) 453 return -ENOMEM; 454 455 async->inode = inode; 456 async->rw = rw; 457 async->bio = bio; 458 async->mirror_num = mirror_num; 459 async->submit_bio_hook = submit_bio_hook; 460 async->work.func = run_one_async_submit; 461 async->work.flags = 0; 462 atomic_inc(&fs_info->nr_async_submits); 463 btrfs_queue_worker(&fs_info->workers, &async->work); 464 return 0; 465 } 466 467 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 468 int mirror_num) 469 { 470 struct btrfs_root *root = BTRFS_I(inode)->root; 471 u64 offset; 472 int ret; 473 474 offset = bio->bi_sector << 9; 475 476 /* 477 * when we're called for a write, we're already in the async 478 * submission context. Just jump ingo btrfs_map_bio 479 */ 480 if (rw & (1 << BIO_RW)) { 481 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 482 mirror_num, 0); 483 } 484 485 /* 486 * called for a read, do the setup so that checksum validation 487 * can happen in the async kernel threads 488 */ 489 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); 490 BUG_ON(ret); 491 492 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 493 } 494 495 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 496 int mirror_num) 497 { 498 /* 499 * kthread helpers are used to submit writes so that checksumming 500 * can happen in parallel across all CPUs 501 */ 502 if (!(rw & (1 << BIO_RW))) { 503 return __btree_submit_bio_hook(inode, rw, bio, mirror_num); 504 } 505 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 506 inode, rw, bio, mirror_num, 507 __btree_submit_bio_hook); 508 } 509 510 static int btree_writepage(struct page *page, struct writeback_control *wbc) 511 { 512 struct extent_io_tree *tree; 513 tree = &BTRFS_I(page->mapping->host)->io_tree; 514 return extent_write_full_page(tree, page, btree_get_extent, wbc); 515 } 516 517 static int btree_writepages(struct address_space *mapping, 518 struct writeback_control *wbc) 519 { 520 struct extent_io_tree *tree; 521 tree = &BTRFS_I(mapping->host)->io_tree; 522 if (wbc->sync_mode == WB_SYNC_NONE) { 523 u64 num_dirty; 524 u64 start = 0; 525 unsigned long thresh = 96 * 1024 * 1024; 526 527 if (wbc->for_kupdate) 528 return 0; 529 530 if (current_is_pdflush()) { 531 thresh = 96 * 1024 * 1024; 532 } else { 533 thresh = 8 * 1024 * 1024; 534 } 535 num_dirty = count_range_bits(tree, &start, (u64)-1, 536 thresh, EXTENT_DIRTY); 537 if (num_dirty < thresh) { 538 return 0; 539 } 540 } 541 return extent_writepages(tree, mapping, btree_get_extent, wbc); 542 } 543 544 int btree_readpage(struct file *file, struct page *page) 545 { 546 struct extent_io_tree *tree; 547 tree = &BTRFS_I(page->mapping->host)->io_tree; 548 return extent_read_full_page(tree, page, btree_get_extent); 549 } 550 551 static int btree_releasepage(struct page *page, gfp_t gfp_flags) 552 { 553 struct extent_io_tree *tree; 554 struct extent_map_tree *map; 555 int ret; 556 557 if (page_count(page) > 3) { 558 /* once for page->private, once for the caller, once 559 * once for the page cache 560 */ 561 return 0; 562 } 563 tree = &BTRFS_I(page->mapping->host)->io_tree; 564 map = &BTRFS_I(page->mapping->host)->extent_tree; 565 ret = try_release_extent_state(map, tree, page, gfp_flags); 566 if (ret == 1) { 567 invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); 568 ClearPagePrivate(page); 569 set_page_private(page, 0); 570 page_cache_release(page); 571 } 572 return ret; 573 } 574 575 static void btree_invalidatepage(struct page *page, unsigned long offset) 576 { 577 struct extent_io_tree *tree; 578 tree = &BTRFS_I(page->mapping->host)->io_tree; 579 extent_invalidatepage(tree, page, offset); 580 btree_releasepage(page, GFP_NOFS); 581 if (PagePrivate(page)) { 582 invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); 583 ClearPagePrivate(page); 584 set_page_private(page, 0); 585 page_cache_release(page); 586 } 587 } 588 589 #if 0 590 static int btree_writepage(struct page *page, struct writeback_control *wbc) 591 { 592 struct buffer_head *bh; 593 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 594 struct buffer_head *head; 595 if (!page_has_buffers(page)) { 596 create_empty_buffers(page, root->fs_info->sb->s_blocksize, 597 (1 << BH_Dirty)|(1 << BH_Uptodate)); 598 } 599 head = page_buffers(page); 600 bh = head; 601 do { 602 if (buffer_dirty(bh)) 603 csum_tree_block(root, bh, 0); 604 bh = bh->b_this_page; 605 } while (bh != head); 606 return block_write_full_page(page, btree_get_block, wbc); 607 } 608 #endif 609 610 static struct address_space_operations btree_aops = { 611 .readpage = btree_readpage, 612 .writepage = btree_writepage, 613 .writepages = btree_writepages, 614 .releasepage = btree_releasepage, 615 .invalidatepage = btree_invalidatepage, 616 .sync_page = block_sync_page, 617 }; 618 619 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 620 u64 parent_transid) 621 { 622 struct extent_buffer *buf = NULL; 623 struct inode *btree_inode = root->fs_info->btree_inode; 624 int ret = 0; 625 626 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 627 if (!buf) 628 return 0; 629 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 630 buf, 0, 0, btree_get_extent, 0); 631 free_extent_buffer(buf); 632 return ret; 633 } 634 635 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 636 u64 bytenr, u32 blocksize) 637 { 638 struct inode *btree_inode = root->fs_info->btree_inode; 639 struct extent_buffer *eb; 640 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 641 bytenr, blocksize, GFP_NOFS); 642 return eb; 643 } 644 645 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 646 u64 bytenr, u32 blocksize) 647 { 648 struct inode *btree_inode = root->fs_info->btree_inode; 649 struct extent_buffer *eb; 650 651 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 652 bytenr, blocksize, NULL, GFP_NOFS); 653 return eb; 654 } 655 656 657 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 658 u32 blocksize, u64 parent_transid) 659 { 660 struct extent_buffer *buf = NULL; 661 struct inode *btree_inode = root->fs_info->btree_inode; 662 struct extent_io_tree *io_tree; 663 int ret; 664 665 io_tree = &BTRFS_I(btree_inode)->io_tree; 666 667 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 668 if (!buf) 669 return NULL; 670 671 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 672 673 if (ret == 0) { 674 buf->flags |= EXTENT_UPTODATE; 675 } 676 return buf; 677 678 } 679 680 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, 681 struct extent_buffer *buf) 682 { 683 struct inode *btree_inode = root->fs_info->btree_inode; 684 if (btrfs_header_generation(buf) == 685 root->fs_info->running_transaction->transid) { 686 WARN_ON(!btrfs_tree_locked(buf)); 687 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 688 buf); 689 } 690 return 0; 691 } 692 693 int wait_on_tree_block_writeback(struct btrfs_root *root, 694 struct extent_buffer *buf) 695 { 696 struct inode *btree_inode = root->fs_info->btree_inode; 697 wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree, 698 buf); 699 return 0; 700 } 701 702 static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 703 u32 stripesize, struct btrfs_root *root, 704 struct btrfs_fs_info *fs_info, 705 u64 objectid) 706 { 707 root->node = NULL; 708 root->inode = NULL; 709 root->commit_root = NULL; 710 root->sectorsize = sectorsize; 711 root->nodesize = nodesize; 712 root->leafsize = leafsize; 713 root->stripesize = stripesize; 714 root->ref_cows = 0; 715 root->track_dirty = 0; 716 717 root->fs_info = fs_info; 718 root->objectid = objectid; 719 root->last_trans = 0; 720 root->highest_inode = 0; 721 root->last_inode_alloc = 0; 722 root->name = NULL; 723 root->in_sysfs = 0; 724 725 INIT_LIST_HEAD(&root->dirty_list); 726 spin_lock_init(&root->node_lock); 727 mutex_init(&root->objectid_mutex); 728 memset(&root->root_key, 0, sizeof(root->root_key)); 729 memset(&root->root_item, 0, sizeof(root->root_item)); 730 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); 731 memset(&root->root_kobj, 0, sizeof(root->root_kobj)); 732 init_completion(&root->kobj_unregister); 733 root->defrag_running = 0; 734 root->defrag_level = 0; 735 root->root_key.objectid = objectid; 736 return 0; 737 } 738 739 static int find_and_setup_root(struct btrfs_root *tree_root, 740 struct btrfs_fs_info *fs_info, 741 u64 objectid, 742 struct btrfs_root *root) 743 { 744 int ret; 745 u32 blocksize; 746 747 __setup_root(tree_root->nodesize, tree_root->leafsize, 748 tree_root->sectorsize, tree_root->stripesize, 749 root, fs_info, objectid); 750 ret = btrfs_find_last_root(tree_root, objectid, 751 &root->root_item, &root->root_key); 752 BUG_ON(ret); 753 754 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 755 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 756 blocksize, 0); 757 BUG_ON(!root->node); 758 return 0; 759 } 760 761 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info, 762 struct btrfs_key *location) 763 { 764 struct btrfs_root *root; 765 struct btrfs_root *tree_root = fs_info->tree_root; 766 struct btrfs_path *path; 767 struct extent_buffer *l; 768 u64 highest_inode; 769 u32 blocksize; 770 int ret = 0; 771 772 root = kzalloc(sizeof(*root), GFP_NOFS); 773 if (!root) 774 return ERR_PTR(-ENOMEM); 775 if (location->offset == (u64)-1) { 776 ret = find_and_setup_root(tree_root, fs_info, 777 location->objectid, root); 778 if (ret) { 779 kfree(root); 780 return ERR_PTR(ret); 781 } 782 goto insert; 783 } 784 785 __setup_root(tree_root->nodesize, tree_root->leafsize, 786 tree_root->sectorsize, tree_root->stripesize, 787 root, fs_info, location->objectid); 788 789 path = btrfs_alloc_path(); 790 BUG_ON(!path); 791 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 792 if (ret != 0) { 793 if (ret > 0) 794 ret = -ENOENT; 795 goto out; 796 } 797 l = path->nodes[0]; 798 read_extent_buffer(l, &root->root_item, 799 btrfs_item_ptr_offset(l, path->slots[0]), 800 sizeof(root->root_item)); 801 memcpy(&root->root_key, location, sizeof(*location)); 802 ret = 0; 803 out: 804 btrfs_release_path(root, path); 805 btrfs_free_path(path); 806 if (ret) { 807 kfree(root); 808 return ERR_PTR(ret); 809 } 810 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 811 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 812 blocksize, 0); 813 BUG_ON(!root->node); 814 insert: 815 root->ref_cows = 1; 816 ret = btrfs_find_highest_inode(root, &highest_inode); 817 if (ret == 0) { 818 root->highest_inode = highest_inode; 819 root->last_inode_alloc = highest_inode; 820 } 821 return root; 822 } 823 824 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 825 u64 root_objectid) 826 { 827 struct btrfs_root *root; 828 829 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) 830 return fs_info->tree_root; 831 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) 832 return fs_info->extent_root; 833 834 root = radix_tree_lookup(&fs_info->fs_roots_radix, 835 (unsigned long)root_objectid); 836 return root; 837 } 838 839 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 840 struct btrfs_key *location) 841 { 842 struct btrfs_root *root; 843 int ret; 844 845 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) 846 return fs_info->tree_root; 847 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) 848 return fs_info->extent_root; 849 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) 850 return fs_info->chunk_root; 851 if (location->objectid == BTRFS_DEV_TREE_OBJECTID) 852 return fs_info->dev_root; 853 854 root = radix_tree_lookup(&fs_info->fs_roots_radix, 855 (unsigned long)location->objectid); 856 if (root) 857 return root; 858 859 root = btrfs_read_fs_root_no_radix(fs_info, location); 860 if (IS_ERR(root)) 861 return root; 862 ret = radix_tree_insert(&fs_info->fs_roots_radix, 863 (unsigned long)root->root_key.objectid, 864 root); 865 if (ret) { 866 free_extent_buffer(root->node); 867 kfree(root); 868 return ERR_PTR(ret); 869 } 870 ret = btrfs_find_dead_roots(fs_info->tree_root, 871 root->root_key.objectid, root); 872 BUG_ON(ret); 873 874 return root; 875 } 876 877 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, 878 struct btrfs_key *location, 879 const char *name, int namelen) 880 { 881 struct btrfs_root *root; 882 int ret; 883 884 root = btrfs_read_fs_root_no_name(fs_info, location); 885 if (!root) 886 return NULL; 887 888 if (root->in_sysfs) 889 return root; 890 891 ret = btrfs_set_root_name(root, name, namelen); 892 if (ret) { 893 free_extent_buffer(root->node); 894 kfree(root); 895 return ERR_PTR(ret); 896 } 897 898 ret = btrfs_sysfs_add_root(root); 899 if (ret) { 900 free_extent_buffer(root->node); 901 kfree(root->name); 902 kfree(root); 903 return ERR_PTR(ret); 904 } 905 root->in_sysfs = 1; 906 return root; 907 } 908 #if 0 909 static int add_hasher(struct btrfs_fs_info *info, char *type) { 910 struct btrfs_hasher *hasher; 911 912 hasher = kmalloc(sizeof(*hasher), GFP_NOFS); 913 if (!hasher) 914 return -ENOMEM; 915 hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC); 916 if (!hasher->hash_tfm) { 917 kfree(hasher); 918 return -EINVAL; 919 } 920 spin_lock(&info->hash_lock); 921 list_add(&hasher->list, &info->hashers); 922 spin_unlock(&info->hash_lock); 923 return 0; 924 } 925 #endif 926 927 static int btrfs_congested_fn(void *congested_data, int bdi_bits) 928 { 929 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 930 int ret = 0; 931 int limit = 256 * info->fs_devices->open_devices; 932 struct list_head *cur; 933 struct btrfs_device *device; 934 struct backing_dev_info *bdi; 935 936 if ((bdi_bits & (1 << BDI_write_congested)) && 937 atomic_read(&info->nr_async_submits) > limit) { 938 return 1; 939 } 940 941 list_for_each(cur, &info->fs_devices->devices) { 942 device = list_entry(cur, struct btrfs_device, dev_list); 943 if (!device->bdev) 944 continue; 945 bdi = blk_get_backing_dev_info(device->bdev); 946 if (bdi && bdi_congested(bdi, bdi_bits)) { 947 ret = 1; 948 break; 949 } 950 } 951 return ret; 952 } 953 954 /* 955 * this unplugs every device on the box, and it is only used when page 956 * is null 957 */ 958 static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 959 { 960 struct list_head *cur; 961 struct btrfs_device *device; 962 struct btrfs_fs_info *info; 963 964 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 965 list_for_each(cur, &info->fs_devices->devices) { 966 device = list_entry(cur, struct btrfs_device, dev_list); 967 bdi = blk_get_backing_dev_info(device->bdev); 968 if (bdi->unplug_io_fn) { 969 bdi->unplug_io_fn(bdi, page); 970 } 971 } 972 } 973 974 void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 975 { 976 struct inode *inode; 977 struct extent_map_tree *em_tree; 978 struct extent_map *em; 979 struct address_space *mapping; 980 u64 offset; 981 982 /* the generic O_DIRECT read code does this */ 983 if (!page) { 984 __unplug_io_fn(bdi, page); 985 return; 986 } 987 988 /* 989 * page->mapping may change at any time. Get a consistent copy 990 * and use that for everything below 991 */ 992 smp_mb(); 993 mapping = page->mapping; 994 if (!mapping) 995 return; 996 997 inode = mapping->host; 998 offset = page_offset(page); 999 1000 em_tree = &BTRFS_I(inode)->extent_tree; 1001 spin_lock(&em_tree->lock); 1002 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); 1003 spin_unlock(&em_tree->lock); 1004 if (!em) 1005 return; 1006 1007 offset = offset - em->start; 1008 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, 1009 em->block_start + offset, page); 1010 free_extent_map(em); 1011 } 1012 1013 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) 1014 { 1015 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1016 bdi_init(bdi); 1017 #endif 1018 bdi->ra_pages = default_backing_dev_info.ra_pages; 1019 bdi->state = 0; 1020 bdi->capabilities = default_backing_dev_info.capabilities; 1021 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1022 bdi->unplug_io_data = info; 1023 bdi->congested_fn = btrfs_congested_fn; 1024 bdi->congested_data = info; 1025 return 0; 1026 } 1027 1028 static int bio_ready_for_csum(struct bio *bio) 1029 { 1030 u64 length = 0; 1031 u64 buf_len = 0; 1032 u64 start = 0; 1033 struct page *page; 1034 struct extent_io_tree *io_tree = NULL; 1035 struct btrfs_fs_info *info = NULL; 1036 struct bio_vec *bvec; 1037 int i; 1038 int ret; 1039 1040 bio_for_each_segment(bvec, bio, i) { 1041 page = bvec->bv_page; 1042 if (page->private == EXTENT_PAGE_PRIVATE) { 1043 length += bvec->bv_len; 1044 continue; 1045 } 1046 if (!page->private) { 1047 length += bvec->bv_len; 1048 continue; 1049 } 1050 length = bvec->bv_len; 1051 buf_len = page->private >> 2; 1052 start = page_offset(page) + bvec->bv_offset; 1053 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1054 info = BTRFS_I(page->mapping->host)->root->fs_info; 1055 } 1056 /* are we fully contained in this bio? */ 1057 if (buf_len <= length) 1058 return 1; 1059 1060 ret = extent_range_uptodate(io_tree, start + length, 1061 start + buf_len - 1); 1062 if (ret == 1) 1063 return ret; 1064 return ret; 1065 } 1066 1067 /* 1068 * called by the kthread helper functions to finally call the bio end_io 1069 * functions. This is where read checksum verification actually happens 1070 */ 1071 static void end_workqueue_fn(struct btrfs_work *work) 1072 { 1073 struct bio *bio; 1074 struct end_io_wq *end_io_wq; 1075 struct btrfs_fs_info *fs_info; 1076 int error; 1077 1078 end_io_wq = container_of(work, struct end_io_wq, work); 1079 bio = end_io_wq->bio; 1080 fs_info = end_io_wq->info; 1081 1082 /* metadata bios are special because the whole tree block must 1083 * be checksummed at once. This makes sure the entire block is in 1084 * ram and up to date before trying to verify things. For 1085 * blocksize <= pagesize, it is basically a noop 1086 */ 1087 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { 1088 btrfs_queue_worker(&fs_info->endio_workers, 1089 &end_io_wq->work); 1090 return; 1091 } 1092 error = end_io_wq->error; 1093 bio->bi_private = end_io_wq->private; 1094 bio->bi_end_io = end_io_wq->end_io; 1095 kfree(end_io_wq); 1096 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1097 bio_endio(bio, bio->bi_size, error); 1098 #else 1099 bio_endio(bio, error); 1100 #endif 1101 } 1102 1103 struct btrfs_root *open_ctree(struct super_block *sb, 1104 struct btrfs_fs_devices *fs_devices, 1105 char *options) 1106 { 1107 u32 sectorsize; 1108 u32 nodesize; 1109 u32 leafsize; 1110 u32 blocksize; 1111 u32 stripesize; 1112 struct buffer_head *bh; 1113 struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), 1114 GFP_NOFS); 1115 struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), 1116 GFP_NOFS); 1117 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), 1118 GFP_NOFS); 1119 struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root), 1120 GFP_NOFS); 1121 struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root), 1122 GFP_NOFS); 1123 int ret; 1124 int err = -EINVAL; 1125 1126 struct btrfs_super_block *disk_super; 1127 1128 if (!extent_root || !tree_root || !fs_info) { 1129 err = -ENOMEM; 1130 goto fail; 1131 } 1132 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); 1133 INIT_LIST_HEAD(&fs_info->trans_list); 1134 INIT_LIST_HEAD(&fs_info->dead_roots); 1135 INIT_LIST_HEAD(&fs_info->hashers); 1136 spin_lock_init(&fs_info->hash_lock); 1137 spin_lock_init(&fs_info->delalloc_lock); 1138 spin_lock_init(&fs_info->new_trans_lock); 1139 1140 init_completion(&fs_info->kobj_unregister); 1141 fs_info->tree_root = tree_root; 1142 fs_info->extent_root = extent_root; 1143 fs_info->chunk_root = chunk_root; 1144 fs_info->dev_root = dev_root; 1145 fs_info->fs_devices = fs_devices; 1146 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1147 INIT_LIST_HEAD(&fs_info->space_info); 1148 btrfs_mapping_init(&fs_info->mapping_tree); 1149 atomic_set(&fs_info->nr_async_submits, 0); 1150 atomic_set(&fs_info->throttles, 0); 1151 fs_info->sb = sb; 1152 fs_info->max_extent = (u64)-1; 1153 fs_info->max_inline = 8192 * 1024; 1154 setup_bdi(fs_info, &fs_info->bdi); 1155 fs_info->btree_inode = new_inode(sb); 1156 fs_info->btree_inode->i_ino = 1; 1157 fs_info->btree_inode->i_nlink = 1; 1158 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); 1159 1160 sb->s_blocksize = 4096; 1161 sb->s_blocksize_bits = blksize_bits(4096); 1162 1163 /* 1164 * we set the i_size on the btree inode to the max possible int. 1165 * the real end of the address space is determined by all of 1166 * the devices in the system 1167 */ 1168 fs_info->btree_inode->i_size = OFFSET_MAX; 1169 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 1170 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; 1171 1172 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1173 fs_info->btree_inode->i_mapping, 1174 GFP_NOFS); 1175 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, 1176 GFP_NOFS); 1177 1178 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; 1179 1180 extent_io_tree_init(&fs_info->free_space_cache, 1181 fs_info->btree_inode->i_mapping, GFP_NOFS); 1182 extent_io_tree_init(&fs_info->block_group_cache, 1183 fs_info->btree_inode->i_mapping, GFP_NOFS); 1184 extent_io_tree_init(&fs_info->pinned_extents, 1185 fs_info->btree_inode->i_mapping, GFP_NOFS); 1186 extent_io_tree_init(&fs_info->pending_del, 1187 fs_info->btree_inode->i_mapping, GFP_NOFS); 1188 extent_io_tree_init(&fs_info->extent_ins, 1189 fs_info->btree_inode->i_mapping, GFP_NOFS); 1190 fs_info->do_barriers = 1; 1191 1192 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) 1193 INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); 1194 #else 1195 INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner); 1196 #endif 1197 BTRFS_I(fs_info->btree_inode)->root = tree_root; 1198 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 1199 sizeof(struct btrfs_key)); 1200 insert_inode_hash(fs_info->btree_inode); 1201 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 1202 1203 mutex_init(&fs_info->trans_mutex); 1204 mutex_init(&fs_info->drop_mutex); 1205 mutex_init(&fs_info->alloc_mutex); 1206 mutex_init(&fs_info->chunk_mutex); 1207 1208 #if 0 1209 ret = add_hasher(fs_info, "crc32c"); 1210 if (ret) { 1211 printk("btrfs: failed hash setup, modprobe cryptomgr?\n"); 1212 err = -ENOMEM; 1213 goto fail_iput; 1214 } 1215 #endif 1216 __setup_root(4096, 4096, 4096, 4096, tree_root, 1217 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1218 1219 1220 bh = __bread(fs_devices->latest_bdev, 1221 BTRFS_SUPER_INFO_OFFSET / 4096, 4096); 1222 if (!bh) 1223 goto fail_iput; 1224 1225 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1226 brelse(bh); 1227 1228 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 1229 1230 disk_super = &fs_info->super_copy; 1231 if (!btrfs_super_root(disk_super)) 1232 goto fail_sb_buffer; 1233 1234 err = btrfs_parse_options(tree_root, options); 1235 if (err) 1236 goto fail_sb_buffer; 1237 1238 /* 1239 * we need to start all the end_io workers up front because the 1240 * queue work function gets called at interrupt time, and so it 1241 * cannot dynamically grow. 1242 */ 1243 btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); 1244 btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); 1245 btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1246 btrfs_start_workers(&fs_info->workers, 1); 1247 btrfs_start_workers(&fs_info->submit_workers, 1); 1248 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); 1249 1250 1251 err = -EINVAL; 1252 if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) { 1253 printk("Btrfs: wanted %llu devices, but found %llu\n", 1254 (unsigned long long)btrfs_super_num_devices(disk_super), 1255 (unsigned long long)fs_devices->open_devices); 1256 if (btrfs_test_opt(tree_root, DEGRADED)) 1257 printk("continuing in degraded mode\n"); 1258 else { 1259 goto fail_sb_buffer; 1260 } 1261 } 1262 1263 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1264 1265 nodesize = btrfs_super_nodesize(disk_super); 1266 leafsize = btrfs_super_leafsize(disk_super); 1267 sectorsize = btrfs_super_sectorsize(disk_super); 1268 stripesize = btrfs_super_stripesize(disk_super); 1269 tree_root->nodesize = nodesize; 1270 tree_root->leafsize = leafsize; 1271 tree_root->sectorsize = sectorsize; 1272 tree_root->stripesize = stripesize; 1273 1274 sb->s_blocksize = sectorsize; 1275 sb->s_blocksize_bits = blksize_bits(sectorsize); 1276 1277 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, 1278 sizeof(disk_super->magic))) { 1279 printk("btrfs: valid FS not found on %s\n", sb->s_id); 1280 goto fail_sb_buffer; 1281 } 1282 1283 mutex_lock(&fs_info->chunk_mutex); 1284 ret = btrfs_read_sys_array(tree_root); 1285 mutex_unlock(&fs_info->chunk_mutex); 1286 if (ret) { 1287 printk("btrfs: failed to read the system array on %s\n", 1288 sb->s_id); 1289 goto fail_sys_array; 1290 } 1291 1292 blocksize = btrfs_level_size(tree_root, 1293 btrfs_super_chunk_root_level(disk_super)); 1294 1295 __setup_root(nodesize, leafsize, sectorsize, stripesize, 1296 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); 1297 1298 chunk_root->node = read_tree_block(chunk_root, 1299 btrfs_super_chunk_root(disk_super), 1300 blocksize, 0); 1301 BUG_ON(!chunk_root->node); 1302 1303 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, 1304 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 1305 BTRFS_UUID_SIZE); 1306 1307 mutex_lock(&fs_info->chunk_mutex); 1308 ret = btrfs_read_chunk_tree(chunk_root); 1309 mutex_unlock(&fs_info->chunk_mutex); 1310 BUG_ON(ret); 1311 1312 btrfs_close_extra_devices(fs_devices); 1313 1314 blocksize = btrfs_level_size(tree_root, 1315 btrfs_super_root_level(disk_super)); 1316 1317 1318 tree_root->node = read_tree_block(tree_root, 1319 btrfs_super_root(disk_super), 1320 blocksize, 0); 1321 if (!tree_root->node) 1322 goto fail_sb_buffer; 1323 1324 1325 ret = find_and_setup_root(tree_root, fs_info, 1326 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 1327 if (ret) 1328 goto fail_tree_root; 1329 extent_root->track_dirty = 1; 1330 1331 ret = find_and_setup_root(tree_root, fs_info, 1332 BTRFS_DEV_TREE_OBJECTID, dev_root); 1333 dev_root->track_dirty = 1; 1334 1335 if (ret) 1336 goto fail_extent_root; 1337 1338 btrfs_read_block_groups(extent_root); 1339 1340 fs_info->generation = btrfs_super_generation(disk_super) + 1; 1341 fs_info->data_alloc_profile = (u64)-1; 1342 fs_info->metadata_alloc_profile = (u64)-1; 1343 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1344 1345 return tree_root; 1346 1347 fail_extent_root: 1348 free_extent_buffer(extent_root->node); 1349 fail_tree_root: 1350 free_extent_buffer(tree_root->node); 1351 fail_sys_array: 1352 fail_sb_buffer: 1353 extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); 1354 btrfs_stop_workers(&fs_info->workers); 1355 btrfs_stop_workers(&fs_info->endio_workers); 1356 btrfs_stop_workers(&fs_info->submit_workers); 1357 fail_iput: 1358 iput(fs_info->btree_inode); 1359 fail: 1360 btrfs_close_devices(fs_info->fs_devices); 1361 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1362 1363 kfree(extent_root); 1364 kfree(tree_root); 1365 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1366 bdi_destroy(&fs_info->bdi); 1367 #endif 1368 kfree(fs_info); 1369 return ERR_PTR(err); 1370 } 1371 1372 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 1373 { 1374 char b[BDEVNAME_SIZE]; 1375 1376 if (uptodate) { 1377 set_buffer_uptodate(bh); 1378 } else { 1379 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 1380 printk(KERN_WARNING "lost page write due to " 1381 "I/O error on %s\n", 1382 bdevname(bh->b_bdev, b)); 1383 } 1384 /* note, we dont' set_buffer_write_io_error because we have 1385 * our own ways of dealing with the IO errors 1386 */ 1387 clear_buffer_uptodate(bh); 1388 } 1389 unlock_buffer(bh); 1390 put_bh(bh); 1391 } 1392 1393 int write_all_supers(struct btrfs_root *root) 1394 { 1395 struct list_head *cur; 1396 struct list_head *head = &root->fs_info->fs_devices->devices; 1397 struct btrfs_device *dev; 1398 struct btrfs_super_block *sb; 1399 struct btrfs_dev_item *dev_item; 1400 struct buffer_head *bh; 1401 int ret; 1402 int do_barriers; 1403 int max_errors; 1404 int total_errors = 0; 1405 u32 crc; 1406 u64 flags; 1407 1408 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 do_barriers = !btrfs_test_opt(root, NOBARRIER); 1410 1411 sb = &root->fs_info->super_for_commit; 1412 dev_item = &sb->dev_item; 1413 list_for_each(cur, head) { 1414 dev = list_entry(cur, struct btrfs_device, dev_list); 1415 if (!dev->bdev) { 1416 total_errors++; 1417 continue; 1418 } 1419 if (!dev->in_fs_metadata) 1420 continue; 1421 1422 btrfs_set_stack_device_type(dev_item, dev->type); 1423 btrfs_set_stack_device_id(dev_item, dev->devid); 1424 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); 1425 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); 1426 btrfs_set_stack_device_io_align(dev_item, dev->io_align); 1427 btrfs_set_stack_device_io_width(dev_item, dev->io_width); 1428 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); 1429 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); 1430 flags = btrfs_super_flags(sb); 1431 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); 1432 1433 1434 crc = ~(u32)0; 1435 crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc, 1436 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 1437 btrfs_csum_final(crc, sb->csum); 1438 1439 bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 1440 BTRFS_SUPER_INFO_SIZE); 1441 1442 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); 1443 dev->pending_io = bh; 1444 1445 get_bh(bh); 1446 set_buffer_uptodate(bh); 1447 lock_buffer(bh); 1448 bh->b_end_io = btrfs_end_buffer_write_sync; 1449 1450 if (do_barriers && dev->barriers) { 1451 ret = submit_bh(WRITE_BARRIER, bh); 1452 if (ret == -EOPNOTSUPP) { 1453 printk("btrfs: disabling barriers on dev %s\n", 1454 dev->name); 1455 set_buffer_uptodate(bh); 1456 dev->barriers = 0; 1457 get_bh(bh); 1458 lock_buffer(bh); 1459 ret = submit_bh(WRITE, bh); 1460 } 1461 } else { 1462 ret = submit_bh(WRITE, bh); 1463 } 1464 if (ret) 1465 total_errors++; 1466 } 1467 if (total_errors > max_errors) { 1468 printk("btrfs: %d errors while writing supers\n", total_errors); 1469 BUG(); 1470 } 1471 total_errors = 0; 1472 1473 list_for_each(cur, head) { 1474 dev = list_entry(cur, struct btrfs_device, dev_list); 1475 if (!dev->bdev) 1476 continue; 1477 if (!dev->in_fs_metadata) 1478 continue; 1479 1480 BUG_ON(!dev->pending_io); 1481 bh = dev->pending_io; 1482 wait_on_buffer(bh); 1483 if (!buffer_uptodate(dev->pending_io)) { 1484 if (do_barriers && dev->barriers) { 1485 printk("btrfs: disabling barriers on dev %s\n", 1486 dev->name); 1487 set_buffer_uptodate(bh); 1488 get_bh(bh); 1489 lock_buffer(bh); 1490 dev->barriers = 0; 1491 ret = submit_bh(WRITE, bh); 1492 BUG_ON(ret); 1493 wait_on_buffer(bh); 1494 if (!buffer_uptodate(bh)) 1495 total_errors++; 1496 } else { 1497 total_errors++; 1498 } 1499 1500 } 1501 dev->pending_io = NULL; 1502 brelse(bh); 1503 } 1504 if (total_errors > max_errors) { 1505 printk("btrfs: %d errors while writing supers\n", total_errors); 1506 BUG(); 1507 } 1508 return 0; 1509 } 1510 1511 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root 1512 *root) 1513 { 1514 int ret; 1515 1516 ret = write_all_supers(root); 1517 return ret; 1518 } 1519 1520 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 1521 { 1522 radix_tree_delete(&fs_info->fs_roots_radix, 1523 (unsigned long)root->root_key.objectid); 1524 if (root->in_sysfs) 1525 btrfs_sysfs_del_root(root); 1526 if (root->inode) 1527 iput(root->inode); 1528 if (root->node) 1529 free_extent_buffer(root->node); 1530 if (root->commit_root) 1531 free_extent_buffer(root->commit_root); 1532 if (root->name) 1533 kfree(root->name); 1534 kfree(root); 1535 return 0; 1536 } 1537 1538 static int del_fs_roots(struct btrfs_fs_info *fs_info) 1539 { 1540 int ret; 1541 struct btrfs_root *gang[8]; 1542 int i; 1543 1544 while(1) { 1545 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 1546 (void **)gang, 0, 1547 ARRAY_SIZE(gang)); 1548 if (!ret) 1549 break; 1550 for (i = 0; i < ret; i++) 1551 btrfs_free_fs_root(fs_info, gang[i]); 1552 } 1553 return 0; 1554 } 1555 1556 int close_ctree(struct btrfs_root *root) 1557 { 1558 int ret; 1559 struct btrfs_trans_handle *trans; 1560 struct btrfs_fs_info *fs_info = root->fs_info; 1561 1562 fs_info->closing = 1; 1563 smp_mb(); 1564 1565 btrfs_transaction_flush_work(root); 1566 btrfs_defrag_dirty_roots(root->fs_info); 1567 trans = btrfs_start_transaction(root, 1); 1568 ret = btrfs_commit_transaction(trans, root); 1569 /* run commit again to drop the original snapshot */ 1570 trans = btrfs_start_transaction(root, 1); 1571 btrfs_commit_transaction(trans, root); 1572 ret = btrfs_write_and_wait_transaction(NULL, root); 1573 BUG_ON(ret); 1574 1575 write_ctree_super(NULL, root); 1576 1577 btrfs_transaction_flush_work(root); 1578 1579 if (fs_info->delalloc_bytes) { 1580 printk("btrfs: at unmount delalloc count %Lu\n", 1581 fs_info->delalloc_bytes); 1582 } 1583 if (fs_info->extent_root->node) 1584 free_extent_buffer(fs_info->extent_root->node); 1585 1586 if (fs_info->tree_root->node) 1587 free_extent_buffer(fs_info->tree_root->node); 1588 1589 if (root->fs_info->chunk_root->node); 1590 free_extent_buffer(root->fs_info->chunk_root->node); 1591 1592 if (root->fs_info->dev_root->node); 1593 free_extent_buffer(root->fs_info->dev_root->node); 1594 1595 btrfs_free_block_groups(root->fs_info); 1596 del_fs_roots(fs_info); 1597 1598 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 1599 1600 extent_io_tree_empty_lru(&fs_info->free_space_cache); 1601 extent_io_tree_empty_lru(&fs_info->block_group_cache); 1602 extent_io_tree_empty_lru(&fs_info->pinned_extents); 1603 extent_io_tree_empty_lru(&fs_info->pending_del); 1604 extent_io_tree_empty_lru(&fs_info->extent_ins); 1605 extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); 1606 1607 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); 1608 1609 btrfs_stop_workers(&fs_info->workers); 1610 btrfs_stop_workers(&fs_info->endio_workers); 1611 btrfs_stop_workers(&fs_info->submit_workers); 1612 1613 iput(fs_info->btree_inode); 1614 #if 0 1615 while(!list_empty(&fs_info->hashers)) { 1616 struct btrfs_hasher *hasher; 1617 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, 1618 hashers); 1619 list_del(&hasher->hashers); 1620 crypto_free_hash(&fs_info->hash_tfm); 1621 kfree(hasher); 1622 } 1623 #endif 1624 btrfs_close_devices(fs_info->fs_devices); 1625 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1626 1627 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1628 bdi_destroy(&fs_info->bdi); 1629 #endif 1630 1631 kfree(fs_info->extent_root); 1632 kfree(fs_info->tree_root); 1633 kfree(fs_info->chunk_root); 1634 kfree(fs_info->dev_root); 1635 return 0; 1636 } 1637 1638 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) 1639 { 1640 int ret; 1641 struct inode *btree_inode = buf->first_page->mapping->host; 1642 1643 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); 1644 if (!ret) 1645 return ret; 1646 1647 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, 1648 parent_transid); 1649 return !ret; 1650 } 1651 1652 int btrfs_set_buffer_uptodate(struct extent_buffer *buf) 1653 { 1654 struct inode *btree_inode = buf->first_page->mapping->host; 1655 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, 1656 buf); 1657 } 1658 1659 void btrfs_mark_buffer_dirty(struct extent_buffer *buf) 1660 { 1661 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1662 u64 transid = btrfs_header_generation(buf); 1663 struct inode *btree_inode = root->fs_info->btree_inode; 1664 1665 WARN_ON(!btrfs_tree_locked(buf)); 1666 if (transid != root->fs_info->generation) { 1667 printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", 1668 (unsigned long long)buf->start, 1669 transid, root->fs_info->generation); 1670 WARN_ON(1); 1671 } 1672 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); 1673 } 1674 1675 void btrfs_throttle(struct btrfs_root *root) 1676 { 1677 struct backing_dev_info *bdi; 1678 1679 bdi = &root->fs_info->bdi; 1680 if (atomic_read(&root->fs_info->throttles) && 1681 bdi_write_congested(bdi)) { 1682 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18) 1683 congestion_wait(WRITE, HZ/20); 1684 #else 1685 blk_congestion_wait(WRITE, HZ/20); 1686 #endif 1687 } 1688 } 1689 1690 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 1691 { 1692 /* 1693 * looks as though older kernels can get into trouble with 1694 * this code, they end up stuck in balance_dirty_pages forever 1695 */ 1696 struct extent_io_tree *tree; 1697 u64 num_dirty; 1698 u64 start = 0; 1699 unsigned long thresh = 16 * 1024 * 1024; 1700 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 1701 1702 if (current_is_pdflush()) 1703 return; 1704 1705 num_dirty = count_range_bits(tree, &start, (u64)-1, 1706 thresh, EXTENT_DIRTY); 1707 if (num_dirty > thresh) { 1708 balance_dirty_pages_ratelimited_nr( 1709 root->fs_info->btree_inode->i_mapping, 1); 1710 } 1711 return; 1712 } 1713 1714 void btrfs_set_buffer_defrag(struct extent_buffer *buf) 1715 { 1716 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1717 struct inode *btree_inode = root->fs_info->btree_inode; 1718 set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, 1719 buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); 1720 } 1721 1722 void btrfs_set_buffer_defrag_done(struct extent_buffer *buf) 1723 { 1724 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1725 struct inode *btree_inode = root->fs_info->btree_inode; 1726 set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, 1727 buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, 1728 GFP_NOFS); 1729 } 1730 1731 int btrfs_buffer_defrag(struct extent_buffer *buf) 1732 { 1733 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1734 struct inode *btree_inode = root->fs_info->btree_inode; 1735 return test_range_bit(&BTRFS_I(btree_inode)->io_tree, 1736 buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0); 1737 } 1738 1739 int btrfs_buffer_defrag_done(struct extent_buffer *buf) 1740 { 1741 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1742 struct inode *btree_inode = root->fs_info->btree_inode; 1743 return test_range_bit(&BTRFS_I(btree_inode)->io_tree, 1744 buf->start, buf->start + buf->len - 1, 1745 EXTENT_DEFRAG_DONE, 0); 1746 } 1747 1748 int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf) 1749 { 1750 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1751 struct inode *btree_inode = root->fs_info->btree_inode; 1752 return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, 1753 buf->start, buf->start + buf->len - 1, 1754 EXTENT_DEFRAG_DONE, GFP_NOFS); 1755 } 1756 1757 int btrfs_clear_buffer_defrag(struct extent_buffer *buf) 1758 { 1759 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1760 struct inode *btree_inode = root->fs_info->btree_inode; 1761 return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, 1762 buf->start, buf->start + buf->len - 1, 1763 EXTENT_DEFRAG, GFP_NOFS); 1764 } 1765 1766 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 1767 { 1768 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 1769 int ret; 1770 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 1771 if (ret == 0) { 1772 buf->flags |= EXTENT_UPTODATE; 1773 } 1774 return ret; 1775 } 1776 1777 static struct extent_io_ops btree_extent_io_ops = { 1778 .writepage_io_hook = btree_writepage_io_hook, 1779 .readpage_end_io_hook = btree_readpage_end_io_hook, 1780 .submit_bio_hook = btree_submit_bio_hook, 1781 /* note we're sharing with inode.c for the merge bio hook */ 1782 .merge_bio_hook = btrfs_merge_bio_hook, 1783 }; 1784