/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
		root->fs_info->generation++;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root.rb_node = NULL;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				    root->fs_info->btree_inode->i_mapping,
				    GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}
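
/*
 * Illustrative sketch (not taken from any particular caller): code that
 * modifies the btree outside of the commit path brackets its work with a
 * transaction handle, roughly:
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	(btree modifications that pass trans around)
 *	btrfs_end_transaction(trans, root);
 *
 * join_transaction() above is the low-level piece of that sequence: it is
 * called with fs_info->trans_mutex held and either creates the running
 * transaction or bumps its writer and join counters.
 */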

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
	struct btrfs_dirty_root *dirty;
	u64 running_trans_id = root->fs_info->running_transaction->transid;
	if (root->ref_cows && root->last_trans < running_trans_id) {
		WARN_ON(root == root->fs_info->extent_root);
		if (root->root_item.refs != 0) {
			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);

			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
			BUG_ON(!dirty);
			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
			BUG_ON(!dirty->root);
			dirty->latest_root = root;
			INIT_LIST_HEAD(&dirty->list);

			root->commit_root = btrfs_root_node(root);

			memcpy(dirty->root, root, sizeof(*root));
			spin_lock_init(&dirty->root->node_lock);
			spin_lock_init(&dirty->root->list_lock);
			mutex_init(&dirty->root->objectid_mutex);
			mutex_init(&dirty->root->log_mutex);
			INIT_LIST_HEAD(&dirty->root->dead_list);
			dirty->root->node = root->commit_root;
			dirty->root->commit_root = NULL;

			spin_lock(&root->list_lock);
			list_add(&dirty->root->dead_list, &root->dead_list);
			spin_unlock(&root->list_lock);

			root->dirty_root = dirty;
		} else {
			WARN_ON(1);
		}
		root->last_trans = running_trans_id;
	}
	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
					     int num_blocks, int wait)
{
	struct btrfs_trans_handle *h =
		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->log_root_recovering &&
	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
		wait_current_trans(root);
	ret = join_transaction(root);
	BUG_ON(ret);

	btrfs_record_root_in_trans(root);
	h->transid = root->fs_info->running_transaction->transid;
	h->transaction = root->fs_info->running_transaction;
	h->blocks_reserved = num_blocks;
	h->blocks_used = 0;
	h->block_group = 0;
	h->alloc_exclude_nr = 0;
	h->alloc_exclude_start = 0;
	h->delayed_ref_updates = 0;

	root->fs_info->running_transaction->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);
	return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
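
/*
 * btrfs_start_transaction() above and the two variants below differ only
 * in the wait argument passed to start_transaction(): 1 waits for a
 * blocked commit unless a userland ioctl transaction is open, 0
 * (btrfs_join_transaction) never waits, and 2
 * (btrfs_start_ioctl_transaction) always waits.  None of them wait while
 * the log tree is being replayed at mount time.
 */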

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						  int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}

void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
	throttle_on_drops(root);
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		throttle_on_drops(root);

	return 0;
}
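
/*
 * Note on the loop at the top of __btrfs_end_transaction(): delayed ref
 * updates accumulated by this handle are run in bounded batches (at most
 * four rounds, and only while more than 64 ref heads are ready) so no
 * single writer ends up doing all of the ref processing.  Once the
 * transaction is flushing for commit, the count is set to zero, which
 * tells btrfs_run_delayed_refs() to run everything.
 */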

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	while (1) {
		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages);
}
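
/*
 * btrfs_write_and_wait_marked_extents() above makes two passes over the
 * dirty_pages tree: the first pass starts writeback on every btree page in
 * the marked ranges, and the second clears the EXTENT_DIRTY bits and waits
 * (rewriting pages that were redirtied in the meantime), so all of the IO
 * is issued before anything blocks waiting for it.
 */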

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	btrfs_write_dirty_block_groups(trans, root);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start)
			break;
		btrfs_set_root_bytenr(&root->root_item,
				      root->node->start);
		btrfs_set_root_level(&root->root_item,
				     btrfs_header_level(root->node));
		btrfs_set_root_generation(&root->root_item, trans->transid);

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);
		btrfs_write_dirty_block_groups(trans, root);

		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
		BUG_ON(ret);
	}
	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);

		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
		BUG_ON(ret);
	}
	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
	struct btrfs_dirty_root *dirty;

	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
	if (!dirty)
		return -ENOMEM;
	dirty->root = root;
	dirty->latest_root = latest;

	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&dirty->list, &latest->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}
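
/*
 * Roots placed on fs_info->dead_roots, either here or via the dirty root
 * tracking in add_dirty_roots() below, are not freed right away; they are
 * reaped later by drop_dirty_roots(), normally through
 * btrfs_clean_old_snapshots() at the bottom of this file.
 */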

/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
				    struct radix_tree_root *radix,
				    struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_root *gang[8];
	struct btrfs_root *root;
	int i;
	int ret;
	int err = 0;
	u32 refs;

	while (1) {
		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(radix,
				     (unsigned long)root->root_key.objectid,
				     BTRFS_ROOT_TRANS_TAG);

			BUG_ON(!root->ref_tree);
			dirty = root->dirty_root;

			btrfs_free_log(trans, root);
			btrfs_free_reloc_root(trans, root);

			if (root->commit_root == root->node) {
				WARN_ON(root->node->start !=
					btrfs_root_bytenr(&root->root_item));

				free_extent_buffer(root->commit_root);
				root->commit_root = NULL;
				root->dirty_root = NULL;

				spin_lock(&root->list_lock);
				list_del_init(&dirty->root->dead_list);
				spin_unlock(&root->list_lock);

				kfree(dirty->root);
				kfree(dirty);

				/* make sure to update the root on disk
				 * so we get any updates to the block used
				 * counts
				 */
				err = btrfs_update_root(trans,
						root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
				continue;
			}

			memset(&root->root_item.drop_progress, 0,
			       sizeof(struct btrfs_disk_key));
			root->root_item.drop_level = 0;
			root->commit_root = NULL;
			root->dirty_root = NULL;
			root->root_key.offset = root->fs_info->generation;
			btrfs_set_root_bytenr(&root->root_item,
					      root->node->start);
			btrfs_set_root_level(&root->root_item,
					     btrfs_header_level(root->node));
			btrfs_set_root_generation(&root->root_item,
						  root->root_key.offset);

			err = btrfs_insert_root(trans, root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;

			refs = btrfs_root_refs(&dirty->root->root_item);
			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
			err = btrfs_update_root(trans, root->fs_info->tree_root,
						&dirty->root->root_key,
						&dirty->root->root_item);

			BUG_ON(err);
			if (refs == 1) {
				list_add(&dirty->list, list);
			} else {
				WARN_ON(1);
				free_extent_buffer(dirty->root->node);
				kfree(dirty->root);
				kfree(dirty);
			}
		}
	}
	return err;
}
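
/*
 * Roughly speaking, after add_dirty_roots() a reference counted root that
 * changed in this transaction has two root items: a new one keyed by the
 * current generation pointing at the new root node, and the old one, which
 * keeps the pre-transaction root pointer, loses one reference, and is put
 * on the drop list once that count reaches zero.
 */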

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}

/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);

		atomic_dec(&info->throttles);
		wake_up(&info->transaction_throttle);

		schedule();

		atomic_inc(&info->throttles);
		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}
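
/*
 * The waiter above gives up its throttles count and wakes
 * transaction_throttle before sleeping, so other writers are not throttled
 * against a snapshot dropper that is itself blocked waiting for the commit
 * flush to finish.
 */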

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
				     struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 num_bytes;
	u64 bytes_used;
	u64 max_useless;
	int ret = 0;
	int err;

	while (!list_empty(list)) {
		struct btrfs_root *root;

		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
		list_del_init(&dirty->list);

		num_bytes = btrfs_root_used(&dirty->root->root_item);
		root = dirty->latest_root;
		atomic_inc(&root->fs_info->throttles);

		while (1) {
			/*
			 * we don't want to jump in and create a bunch of
			 * delayed refs if the transaction is starting to close
			 */
			wait_transaction_pre_flush(tree_root->fs_info);
			trans = btrfs_start_transaction(tree_root, 1);

			/*
			 * we've joined a transaction, make sure it isn't
			 * closing right now
			 */
			if (trans->transaction->delayed_refs.flushing) {
				btrfs_end_transaction(trans, tree_root);
				continue;
			}

			mutex_lock(&root->fs_info->drop_mutex);
			ret = btrfs_drop_snapshot(trans, dirty->root);
			if (ret != -EAGAIN)
				break;
			mutex_unlock(&root->fs_info->drop_mutex);

			err = btrfs_update_root(trans,
					tree_root,
					&dirty->root->root_key,
					&dirty->root->root_item);
			if (err)
				ret = err;
			nr = trans->blocks_used;
			ret = btrfs_end_transaction(trans, tree_root);
			BUG_ON(ret);

			btrfs_btree_balance_dirty(tree_root, nr);
			cond_resched();
		}
		BUG_ON(ret);
		atomic_dec(&root->fs_info->throttles);
		wake_up(&root->fs_info->transaction_throttle);

		num_bytes -= btrfs_root_used(&dirty->root->root_item);
		bytes_used = btrfs_root_used(&root->root_item);
		if (num_bytes) {
			mutex_lock(&root->fs_info->trans_mutex);
			btrfs_record_root_in_trans(root);
			mutex_unlock(&root->fs_info->trans_mutex);
			btrfs_set_root_used(&root->root_item,
					    bytes_used - num_bytes);
		}

		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
		if (ret) {
			BUG();
			break;
		}
		mutex_unlock(&root->fs_info->drop_mutex);

		spin_lock(&root->list_lock);
		list_del_init(&dirty->root->dead_list);
		if (!list_empty(&root->dead_list)) {
			struct btrfs_root *oldest;
			oldest = list_entry(root->dead_list.prev,
					    struct btrfs_root, dead_list);
			max_useless = oldest->root_key.offset - 1;
		} else {
			max_useless = root->root_key.offset - 1;
		}
		spin_unlock(&root->list_lock);

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
		BUG_ON(ret);

		free_extent_buffer(dirty->root->node);
		kfree(dirty->root);
		kfree(dirty);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	btrfs_record_root_in_trans(root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_bytenr(new_root_item, tmp->start);
	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
	btrfs_set_root_generation(new_root_item, trans->transid);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	kfree(new_root_item);
	return ret;
}
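
/*
 * create_pending_snapshot() inserts the new root item keyed by the
 * creating transaction id, but what it records in pending->root_key uses
 * offset (u64)-1; that is the key finish_pending_snapshot() below passes
 * when it wires up the directory entry and the root refs.
 */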

static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct inode *inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_join_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
				    pending->name, namelen,
				    parent_inode->i_ino,
				    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	/* add the backref first */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 BTRFS_ROOT_BACKREF_KEY,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

	/* now add the forward ref */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 parent_root->root_key.objectid,
				 BTRFS_ROOT_REF_KEY,
				 pending->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
	d_instantiate(pending->dentry, inode);
fail:
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	while (!list_empty(head)) {
		pending = list_entry(head->next,
				     struct btrfs_pending_snapshot, list);
		ret = finish_pending_snapshot(fs_info, pending);
		BUG_ON(ret);
		list_del(&pending->list);
		kfree(pending->name);
		kfree(pending);
	}
	return 0;
}
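
/*
 * Snapshot creation is split into two phases: create_pending_snapshots()
 * runs inside the commit critical section, while no other writers are
 * active, and copies the root nodes; finish_pending_snapshots() runs after
 * the new super has been written and adds the directory entries and root
 * refs from a freshly joined transaction.
 */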

int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct list_head dirty_fs_roots;
	struct extent_io_tree *pinned_copy;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	INIT_LIST_HEAD(&dirty_fs_roots);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
	if (!pinned_copy)
		return -ENOMEM;

	extent_io_tree_init(pinned_copy,
			    root->fs_info->btree_inode->i_mapping, GFP_NOFS);

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else if (should_grow)
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (flush_on_commit || snap_pending) {
			if (flush_on_commit)
				btrfs_start_delalloc_inodes(root);
			ret = btrfs_wait_ordered_extents(root, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		smp_mb();
		if (cur_trans->num_writers > 1 || should_grow)
			schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));
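
	/*
	 * the loop above exits once this task is the only remaining writer
	 * and, when should_grow is set, once no new writers managed to
	 * join during the last sleep, i.e. the transaction has stopped
	 * growing.
	 */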

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);
	/*
	 * keep tree reloc code from adding new reloc trees
	 */
	mutex_lock(&root->fs_info->tree_reloc_mutex);

	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
			      &dirty_fs_roots);
	BUG_ON(ret);

	/* add_dirty_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = btrfs_commit_tree_roots(trans, root);
	BUG_ON(ret);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);
	btrfs_set_super_generation(&root->fs_info->super_copy,
				   cur_trans->transid);
	btrfs_set_super_root(&root->fs_info->super_copy,
			     root->fs_info->tree_root->node->start);
	btrfs_set_super_root_level(&root->fs_info->super_copy,
			   btrfs_header_level(root->fs_info->tree_root->node));

	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
				   chunk_root->node->start);
	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
					 btrfs_header_level(chunk_root->node));
	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
				btrfs_header_generation(chunk_root->node));

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	btrfs_copy_pinned(root, pinned_copy);

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_throttle);
	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root, pinned_copy);
	kfree(pinned_copy);

	btrfs_drop_dead_reloc_roots(root);
	mutex_unlock(&root->fs_info->tree_reloc_mutex);

	/* do the directory inserts of any pending snapshot creations */
	finish_pending_snapshots(trans, root->fs_info);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;
	wake_up(&cur_trans->commit_wait);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
	if (root->fs_info->closing)
		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);

	mutex_unlock(&root->fs_info->trans_mutex);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (root->fs_info->closing)
		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
	return ret;
}
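
/*
 * Dead roots produced by the commit above are spliced onto
 * fs_info->dead_roots and reaped later; btrfs_clean_old_snapshots() below
 * is the entry point that keeps handing them to drop_dirty_roots() until
 * the list stays empty.
 */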

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	struct list_head dirty_roots;
	INIT_LIST_HEAD(&dirty_roots);
again:
	mutex_lock(&root->fs_info->trans_mutex);
	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
	mutex_unlock(&root->fs_info->trans_mutex);

	if (!list_empty(&dirty_roots)) {
		drop_dirty_roots(root, &dirty_roots);
		goto again;
	}
	return 0;
}