1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/fs.h> 20 #include <linux/slab.h> 21 #include <linux/sched.h> 22 #include <linux/writeback.h> 23 #include <linux/pagemap.h> 24 #include <linux/blkdev.h> 25 #include <linux/uuid.h> 26 #include "ctree.h" 27 #include "disk-io.h" 28 #include "transaction.h" 29 #include "locking.h" 30 #include "tree-log.h" 31 #include "inode-map.h" 32 #include "volumes.h" 33 #include "dev-replace.h" 34 #include "qgroup.h" 35 36 #define BTRFS_ROOT_TRANS_TAG 0 37 38 static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { 39 [TRANS_STATE_RUNNING] = 0U, 40 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | 41 __TRANS_START), 42 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | 43 __TRANS_START | 44 __TRANS_ATTACH), 45 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | 46 __TRANS_START | 47 __TRANS_ATTACH | 48 __TRANS_JOIN), 49 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | 50 __TRANS_START | 51 __TRANS_ATTACH | 52 __TRANS_JOIN | 53 __TRANS_JOIN_NOLOCK), 54 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | 55 __TRANS_START | 56 __TRANS_ATTACH | 57 __TRANS_JOIN | 58 __TRANS_JOIN_NOLOCK), 59 }; 60 61 void btrfs_put_transaction(struct btrfs_transaction *transaction) 62 { 63 WARN_ON(atomic_read(&transaction->use_count) == 0); 64 if (atomic_dec_and_test(&transaction->use_count)) { 65 BUG_ON(!list_empty(&transaction->list)); 66 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); 67 if (transaction->delayed_refs.pending_csums) 68 btrfs_err(transaction->fs_info, 69 "pending csums is %llu", 70 transaction->delayed_refs.pending_csums); 71 while (!list_empty(&transaction->pending_chunks)) { 72 struct extent_map *em; 73 74 em = list_first_entry(&transaction->pending_chunks, 75 struct extent_map, list); 76 list_del_init(&em->list); 77 free_extent_map(em); 78 } 79 /* 80 * If any block groups are found in ->deleted_bgs then it's 81 * because the transaction was aborted and a commit did not 82 * happen (things failed before writing the new superblock 83 * and calling btrfs_finish_extent_commit()), so we can not 84 * discard the physical locations of the block groups. 85 */ 86 while (!list_empty(&transaction->deleted_bgs)) { 87 struct btrfs_block_group_cache *cache; 88 89 cache = list_first_entry(&transaction->deleted_bgs, 90 struct btrfs_block_group_cache, 91 bg_list); 92 list_del_init(&cache->bg_list); 93 btrfs_put_block_group_trimming(cache); 94 btrfs_put_block_group(cache); 95 } 96 kmem_cache_free(btrfs_transaction_cachep, transaction); 97 } 98 } 99 100 static void clear_btree_io_tree(struct extent_io_tree *tree) 101 { 102 spin_lock(&tree->lock); 103 /* 104 * Do a single barrier for the waitqueue_active check here, the state 105 * of the waitqueue should not change once clear_btree_io_tree is 106 * called. 
107 */ 108 smp_mb(); 109 while (!RB_EMPTY_ROOT(&tree->state)) { 110 struct rb_node *node; 111 struct extent_state *state; 112 113 node = rb_first(&tree->state); 114 state = rb_entry(node, struct extent_state, rb_node); 115 rb_erase(&state->rb_node, &tree->state); 116 RB_CLEAR_NODE(&state->rb_node); 117 /* 118 * btree io trees aren't supposed to have tasks waiting for 119 * changes in the flags of extent states ever. 120 */ 121 ASSERT(!waitqueue_active(&state->wq)); 122 free_extent_state(state); 123 124 cond_resched_lock(&tree->lock); 125 } 126 spin_unlock(&tree->lock); 127 } 128 129 static noinline void switch_commit_roots(struct btrfs_transaction *trans, 130 struct btrfs_fs_info *fs_info) 131 { 132 struct btrfs_root *root, *tmp; 133 134 down_write(&fs_info->commit_root_sem); 135 list_for_each_entry_safe(root, tmp, &trans->switch_commits, 136 dirty_list) { 137 list_del_init(&root->dirty_list); 138 free_extent_buffer(root->commit_root); 139 root->commit_root = btrfs_root_node(root); 140 if (is_fstree(root->objectid)) 141 btrfs_unpin_free_ino(root); 142 clear_btree_io_tree(&root->dirty_log_pages); 143 } 144 145 /* We can free old roots now. */ 146 spin_lock(&trans->dropped_roots_lock); 147 while (!list_empty(&trans->dropped_roots)) { 148 root = list_first_entry(&trans->dropped_roots, 149 struct btrfs_root, root_list); 150 list_del_init(&root->root_list); 151 spin_unlock(&trans->dropped_roots_lock); 152 btrfs_drop_and_free_fs_root(fs_info, root); 153 spin_lock(&trans->dropped_roots_lock); 154 } 155 spin_unlock(&trans->dropped_roots_lock); 156 up_write(&fs_info->commit_root_sem); 157 } 158 159 static inline void extwriter_counter_inc(struct btrfs_transaction *trans, 160 unsigned int type) 161 { 162 if (type & TRANS_EXTWRITERS) 163 atomic_inc(&trans->num_extwriters); 164 } 165 166 static inline void extwriter_counter_dec(struct btrfs_transaction *trans, 167 unsigned int type) 168 { 169 if (type & TRANS_EXTWRITERS) 170 atomic_dec(&trans->num_extwriters); 171 } 172 173 static inline void extwriter_counter_init(struct btrfs_transaction *trans, 174 unsigned int type) 175 { 176 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); 177 } 178 179 static inline int extwriter_counter_read(struct btrfs_transaction *trans) 180 { 181 return atomic_read(&trans->num_extwriters); 182 } 183 184 /* 185 * either allocate a new transaction or hop into the existing one 186 */ 187 static noinline int join_transaction(struct btrfs_fs_info *fs_info, 188 unsigned int type) 189 { 190 struct btrfs_transaction *cur_trans; 191 192 spin_lock(&fs_info->trans_lock); 193 loop: 194 /* The file system has been taken offline. No new transactions. */ 195 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { 196 spin_unlock(&fs_info->trans_lock); 197 return -EROFS; 198 } 199 200 cur_trans = fs_info->running_transaction; 201 if (cur_trans) { 202 if (cur_trans->aborted) { 203 spin_unlock(&fs_info->trans_lock); 204 return cur_trans->aborted; 205 } 206 if (btrfs_blocked_trans_types[cur_trans->state] & type) { 207 spin_unlock(&fs_info->trans_lock); 208 return -EBUSY; 209 } 210 atomic_inc(&cur_trans->use_count); 211 atomic_inc(&cur_trans->num_writers); 212 extwriter_counter_inc(cur_trans, type); 213 spin_unlock(&fs_info->trans_lock); 214 return 0; 215 } 216 spin_unlock(&fs_info->trans_lock); 217 218 /* 219 * If we are ATTACH, we just want to catch the current transaction, 220 * and commit it. If there is no transaction, just return ENOENT. 
221 */ 222 if (type == TRANS_ATTACH) 223 return -ENOENT; 224 225 /* 226 * JOIN_NOLOCK only happens during the transaction commit, so 227 * it is impossible that ->running_transaction is NULL 228 */ 229 BUG_ON(type == TRANS_JOIN_NOLOCK); 230 231 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 232 if (!cur_trans) 233 return -ENOMEM; 234 235 spin_lock(&fs_info->trans_lock); 236 if (fs_info->running_transaction) { 237 /* 238 * someone started a transaction after we unlocked. Make sure 239 * to redo the checks above 240 */ 241 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 242 goto loop; 243 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { 244 spin_unlock(&fs_info->trans_lock); 245 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 246 return -EROFS; 247 } 248 249 cur_trans->fs_info = fs_info; 250 atomic_set(&cur_trans->num_writers, 1); 251 extwriter_counter_init(cur_trans, type); 252 init_waitqueue_head(&cur_trans->writer_wait); 253 init_waitqueue_head(&cur_trans->commit_wait); 254 init_waitqueue_head(&cur_trans->pending_wait); 255 cur_trans->state = TRANS_STATE_RUNNING; 256 /* 257 * One for this trans handle, one so it will live on until we 258 * commit the transaction. 259 */ 260 atomic_set(&cur_trans->use_count, 2); 261 atomic_set(&cur_trans->pending_ordered, 0); 262 cur_trans->flags = 0; 263 cur_trans->start_time = get_seconds(); 264 265 memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); 266 267 cur_trans->delayed_refs.href_root = RB_ROOT; 268 cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; 269 atomic_set(&cur_trans->delayed_refs.num_entries, 0); 270 271 /* 272 * although the tree mod log is per file system and not per transaction, 273 * the log must never go across transaction boundaries. 274 */ 275 smp_mb(); 276 if (!list_empty(&fs_info->tree_mod_seq_list)) 277 WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n"); 278 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 279 WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n"); 280 atomic64_set(&fs_info->tree_mod_seq, 0); 281 282 spin_lock_init(&cur_trans->delayed_refs.lock); 283 284 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 285 INIT_LIST_HEAD(&cur_trans->pending_chunks); 286 INIT_LIST_HEAD(&cur_trans->switch_commits); 287 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 288 INIT_LIST_HEAD(&cur_trans->io_bgs); 289 INIT_LIST_HEAD(&cur_trans->dropped_roots); 290 mutex_init(&cur_trans->cache_write_mutex); 291 cur_trans->num_dirty_bgs = 0; 292 spin_lock_init(&cur_trans->dirty_bgs_lock); 293 INIT_LIST_HEAD(&cur_trans->deleted_bgs); 294 spin_lock_init(&cur_trans->dropped_roots_lock); 295 list_add_tail(&cur_trans->list, &fs_info->trans_list); 296 extent_io_tree_init(&cur_trans->dirty_pages, 297 fs_info->btree_inode->i_mapping); 298 fs_info->generation++; 299 cur_trans->transid = fs_info->generation; 300 fs_info->running_transaction = cur_trans; 301 cur_trans->aborted = 0; 302 spin_unlock(&fs_info->trans_lock); 303 304 return 0; 305 } 306 307 /* 308 * this does all the record keeping required to make sure that a reference 309 * counted root is properly recorded in a given transaction. 
This is required 310 * to make sure the old root from before we joined the transaction is deleted 311 * when the transaction commits 312 */ 313 static int record_root_in_trans(struct btrfs_trans_handle *trans, 314 struct btrfs_root *root, 315 int force) 316 { 317 struct btrfs_fs_info *fs_info = root->fs_info; 318 319 if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && 320 root->last_trans < trans->transid) || force) { 321 WARN_ON(root == fs_info->extent_root); 322 WARN_ON(root->commit_root != root->node); 323 324 /* 325 * see below for IN_TRANS_SETUP usage rules 326 * we have the reloc mutex held now, so there 327 * is only one writer in this function 328 */ 329 set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); 330 331 /* make sure readers find IN_TRANS_SETUP before 332 * they find our root->last_trans update 333 */ 334 smp_wmb(); 335 336 spin_lock(&fs_info->fs_roots_radix_lock); 337 if (root->last_trans == trans->transid && !force) { 338 spin_unlock(&fs_info->fs_roots_radix_lock); 339 return 0; 340 } 341 radix_tree_tag_set(&fs_info->fs_roots_radix, 342 (unsigned long)root->root_key.objectid, 343 BTRFS_ROOT_TRANS_TAG); 344 spin_unlock(&fs_info->fs_roots_radix_lock); 345 root->last_trans = trans->transid; 346 347 /* this is pretty tricky. We don't want to 348 * take the relocation lock in btrfs_record_root_in_trans 349 * unless we're really doing the first setup for this root in 350 * this transaction. 351 * 352 * Normally we'd use root->last_trans as a flag to decide 353 * if we want to take the expensive mutex. 354 * 355 * But, we have to set root->last_trans before we 356 * init the relocation root, otherwise, we trip over warnings 357 * in ctree.c. The solution used here is to flag ourselves 358 * with root IN_TRANS_SETUP. When this is 1, we're still 359 * fixing up the reloc trees and everyone must wait. 360 * 361 * When this is zero, they can trust root->last_trans and fly 362 * through btrfs_record_root_in_trans without having to take the 363 * lock. 
smp_wmb() makes sure that all the writes above are 364 * done before we pop in the zero below 365 */ 366 btrfs_init_reloc_root(trans, root); 367 smp_mb__before_atomic(); 368 clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); 369 } 370 return 0; 371 } 372 373 374 void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, 375 struct btrfs_root *root) 376 { 377 struct btrfs_fs_info *fs_info = root->fs_info; 378 struct btrfs_transaction *cur_trans = trans->transaction; 379 380 /* Add ourselves to the transaction dropped list */ 381 spin_lock(&cur_trans->dropped_roots_lock); 382 list_add_tail(&root->root_list, &cur_trans->dropped_roots); 383 spin_unlock(&cur_trans->dropped_roots_lock); 384 385 /* Make sure we don't try to update the root at commit time */ 386 spin_lock(&fs_info->fs_roots_radix_lock); 387 radix_tree_tag_clear(&fs_info->fs_roots_radix, 388 (unsigned long)root->root_key.objectid, 389 BTRFS_ROOT_TRANS_TAG); 390 spin_unlock(&fs_info->fs_roots_radix_lock); 391 } 392 393 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 394 struct btrfs_root *root) 395 { 396 struct btrfs_fs_info *fs_info = root->fs_info; 397 398 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 399 return 0; 400 401 /* 402 * see record_root_in_trans for comments about IN_TRANS_SETUP usage 403 * and barriers 404 */ 405 smp_rmb(); 406 if (root->last_trans == trans->transid && 407 !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) 408 return 0; 409 410 mutex_lock(&fs_info->reloc_mutex); 411 record_root_in_trans(trans, root, 0); 412 mutex_unlock(&fs_info->reloc_mutex); 413 414 return 0; 415 } 416 417 static inline int is_transaction_blocked(struct btrfs_transaction *trans) 418 { 419 return (trans->state >= TRANS_STATE_BLOCKED && 420 trans->state < TRANS_STATE_UNBLOCKED && 421 !trans->aborted); 422 } 423 424 /* wait for commit against the current transaction to become unblocked 425 * when this is done, it is safe to start a new transaction, but the current 426 * transaction might not be fully on disk. 
427 */ 428 static void wait_current_trans(struct btrfs_fs_info *fs_info) 429 { 430 struct btrfs_transaction *cur_trans; 431 432 spin_lock(&fs_info->trans_lock); 433 cur_trans = fs_info->running_transaction; 434 if (cur_trans && is_transaction_blocked(cur_trans)) { 435 atomic_inc(&cur_trans->use_count); 436 spin_unlock(&fs_info->trans_lock); 437 438 wait_event(fs_info->transaction_wait, 439 cur_trans->state >= TRANS_STATE_UNBLOCKED || 440 cur_trans->aborted); 441 btrfs_put_transaction(cur_trans); 442 } else { 443 spin_unlock(&fs_info->trans_lock); 444 } 445 } 446 447 static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) 448 { 449 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 450 return 0; 451 452 if (type == TRANS_USERSPACE) 453 return 1; 454 455 if (type == TRANS_START && 456 !atomic_read(&fs_info->open_ioctl_trans)) 457 return 1; 458 459 return 0; 460 } 461 462 static inline bool need_reserve_reloc_root(struct btrfs_root *root) 463 { 464 struct btrfs_fs_info *fs_info = root->fs_info; 465 466 if (!fs_info->reloc_ctl || 467 !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 468 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 469 root->reloc_root) 470 return false; 471 472 return true; 473 } 474 475 static struct btrfs_trans_handle * 476 start_transaction(struct btrfs_root *root, unsigned int num_items, 477 unsigned int type, enum btrfs_reserve_flush_enum flush) 478 { 479 struct btrfs_fs_info *fs_info = root->fs_info; 480 481 struct btrfs_trans_handle *h; 482 struct btrfs_transaction *cur_trans; 483 u64 num_bytes = 0; 484 u64 qgroup_reserved = 0; 485 bool reloc_reserved = false; 486 int ret; 487 488 /* Send isn't supposed to start transactions. */ 489 ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); 490 491 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 492 return ERR_PTR(-EROFS); 493 494 if (current->journal_info) { 495 WARN_ON(type & TRANS_EXTWRITERS); 496 h = current->journal_info; 497 h->use_count++; 498 WARN_ON(h->use_count > 2); 499 h->orig_rsv = h->block_rsv; 500 h->block_rsv = NULL; 501 goto got_it; 502 } 503 504 /* 505 * Do the reservation before we join the transaction so we can do all 506 * the appropriate flushing if need be. 507 */ 508 if (num_items > 0 && root != fs_info->chunk_root) { 509 qgroup_reserved = num_items * fs_info->nodesize; 510 ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); 511 if (ret) 512 return ERR_PTR(ret); 513 514 num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); 515 /* 516 * Do the reservation for the relocation root creation 517 */ 518 if (need_reserve_reloc_root(root)) { 519 num_bytes += fs_info->nodesize; 520 reloc_reserved = true; 521 } 522 523 ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, 524 num_bytes, flush); 525 if (ret) 526 goto reserve_fail; 527 } 528 again: 529 h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); 530 if (!h) { 531 ret = -ENOMEM; 532 goto alloc_fail; 533 } 534 535 /* 536 * If we are JOIN_NOLOCK we're already committing a transaction and 537 * waiting on this guy, so we don't need to do the sb_start_intwrite 538 * because we're already holding a ref. We need this because we could 539 * have raced in and did an fsync() on a file which can kick a commit 540 * and then we deadlock with somebody doing a freeze. 541 * 542 * If we are ATTACH, it means we just want to catch the current 543 * transaction and commit it, so we needn't do sb_start_intwrite(). 
544 */ 545 if (type & __TRANS_FREEZABLE) 546 sb_start_intwrite(fs_info->sb); 547 548 if (may_wait_transaction(fs_info, type)) 549 wait_current_trans(fs_info); 550 551 do { 552 ret = join_transaction(fs_info, type); 553 if (ret == -EBUSY) { 554 wait_current_trans(fs_info); 555 if (unlikely(type == TRANS_ATTACH)) 556 ret = -ENOENT; 557 } 558 } while (ret == -EBUSY); 559 560 if (ret < 0) 561 goto join_fail; 562 563 cur_trans = fs_info->running_transaction; 564 565 h->transid = cur_trans->transid; 566 h->transaction = cur_trans; 567 h->root = root; 568 h->use_count = 1; 569 h->fs_info = root->fs_info; 570 571 h->type = type; 572 h->can_flush_pending_bgs = true; 573 INIT_LIST_HEAD(&h->qgroup_ref_list); 574 INIT_LIST_HEAD(&h->new_bgs); 575 576 smp_mb(); 577 if (cur_trans->state >= TRANS_STATE_BLOCKED && 578 may_wait_transaction(fs_info, type)) { 579 current->journal_info = h; 580 btrfs_commit_transaction(h); 581 goto again; 582 } 583 584 if (num_bytes) { 585 trace_btrfs_space_reservation(fs_info, "transaction", 586 h->transid, num_bytes, 1); 587 h->block_rsv = &fs_info->trans_block_rsv; 588 h->bytes_reserved = num_bytes; 589 h->reloc_reserved = reloc_reserved; 590 } 591 592 got_it: 593 btrfs_record_root_in_trans(h, root); 594 595 if (!current->journal_info && type != TRANS_USERSPACE) 596 current->journal_info = h; 597 return h; 598 599 join_fail: 600 if (type & __TRANS_FREEZABLE) 601 sb_end_intwrite(fs_info->sb); 602 kmem_cache_free(btrfs_trans_handle_cachep, h); 603 alloc_fail: 604 if (num_bytes) 605 btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, 606 num_bytes); 607 reserve_fail: 608 btrfs_qgroup_free_meta(root, qgroup_reserved); 609 return ERR_PTR(ret); 610 } 611 612 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 613 unsigned int num_items) 614 { 615 return start_transaction(root, num_items, TRANS_START, 616 BTRFS_RESERVE_FLUSH_ALL); 617 } 618 struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( 619 struct btrfs_root *root, 620 unsigned int num_items, 621 int min_factor) 622 { 623 struct btrfs_fs_info *fs_info = root->fs_info; 624 struct btrfs_trans_handle *trans; 625 u64 num_bytes; 626 int ret; 627 628 trans = btrfs_start_transaction(root, num_items); 629 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 630 return trans; 631 632 trans = btrfs_start_transaction(root, 0); 633 if (IS_ERR(trans)) 634 return trans; 635 636 num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); 637 ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, 638 num_bytes, min_factor); 639 if (ret) { 640 btrfs_end_transaction(trans); 641 return ERR_PTR(ret); 642 } 643 644 trans->block_rsv = &fs_info->trans_block_rsv; 645 trans->bytes_reserved = num_bytes; 646 trace_btrfs_space_reservation(fs_info, "transaction", 647 trans->transid, num_bytes, 1); 648 649 return trans; 650 } 651 652 struct btrfs_trans_handle *btrfs_start_transaction_lflush( 653 struct btrfs_root *root, 654 unsigned int num_items) 655 { 656 return start_transaction(root, num_items, TRANS_START, 657 BTRFS_RESERVE_FLUSH_LIMIT); 658 } 659 660 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 661 { 662 return start_transaction(root, 0, TRANS_JOIN, 663 BTRFS_RESERVE_NO_FLUSH); 664 } 665 666 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) 667 { 668 return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 669 BTRFS_RESERVE_NO_FLUSH); 670 } 671 672 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) 
673 { 674 return start_transaction(root, 0, TRANS_USERSPACE, 675 BTRFS_RESERVE_NO_FLUSH); 676 } 677 678 /* 679 * btrfs_attach_transaction() - catch the running transaction 680 * 681 * It is used when we want to commit the current the transaction, but 682 * don't want to start a new one. 683 * 684 * Note: If this function return -ENOENT, it just means there is no 685 * running transaction. But it is possible that the inactive transaction 686 * is still in the memory, not fully on disk. If you hope there is no 687 * inactive transaction in the fs when -ENOENT is returned, you should 688 * invoke 689 * btrfs_attach_transaction_barrier() 690 */ 691 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) 692 { 693 return start_transaction(root, 0, TRANS_ATTACH, 694 BTRFS_RESERVE_NO_FLUSH); 695 } 696 697 /* 698 * btrfs_attach_transaction_barrier() - catch the running transaction 699 * 700 * It is similar to the above function, the differentia is this one 701 * will wait for all the inactive transactions until they fully 702 * complete. 703 */ 704 struct btrfs_trans_handle * 705 btrfs_attach_transaction_barrier(struct btrfs_root *root) 706 { 707 struct btrfs_trans_handle *trans; 708 709 trans = start_transaction(root, 0, TRANS_ATTACH, 710 BTRFS_RESERVE_NO_FLUSH); 711 if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) 712 btrfs_wait_for_commit(root->fs_info, 0); 713 714 return trans; 715 } 716 717 /* wait for a transaction commit to be fully complete */ 718 static noinline void wait_for_commit(struct btrfs_transaction *commit) 719 { 720 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); 721 } 722 723 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) 724 { 725 struct btrfs_transaction *cur_trans = NULL, *t; 726 int ret = 0; 727 728 if (transid) { 729 if (transid <= fs_info->last_trans_committed) 730 goto out; 731 732 /* find specified transaction */ 733 spin_lock(&fs_info->trans_lock); 734 list_for_each_entry(t, &fs_info->trans_list, list) { 735 if (t->transid == transid) { 736 cur_trans = t; 737 atomic_inc(&cur_trans->use_count); 738 ret = 0; 739 break; 740 } 741 if (t->transid > transid) { 742 ret = 0; 743 break; 744 } 745 } 746 spin_unlock(&fs_info->trans_lock); 747 748 /* 749 * The specified transaction doesn't exist, or we 750 * raced with btrfs_commit_transaction 751 */ 752 if (!cur_trans) { 753 if (transid > fs_info->last_trans_committed) 754 ret = -EINVAL; 755 goto out; 756 } 757 } else { 758 /* find newest transaction that is committing | committed */ 759 spin_lock(&fs_info->trans_lock); 760 list_for_each_entry_reverse(t, &fs_info->trans_list, 761 list) { 762 if (t->state >= TRANS_STATE_COMMIT_START) { 763 if (t->state == TRANS_STATE_COMPLETED) 764 break; 765 cur_trans = t; 766 atomic_inc(&cur_trans->use_count); 767 break; 768 } 769 } 770 spin_unlock(&fs_info->trans_lock); 771 if (!cur_trans) 772 goto out; /* nothing committing|committed */ 773 } 774 775 wait_for_commit(cur_trans); 776 btrfs_put_transaction(cur_trans); 777 out: 778 return ret; 779 } 780 781 void btrfs_throttle(struct btrfs_fs_info *fs_info) 782 { 783 if (!atomic_read(&fs_info->open_ioctl_trans)) 784 wait_current_trans(fs_info); 785 } 786 787 static int should_end_transaction(struct btrfs_trans_handle *trans) 788 { 789 struct btrfs_fs_info *fs_info = trans->fs_info; 790 791 if (fs_info->global_block_rsv.space_info->full && 792 btrfs_check_space_for_delayed_refs(trans, fs_info)) 793 return 1; 794 795 return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); 796 
} 797 798 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) 799 { 800 struct btrfs_transaction *cur_trans = trans->transaction; 801 struct btrfs_fs_info *fs_info = trans->fs_info; 802 int updates; 803 int err; 804 805 smp_mb(); 806 if (cur_trans->state >= TRANS_STATE_BLOCKED || 807 cur_trans->delayed_refs.flushing) 808 return 1; 809 810 updates = trans->delayed_ref_updates; 811 trans->delayed_ref_updates = 0; 812 if (updates) { 813 err = btrfs_run_delayed_refs(trans, fs_info, updates * 2); 814 if (err) /* Error code will also eval true */ 815 return err; 816 } 817 818 return should_end_transaction(trans); 819 } 820 821 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 822 int throttle) 823 { 824 struct btrfs_fs_info *info = trans->fs_info; 825 struct btrfs_transaction *cur_trans = trans->transaction; 826 u64 transid = trans->transid; 827 unsigned long cur = trans->delayed_ref_updates; 828 int lock = (trans->type != TRANS_JOIN_NOLOCK); 829 int err = 0; 830 int must_run_delayed_refs = 0; 831 832 if (trans->use_count > 1) { 833 trans->use_count--; 834 trans->block_rsv = trans->orig_rsv; 835 return 0; 836 } 837 838 btrfs_trans_release_metadata(trans, info); 839 trans->block_rsv = NULL; 840 841 if (!list_empty(&trans->new_bgs)) 842 btrfs_create_pending_block_groups(trans, info); 843 844 trans->delayed_ref_updates = 0; 845 if (!trans->sync) { 846 must_run_delayed_refs = 847 btrfs_should_throttle_delayed_refs(trans, info); 848 cur = max_t(unsigned long, cur, 32); 849 850 /* 851 * don't make the caller wait if they are from a NOLOCK 852 * or ATTACH transaction, it will deadlock with commit 853 */ 854 if (must_run_delayed_refs == 1 && 855 (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) 856 must_run_delayed_refs = 2; 857 } 858 859 btrfs_trans_release_metadata(trans, info); 860 trans->block_rsv = NULL; 861 862 if (!list_empty(&trans->new_bgs)) 863 btrfs_create_pending_block_groups(trans, info); 864 865 btrfs_trans_release_chunk_metadata(trans); 866 867 if (lock && !atomic_read(&info->open_ioctl_trans) && 868 should_end_transaction(trans) && 869 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { 870 spin_lock(&info->trans_lock); 871 if (cur_trans->state == TRANS_STATE_RUNNING) 872 cur_trans->state = TRANS_STATE_BLOCKED; 873 spin_unlock(&info->trans_lock); 874 } 875 876 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 877 if (throttle) 878 return btrfs_commit_transaction(trans); 879 else 880 wake_up_process(info->transaction_kthread); 881 } 882 883 if (trans->type & __TRANS_FREEZABLE) 884 sb_end_intwrite(info->sb); 885 886 WARN_ON(cur_trans != info->running_transaction); 887 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 888 atomic_dec(&cur_trans->num_writers); 889 extwriter_counter_dec(cur_trans, trans->type); 890 891 /* 892 * Make sure counter is updated before we wake up waiters. 
893 */ 894 smp_mb(); 895 if (waitqueue_active(&cur_trans->writer_wait)) 896 wake_up(&cur_trans->writer_wait); 897 btrfs_put_transaction(cur_trans); 898 899 if (current->journal_info == trans) 900 current->journal_info = NULL; 901 902 if (throttle) 903 btrfs_run_delayed_iputs(info); 904 905 if (trans->aborted || 906 test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { 907 wake_up_process(info->transaction_kthread); 908 err = -EIO; 909 } 910 assert_qgroups_uptodate(trans); 911 912 kmem_cache_free(btrfs_trans_handle_cachep, trans); 913 if (must_run_delayed_refs) { 914 btrfs_async_run_delayed_refs(info, cur, transid, 915 must_run_delayed_refs == 1); 916 } 917 return err; 918 } 919 920 int btrfs_end_transaction(struct btrfs_trans_handle *trans) 921 { 922 return __btrfs_end_transaction(trans, 0); 923 } 924 925 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) 926 { 927 return __btrfs_end_transaction(trans, 1); 928 } 929 930 /* 931 * when btree blocks are allocated, they have some corresponding bits set for 932 * them in one of two extent_io trees. This is used to make sure all of 933 * those extents are sent to disk but does not wait on them 934 */ 935 int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, 936 struct extent_io_tree *dirty_pages, int mark) 937 { 938 int err = 0; 939 int werr = 0; 940 struct address_space *mapping = fs_info->btree_inode->i_mapping; 941 struct extent_state *cached_state = NULL; 942 u64 start = 0; 943 u64 end; 944 945 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 946 mark, &cached_state)) { 947 bool wait_writeback = false; 948 949 err = convert_extent_bit(dirty_pages, start, end, 950 EXTENT_NEED_WAIT, 951 mark, &cached_state); 952 /* 953 * convert_extent_bit can return -ENOMEM, which is most of the 954 * time a temporary error. So when it happens, ignore the error 955 * and wait for writeback of this range to finish - because we 956 * failed to set the bit EXTENT_NEED_WAIT for the range, a call 957 * to __btrfs_wait_marked_extents() would not know that 958 * writeback for this range started and therefore wouldn't 959 * wait for it to finish - we don't want to commit a 960 * superblock that points to btree nodes/leafs for which 961 * writeback hasn't finished yet (and without errors). 962 * We cleanup any entries left in the io tree when committing 963 * the transaction (through clear_btree_io_tree()). 964 */ 965 if (err == -ENOMEM) { 966 err = 0; 967 wait_writeback = true; 968 } 969 if (!err) 970 err = filemap_fdatawrite_range(mapping, start, end); 971 if (err) 972 werr = err; 973 else if (wait_writeback) 974 werr = filemap_fdatawait_range(mapping, start, end); 975 free_extent_state(cached_state); 976 cached_state = NULL; 977 cond_resched(); 978 start = end + 1; 979 } 980 return werr; 981 } 982 983 /* 984 * when btree blocks are allocated, they have some corresponding bits set for 985 * them in one of two extent_io trees. This is used to make sure all of 986 * those extents are on disk for transaction or log commit. 
We wait 987 * on all the pages and clear them from the dirty pages state tree 988 */ 989 static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, 990 struct extent_io_tree *dirty_pages) 991 { 992 int err = 0; 993 int werr = 0; 994 struct address_space *mapping = fs_info->btree_inode->i_mapping; 995 struct extent_state *cached_state = NULL; 996 u64 start = 0; 997 u64 end; 998 999 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 1000 EXTENT_NEED_WAIT, &cached_state)) { 1001 /* 1002 * Ignore -ENOMEM errors returned by clear_extent_bit(). 1003 * When committing the transaction, we'll remove any entries 1004 * left in the io tree. For a log commit, we don't remove them 1005 * after committing the log because the tree can be accessed 1006 * concurrently - we do it only at transaction commit time when 1007 * it's safe to do it (through clear_btree_io_tree()). 1008 */ 1009 err = clear_extent_bit(dirty_pages, start, end, 1010 EXTENT_NEED_WAIT, 1011 0, 0, &cached_state, GFP_NOFS); 1012 if (err == -ENOMEM) 1013 err = 0; 1014 if (!err) 1015 err = filemap_fdatawait_range(mapping, start, end); 1016 if (err) 1017 werr = err; 1018 free_extent_state(cached_state); 1019 cached_state = NULL; 1020 cond_resched(); 1021 start = end + 1; 1022 } 1023 if (err) 1024 werr = err; 1025 return werr; 1026 } 1027 1028 int btrfs_wait_extents(struct btrfs_fs_info *fs_info, 1029 struct extent_io_tree *dirty_pages) 1030 { 1031 bool errors = false; 1032 int err; 1033 1034 err = __btrfs_wait_marked_extents(fs_info, dirty_pages); 1035 if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) 1036 errors = true; 1037 1038 if (errors && !err) 1039 err = -EIO; 1040 return err; 1041 } 1042 1043 int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) 1044 { 1045 struct btrfs_fs_info *fs_info = log_root->fs_info; 1046 struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; 1047 bool errors = false; 1048 int err; 1049 1050 ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 1051 1052 err = __btrfs_wait_marked_extents(fs_info, dirty_pages); 1053 if ((mark & EXTENT_DIRTY) && 1054 test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) 1055 errors = true; 1056 1057 if ((mark & EXTENT_NEW) && 1058 test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) 1059 errors = true; 1060 1061 if (errors && !err) 1062 err = -EIO; 1063 return err; 1064 } 1065 1066 /* 1067 * when btree blocks are allocated, they have some corresponding bits set for 1068 * them in one of two extent_io trees. This is used to make sure all of 1069 * those extents are on disk for transaction or log commit 1070 */ 1071 static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info, 1072 struct extent_io_tree *dirty_pages, int mark) 1073 { 1074 int ret; 1075 int ret2; 1076 struct blk_plug plug; 1077 1078 blk_start_plug(&plug); 1079 ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark); 1080 blk_finish_plug(&plug); 1081 ret2 = btrfs_wait_extents(fs_info, dirty_pages); 1082 1083 if (ret) 1084 return ret; 1085 if (ret2) 1086 return ret2; 1087 return 0; 1088 } 1089 1090 static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 1091 struct btrfs_fs_info *fs_info) 1092 { 1093 int ret; 1094 1095 ret = btrfs_write_and_wait_marked_extents(fs_info, 1096 &trans->transaction->dirty_pages, 1097 EXTENT_DIRTY); 1098 clear_btree_io_tree(&trans->transaction->dirty_pages); 1099 1100 return ret; 1101 } 1102 1103 /* 1104 * this is used to update the root pointer in the tree of tree roots. 
1105 * 1106 * But, in the case of the extent allocation tree, updating the root 1107 * pointer may allocate blocks which may change the root of the extent 1108 * allocation tree. 1109 * 1110 * So, this loops and repeats and makes sure the cowonly root didn't 1111 * change while the root pointer was being updated in the metadata. 1112 */ 1113 static int update_cowonly_root(struct btrfs_trans_handle *trans, 1114 struct btrfs_root *root) 1115 { 1116 int ret; 1117 u64 old_root_bytenr; 1118 u64 old_root_used; 1119 struct btrfs_fs_info *fs_info = root->fs_info; 1120 struct btrfs_root *tree_root = fs_info->tree_root; 1121 1122 old_root_used = btrfs_root_used(&root->root_item); 1123 1124 while (1) { 1125 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 1126 if (old_root_bytenr == root->node->start && 1127 old_root_used == btrfs_root_used(&root->root_item)) 1128 break; 1129 1130 btrfs_set_root_node(&root->root_item, root->node); 1131 ret = btrfs_update_root(trans, tree_root, 1132 &root->root_key, 1133 &root->root_item); 1134 if (ret) 1135 return ret; 1136 1137 old_root_used = btrfs_root_used(&root->root_item); 1138 } 1139 1140 return 0; 1141 } 1142 1143 /* 1144 * update all the cowonly tree roots on disk 1145 * 1146 * The error handling in this function may not be obvious. Any of the 1147 * failures will cause the file system to go offline. We still need 1148 * to clean up the delayed refs. 1149 */ 1150 static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, 1151 struct btrfs_fs_info *fs_info) 1152 { 1153 struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; 1154 struct list_head *io_bgs = &trans->transaction->io_bgs; 1155 struct list_head *next; 1156 struct extent_buffer *eb; 1157 int ret; 1158 1159 eb = btrfs_lock_root_node(fs_info->tree_root); 1160 ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 1161 0, &eb); 1162 btrfs_tree_unlock(eb); 1163 free_extent_buffer(eb); 1164 1165 if (ret) 1166 return ret; 1167 1168 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1169 if (ret) 1170 return ret; 1171 1172 ret = btrfs_run_dev_stats(trans, fs_info); 1173 if (ret) 1174 return ret; 1175 ret = btrfs_run_dev_replace(trans, fs_info); 1176 if (ret) 1177 return ret; 1178 ret = btrfs_run_qgroups(trans, fs_info); 1179 if (ret) 1180 return ret; 1181 1182 ret = btrfs_setup_space_cache(trans, fs_info); 1183 if (ret) 1184 return ret; 1185 1186 /* run_qgroups might have added some more refs */ 1187 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1188 if (ret) 1189 return ret; 1190 again: 1191 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 1192 struct btrfs_root *root; 1193 next = fs_info->dirty_cowonly_roots.next; 1194 list_del_init(next); 1195 root = list_entry(next, struct btrfs_root, dirty_list); 1196 clear_bit(BTRFS_ROOT_DIRTY, &root->state); 1197 1198 if (root != fs_info->extent_root) 1199 list_add_tail(&root->dirty_list, 1200 &trans->transaction->switch_commits); 1201 ret = update_cowonly_root(trans, root); 1202 if (ret) 1203 return ret; 1204 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1205 if (ret) 1206 return ret; 1207 } 1208 1209 while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { 1210 ret = btrfs_write_dirty_block_groups(trans, fs_info); 1211 if (ret) 1212 return ret; 1213 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1214 if (ret) 1215 return ret; 1216 } 1217 1218 if (!list_empty(&fs_info->dirty_cowonly_roots)) 1219 goto again; 1220 1221 
list_add_tail(&fs_info->extent_root->dirty_list, 1222 &trans->transaction->switch_commits); 1223 btrfs_after_dev_replace_commit(fs_info); 1224 1225 return 0; 1226 } 1227 1228 /* 1229 * dead roots are old snapshots that need to be deleted. This allocates 1230 * a dirty root struct and adds it into the list of dead roots that need to 1231 * be deleted 1232 */ 1233 void btrfs_add_dead_root(struct btrfs_root *root) 1234 { 1235 struct btrfs_fs_info *fs_info = root->fs_info; 1236 1237 spin_lock(&fs_info->trans_lock); 1238 if (list_empty(&root->root_list)) 1239 list_add_tail(&root->root_list, &fs_info->dead_roots); 1240 spin_unlock(&fs_info->trans_lock); 1241 } 1242 1243 /* 1244 * update all the cowonly tree roots on disk 1245 */ 1246 static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, 1247 struct btrfs_fs_info *fs_info) 1248 { 1249 struct btrfs_root *gang[8]; 1250 int i; 1251 int ret; 1252 int err = 0; 1253 1254 spin_lock(&fs_info->fs_roots_radix_lock); 1255 while (1) { 1256 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, 1257 (void **)gang, 0, 1258 ARRAY_SIZE(gang), 1259 BTRFS_ROOT_TRANS_TAG); 1260 if (ret == 0) 1261 break; 1262 for (i = 0; i < ret; i++) { 1263 struct btrfs_root *root = gang[i]; 1264 radix_tree_tag_clear(&fs_info->fs_roots_radix, 1265 (unsigned long)root->root_key.objectid, 1266 BTRFS_ROOT_TRANS_TAG); 1267 spin_unlock(&fs_info->fs_roots_radix_lock); 1268 1269 btrfs_free_log(trans, root); 1270 btrfs_update_reloc_root(trans, root); 1271 btrfs_orphan_commit_root(trans, root); 1272 1273 btrfs_save_ino_cache(root, trans); 1274 1275 /* see comments in should_cow_block() */ 1276 clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1277 smp_mb__after_atomic(); 1278 1279 if (root->commit_root != root->node) { 1280 list_add_tail(&root->dirty_list, 1281 &trans->transaction->switch_commits); 1282 btrfs_set_root_node(&root->root_item, 1283 root->node); 1284 } 1285 1286 err = btrfs_update_root(trans, fs_info->tree_root, 1287 &root->root_key, 1288 &root->root_item); 1289 spin_lock(&fs_info->fs_roots_radix_lock); 1290 if (err) 1291 break; 1292 btrfs_qgroup_free_meta_all(root); 1293 } 1294 } 1295 spin_unlock(&fs_info->fs_roots_radix_lock); 1296 return err; 1297 } 1298 1299 /* 1300 * defrag a given btree. 1301 * Every leaf in the btree is read and defragged. 1302 */ 1303 int btrfs_defrag_root(struct btrfs_root *root) 1304 { 1305 struct btrfs_fs_info *info = root->fs_info; 1306 struct btrfs_trans_handle *trans; 1307 int ret; 1308 1309 if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) 1310 return 0; 1311 1312 while (1) { 1313 trans = btrfs_start_transaction(root, 0); 1314 if (IS_ERR(trans)) 1315 return PTR_ERR(trans); 1316 1317 ret = btrfs_defrag_leaves(trans, root); 1318 1319 btrfs_end_transaction(trans); 1320 btrfs_btree_balance_dirty(info); 1321 cond_resched(); 1322 1323 if (btrfs_fs_closing(info) || ret != -EAGAIN) 1324 break; 1325 1326 if (btrfs_defrag_cancelled(info)) { 1327 btrfs_debug(info, "defrag_root cancelled"); 1328 ret = -EAGAIN; 1329 break; 1330 } 1331 } 1332 clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); 1333 return ret; 1334 } 1335 1336 /* 1337 * Do all special snapshot related qgroup dirty hack. 1338 * 1339 * Will do all needed qgroup inherit and dirty hack like switch commit 1340 * roots inside one transaction and write all btree into disk, to make 1341 * qgroup works. 
1342 */ 1343 static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, 1344 struct btrfs_root *src, 1345 struct btrfs_root *parent, 1346 struct btrfs_qgroup_inherit *inherit, 1347 u64 dst_objectid) 1348 { 1349 struct btrfs_fs_info *fs_info = src->fs_info; 1350 int ret; 1351 1352 /* 1353 * Save some performance in the case that qgroups are not 1354 * enabled. If this check races with the ioctl, rescan will 1355 * kick in anyway. 1356 */ 1357 mutex_lock(&fs_info->qgroup_ioctl_lock); 1358 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 1359 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1360 return 0; 1361 } 1362 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1363 1364 /* 1365 * We are going to commit transaction, see btrfs_commit_transaction() 1366 * comment for reason locking tree_log_mutex 1367 */ 1368 mutex_lock(&fs_info->tree_log_mutex); 1369 1370 ret = commit_fs_roots(trans, fs_info); 1371 if (ret) 1372 goto out; 1373 ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); 1374 if (ret < 0) 1375 goto out; 1376 ret = btrfs_qgroup_account_extents(trans, fs_info); 1377 if (ret < 0) 1378 goto out; 1379 1380 /* Now qgroup are all updated, we can inherit it to new qgroups */ 1381 ret = btrfs_qgroup_inherit(trans, fs_info, 1382 src->root_key.objectid, dst_objectid, 1383 inherit); 1384 if (ret < 0) 1385 goto out; 1386 1387 /* 1388 * Now we do a simplified commit transaction, which will: 1389 * 1) commit all subvolume and extent tree 1390 * To ensure all subvolume and extent tree have a valid 1391 * commit_root to accounting later insert_dir_item() 1392 * 2) write all btree blocks onto disk 1393 * This is to make sure later btree modification will be cowed 1394 * Or commit_root can be populated and cause wrong qgroup numbers 1395 * In this simplified commit, we don't really care about other trees 1396 * like chunk and root tree, as they won't affect qgroup. 1397 * And we don't write super to avoid half committed status. 1398 */ 1399 ret = commit_cowonly_roots(trans, fs_info); 1400 if (ret) 1401 goto out; 1402 switch_commit_roots(trans->transaction, fs_info); 1403 ret = btrfs_write_and_wait_transaction(trans, fs_info); 1404 if (ret) 1405 btrfs_handle_fs_error(fs_info, ret, 1406 "Error while writing out transaction for qgroup"); 1407 1408 out: 1409 mutex_unlock(&fs_info->tree_log_mutex); 1410 1411 /* 1412 * Force parent root to be updated, as we recorded it before so its 1413 * last_trans == cur_transid. 1414 * Or it won't be committed again onto disk after later 1415 * insert_dir_item() 1416 */ 1417 if (!ret) 1418 record_root_in_trans(trans, parent, 1); 1419 return ret; 1420 } 1421 1422 /* 1423 * new snapshots need to be created at a very specific time in the 1424 * transaction commit. This does the actual creation. 1425 * 1426 * Note: 1427 * If the error which may affect the commitment of the current transaction 1428 * happens, we should return the error number. If the error which just affect 1429 * the creation of the pending snapshots, just return 0. 
1430 */ 1431 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, 1432 struct btrfs_fs_info *fs_info, 1433 struct btrfs_pending_snapshot *pending) 1434 { 1435 struct btrfs_key key; 1436 struct btrfs_root_item *new_root_item; 1437 struct btrfs_root *tree_root = fs_info->tree_root; 1438 struct btrfs_root *root = pending->root; 1439 struct btrfs_root *parent_root; 1440 struct btrfs_block_rsv *rsv; 1441 struct inode *parent_inode; 1442 struct btrfs_path *path; 1443 struct btrfs_dir_item *dir_item; 1444 struct dentry *dentry; 1445 struct extent_buffer *tmp; 1446 struct extent_buffer *old; 1447 struct timespec cur_time; 1448 int ret = 0; 1449 u64 to_reserve = 0; 1450 u64 index = 0; 1451 u64 objectid; 1452 u64 root_flags; 1453 uuid_le new_uuid; 1454 1455 ASSERT(pending->path); 1456 path = pending->path; 1457 1458 ASSERT(pending->root_item); 1459 new_root_item = pending->root_item; 1460 1461 pending->error = btrfs_find_free_objectid(tree_root, &objectid); 1462 if (pending->error) 1463 goto no_free_objectid; 1464 1465 /* 1466 * Make qgroup to skip current new snapshot's qgroupid, as it is 1467 * accounted by later btrfs_qgroup_inherit(). 1468 */ 1469 btrfs_set_skip_qgroup(trans, objectid); 1470 1471 btrfs_reloc_pre_snapshot(pending, &to_reserve); 1472 1473 if (to_reserve > 0) { 1474 pending->error = btrfs_block_rsv_add(root, 1475 &pending->block_rsv, 1476 to_reserve, 1477 BTRFS_RESERVE_NO_FLUSH); 1478 if (pending->error) 1479 goto clear_skip_qgroup; 1480 } 1481 1482 key.objectid = objectid; 1483 key.offset = (u64)-1; 1484 key.type = BTRFS_ROOT_ITEM_KEY; 1485 1486 rsv = trans->block_rsv; 1487 trans->block_rsv = &pending->block_rsv; 1488 trans->bytes_reserved = trans->block_rsv->reserved; 1489 trace_btrfs_space_reservation(fs_info, "transaction", 1490 trans->transid, 1491 trans->bytes_reserved, 1); 1492 dentry = pending->dentry; 1493 parent_inode = pending->dir; 1494 parent_root = BTRFS_I(parent_inode)->root; 1495 record_root_in_trans(trans, parent_root, 0); 1496 1497 cur_time = current_time(parent_inode); 1498 1499 /* 1500 * insert the directory item 1501 */ 1502 ret = btrfs_set_inode_index(parent_inode, &index); 1503 BUG_ON(ret); /* -ENOMEM */ 1504 1505 /* check if there is a file/dir which has the same name. 
*/ 1506 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, 1507 btrfs_ino(parent_inode), 1508 dentry->d_name.name, 1509 dentry->d_name.len, 0); 1510 if (dir_item != NULL && !IS_ERR(dir_item)) { 1511 pending->error = -EEXIST; 1512 goto dir_item_existed; 1513 } else if (IS_ERR(dir_item)) { 1514 ret = PTR_ERR(dir_item); 1515 btrfs_abort_transaction(trans, ret); 1516 goto fail; 1517 } 1518 btrfs_release_path(path); 1519 1520 /* 1521 * pull in the delayed directory update 1522 * and the delayed inode item 1523 * otherwise we corrupt the FS during 1524 * snapshot 1525 */ 1526 ret = btrfs_run_delayed_items(trans, fs_info); 1527 if (ret) { /* Transaction aborted */ 1528 btrfs_abort_transaction(trans, ret); 1529 goto fail; 1530 } 1531 1532 record_root_in_trans(trans, root, 0); 1533 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 1534 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 1535 btrfs_check_and_init_root_item(new_root_item); 1536 1537 root_flags = btrfs_root_flags(new_root_item); 1538 if (pending->readonly) 1539 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; 1540 else 1541 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; 1542 btrfs_set_root_flags(new_root_item, root_flags); 1543 1544 btrfs_set_root_generation_v2(new_root_item, 1545 trans->transid); 1546 uuid_le_gen(&new_uuid); 1547 memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); 1548 memcpy(new_root_item->parent_uuid, root->root_item.uuid, 1549 BTRFS_UUID_SIZE); 1550 if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { 1551 memset(new_root_item->received_uuid, 0, 1552 sizeof(new_root_item->received_uuid)); 1553 memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); 1554 memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); 1555 btrfs_set_root_stransid(new_root_item, 0); 1556 btrfs_set_root_rtransid(new_root_item, 0); 1557 } 1558 btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); 1559 btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); 1560 btrfs_set_root_otransid(new_root_item, trans->transid); 1561 1562 old = btrfs_lock_root_node(root); 1563 ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); 1564 if (ret) { 1565 btrfs_tree_unlock(old); 1566 free_extent_buffer(old); 1567 btrfs_abort_transaction(trans, ret); 1568 goto fail; 1569 } 1570 1571 btrfs_set_lock_blocking(old); 1572 1573 ret = btrfs_copy_root(trans, root, old, &tmp, objectid); 1574 /* clean up in any case */ 1575 btrfs_tree_unlock(old); 1576 free_extent_buffer(old); 1577 if (ret) { 1578 btrfs_abort_transaction(trans, ret); 1579 goto fail; 1580 } 1581 /* see comments in should_cow_block() */ 1582 set_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1583 smp_wmb(); 1584 1585 btrfs_set_root_node(new_root_item, tmp); 1586 /* record when the snapshot was created in key.offset */ 1587 key.offset = trans->transid; 1588 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); 1589 btrfs_tree_unlock(tmp); 1590 free_extent_buffer(tmp); 1591 if (ret) { 1592 btrfs_abort_transaction(trans, ret); 1593 goto fail; 1594 } 1595 1596 /* 1597 * insert root back/forward references 1598 */ 1599 ret = btrfs_add_root_ref(trans, fs_info, objectid, 1600 parent_root->root_key.objectid, 1601 btrfs_ino(parent_inode), index, 1602 dentry->d_name.name, dentry->d_name.len); 1603 if (ret) { 1604 btrfs_abort_transaction(trans, ret); 1605 goto fail; 1606 } 1607 1608 key.offset = (u64)-1; 1609 pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); 1610 if (IS_ERR(pending->snap)) { 1611 ret = PTR_ERR(pending->snap); 1612 
btrfs_abort_transaction(trans, ret); 1613 goto fail; 1614 } 1615 1616 ret = btrfs_reloc_post_snapshot(trans, pending); 1617 if (ret) { 1618 btrfs_abort_transaction(trans, ret); 1619 goto fail; 1620 } 1621 1622 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1623 if (ret) { 1624 btrfs_abort_transaction(trans, ret); 1625 goto fail; 1626 } 1627 1628 /* 1629 * Do special qgroup accounting for snapshot, as we do some qgroup 1630 * snapshot hack to do fast snapshot. 1631 * To co-operate with that hack, we do hack again. 1632 * Or snapshot will be greatly slowed down by a subtree qgroup rescan 1633 */ 1634 ret = qgroup_account_snapshot(trans, root, parent_root, 1635 pending->inherit, objectid); 1636 if (ret < 0) 1637 goto fail; 1638 1639 ret = btrfs_insert_dir_item(trans, parent_root, 1640 dentry->d_name.name, dentry->d_name.len, 1641 parent_inode, &key, 1642 BTRFS_FT_DIR, index); 1643 /* We have check then name at the beginning, so it is impossible. */ 1644 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); 1645 if (ret) { 1646 btrfs_abort_transaction(trans, ret); 1647 goto fail; 1648 } 1649 1650 btrfs_i_size_write(parent_inode, parent_inode->i_size + 1651 dentry->d_name.len * 2); 1652 parent_inode->i_mtime = parent_inode->i_ctime = 1653 current_time(parent_inode); 1654 ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); 1655 if (ret) { 1656 btrfs_abort_transaction(trans, ret); 1657 goto fail; 1658 } 1659 ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b, 1660 BTRFS_UUID_KEY_SUBVOL, objectid); 1661 if (ret) { 1662 btrfs_abort_transaction(trans, ret); 1663 goto fail; 1664 } 1665 if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { 1666 ret = btrfs_uuid_tree_add(trans, fs_info, 1667 new_root_item->received_uuid, 1668 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 1669 objectid); 1670 if (ret && ret != -EEXIST) { 1671 btrfs_abort_transaction(trans, ret); 1672 goto fail; 1673 } 1674 } 1675 1676 ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); 1677 if (ret) { 1678 btrfs_abort_transaction(trans, ret); 1679 goto fail; 1680 } 1681 1682 fail: 1683 pending->error = ret; 1684 dir_item_existed: 1685 trans->block_rsv = rsv; 1686 trans->bytes_reserved = 0; 1687 clear_skip_qgroup: 1688 btrfs_clear_skip_qgroup(trans); 1689 no_free_objectid: 1690 kfree(new_root_item); 1691 pending->root_item = NULL; 1692 btrfs_free_path(path); 1693 pending->path = NULL; 1694 1695 return ret; 1696 } 1697 1698 /* 1699 * create all the snapshots we've scheduled for creation 1700 */ 1701 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, 1702 struct btrfs_fs_info *fs_info) 1703 { 1704 struct btrfs_pending_snapshot *pending, *next; 1705 struct list_head *head = &trans->transaction->pending_snapshots; 1706 int ret = 0; 1707 1708 list_for_each_entry_safe(pending, next, head, list) { 1709 list_del(&pending->list); 1710 ret = create_pending_snapshot(trans, fs_info, pending); 1711 if (ret) 1712 break; 1713 } 1714 return ret; 1715 } 1716 1717 static void update_super_roots(struct btrfs_fs_info *fs_info) 1718 { 1719 struct btrfs_root_item *root_item; 1720 struct btrfs_super_block *super; 1721 1722 super = fs_info->super_copy; 1723 1724 root_item = &fs_info->chunk_root->root_item; 1725 super->chunk_root = root_item->bytenr; 1726 super->chunk_root_generation = root_item->generation; 1727 super->chunk_root_level = root_item->level; 1728 1729 root_item = &fs_info->tree_root->root_item; 1730 super->root = root_item->bytenr; 1731 super->generation = root_item->generation; 1732 
super->root_level = root_item->level; 1733 if (btrfs_test_opt(fs_info, SPACE_CACHE)) 1734 super->cache_generation = root_item->generation; 1735 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) 1736 super->uuid_tree_generation = root_item->generation; 1737 } 1738 1739 int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1740 { 1741 struct btrfs_transaction *trans; 1742 int ret = 0; 1743 1744 spin_lock(&info->trans_lock); 1745 trans = info->running_transaction; 1746 if (trans) 1747 ret = (trans->state >= TRANS_STATE_COMMIT_START); 1748 spin_unlock(&info->trans_lock); 1749 return ret; 1750 } 1751 1752 int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1753 { 1754 struct btrfs_transaction *trans; 1755 int ret = 0; 1756 1757 spin_lock(&info->trans_lock); 1758 trans = info->running_transaction; 1759 if (trans) 1760 ret = is_transaction_blocked(trans); 1761 spin_unlock(&info->trans_lock); 1762 return ret; 1763 } 1764 1765 /* 1766 * wait for the current transaction commit to start and block subsequent 1767 * transaction joins 1768 */ 1769 static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, 1770 struct btrfs_transaction *trans) 1771 { 1772 wait_event(fs_info->transaction_blocked_wait, 1773 trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); 1774 } 1775 1776 /* 1777 * wait for the current transaction to start and then become unblocked. 1778 * caller holds ref. 1779 */ 1780 static void wait_current_trans_commit_start_and_unblock( 1781 struct btrfs_fs_info *fs_info, 1782 struct btrfs_transaction *trans) 1783 { 1784 wait_event(fs_info->transaction_wait, 1785 trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); 1786 } 1787 1788 /* 1789 * commit transactions asynchronously. once btrfs_commit_transaction_async 1790 * returns, any subsequent transaction will not be allowed to join. 1791 */ 1792 struct btrfs_async_commit { 1793 struct btrfs_trans_handle *newtrans; 1794 struct work_struct work; 1795 }; 1796 1797 static void do_async_commit(struct work_struct *work) 1798 { 1799 struct btrfs_async_commit *ac = 1800 container_of(work, struct btrfs_async_commit, work); 1801 1802 /* 1803 * We've got freeze protection passed with the transaction. 1804 * Tell lockdep about it. 1805 */ 1806 if (ac->newtrans->type & __TRANS_FREEZABLE) 1807 __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); 1808 1809 current->journal_info = ac->newtrans; 1810 1811 btrfs_commit_transaction(ac->newtrans); 1812 kfree(ac); 1813 } 1814 1815 int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, 1816 int wait_for_unblock) 1817 { 1818 struct btrfs_fs_info *fs_info = trans->fs_info; 1819 struct btrfs_async_commit *ac; 1820 struct btrfs_transaction *cur_trans; 1821 1822 ac = kmalloc(sizeof(*ac), GFP_NOFS); 1823 if (!ac) 1824 return -ENOMEM; 1825 1826 INIT_WORK(&ac->work, do_async_commit); 1827 ac->newtrans = btrfs_join_transaction(trans->root); 1828 if (IS_ERR(ac->newtrans)) { 1829 int err = PTR_ERR(ac->newtrans); 1830 kfree(ac); 1831 return err; 1832 } 1833 1834 /* take transaction reference */ 1835 cur_trans = trans->transaction; 1836 atomic_inc(&cur_trans->use_count); 1837 1838 btrfs_end_transaction(trans); 1839 1840 /* 1841 * Tell lockdep we've released the freeze rwsem, since the 1842 * async commit thread will be the one to unlock it. 
1843 */ 1844 if (ac->newtrans->type & __TRANS_FREEZABLE) 1845 __sb_writers_release(fs_info->sb, SB_FREEZE_FS); 1846 1847 schedule_work(&ac->work); 1848 1849 /* wait for transaction to start and unblock */ 1850 if (wait_for_unblock) 1851 wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); 1852 else 1853 wait_current_trans_commit_start(fs_info, cur_trans); 1854 1855 if (current->journal_info == trans) 1856 current->journal_info = NULL; 1857 1858 btrfs_put_transaction(cur_trans); 1859 return 0; 1860 } 1861 1862 1863 static void cleanup_transaction(struct btrfs_trans_handle *trans, 1864 struct btrfs_root *root, int err) 1865 { 1866 struct btrfs_fs_info *fs_info = root->fs_info; 1867 struct btrfs_transaction *cur_trans = trans->transaction; 1868 DEFINE_WAIT(wait); 1869 1870 WARN_ON(trans->use_count > 1); 1871 1872 btrfs_abort_transaction(trans, err); 1873 1874 spin_lock(&fs_info->trans_lock); 1875 1876 /* 1877 * If the transaction is removed from the list, it means this 1878 * transaction has been committed successfully, so it is impossible 1879 * to call the cleanup function. 1880 */ 1881 BUG_ON(list_empty(&cur_trans->list)); 1882 1883 list_del_init(&cur_trans->list); 1884 if (cur_trans == fs_info->running_transaction) { 1885 cur_trans->state = TRANS_STATE_COMMIT_DOING; 1886 spin_unlock(&fs_info->trans_lock); 1887 wait_event(cur_trans->writer_wait, 1888 atomic_read(&cur_trans->num_writers) == 1); 1889 1890 spin_lock(&fs_info->trans_lock); 1891 } 1892 spin_unlock(&fs_info->trans_lock); 1893 1894 btrfs_cleanup_one_transaction(trans->transaction, fs_info); 1895 1896 spin_lock(&fs_info->trans_lock); 1897 if (cur_trans == fs_info->running_transaction) 1898 fs_info->running_transaction = NULL; 1899 spin_unlock(&fs_info->trans_lock); 1900 1901 if (trans->type & __TRANS_FREEZABLE) 1902 sb_end_intwrite(fs_info->sb); 1903 btrfs_put_transaction(cur_trans); 1904 btrfs_put_transaction(cur_trans); 1905 1906 trace_btrfs_transaction_commit(root); 1907 1908 if (current->journal_info == trans) 1909 current->journal_info = NULL; 1910 btrfs_scrub_cancel(fs_info); 1911 1912 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1913 } 1914 1915 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1916 { 1917 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1918 return btrfs_start_delalloc_roots(fs_info, 1, -1); 1919 return 0; 1920 } 1921 1922 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) 1923 { 1924 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1925 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); 1926 } 1927 1928 static inline void 1929 btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) 1930 { 1931 wait_event(cur_trans->pending_wait, 1932 atomic_read(&cur_trans->pending_ordered) == 0); 1933 } 1934 1935 int btrfs_commit_transaction(struct btrfs_trans_handle *trans) 1936 { 1937 struct btrfs_fs_info *fs_info = trans->fs_info; 1938 struct btrfs_transaction *cur_trans = trans->transaction; 1939 struct btrfs_transaction *prev_trans = NULL; 1940 int ret; 1941 1942 /* Stop the commit early if ->aborted is set */ 1943 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1944 ret = cur_trans->aborted; 1945 btrfs_end_transaction(trans); 1946 return ret; 1947 } 1948 1949 /* make a pass through all the delayed refs we have so far 1950 * any runnings procs may add more while we are here 1951 */ 1952 ret = btrfs_run_delayed_refs(trans, fs_info, 0); 1953 if (ret) { 1954 btrfs_end_transaction(trans); 1955 return ret; 1956 } 1957 1958 
	btrfs_trans_release_metadata(trans, fs_info);
	trans->block_rsv = NULL;

	cur_trans = trans->transaction;

	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;
	smp_wmb();

	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, fs_info);

	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
	if (ret) {
		btrfs_end_transaction(trans);
		return ret;
	}

	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
		int run_it = 0;

		/*
		 * This mutex is also taken before trying to set block groups
		 * readonly.  We need to make sure that nobody has set a block
		 * group readonly after extents from that block group have
		 * been allocated for cache files.  btrfs_set_block_group_ro
		 * will wait for the transaction to commit if it finds
		 * BTRFS_TRANS_DIRTY_BG_RUN set.
		 *
		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
		 * only one process starts all the block group IO.  It wouldn't
		 * hurt to have more than one go through, but there's no
		 * real advantage to it either.
		 */
		mutex_lock(&fs_info->ro_block_group_mutex);
		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
				      &cur_trans->flags))
			run_it = 1;
		mutex_unlock(&fs_info->ro_block_group_mutex);

		if (run_it)
			ret = btrfs_start_dirty_block_groups(trans, fs_info);
	}
	if (ret) {
		btrfs_end_transaction(trans);
		return ret;
	}

	spin_lock(&fs_info->trans_lock);
	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
		spin_unlock(&fs_info->trans_lock);
		atomic_inc(&cur_trans->use_count);
		ret = btrfs_end_transaction(trans);

		wait_for_commit(cur_trans);

		if (unlikely(cur_trans->aborted))
			ret = cur_trans->aborted;

		btrfs_put_transaction(cur_trans);

		return ret;
	}

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	if (cur_trans->list.prev != &fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (prev_trans->state != TRANS_STATE_COMPLETED) {
			atomic_inc(&prev_trans->use_count);
			spin_unlock(&fs_info->trans_lock);

			wait_for_commit(prev_trans);
			ret = prev_trans->aborted;

			btrfs_put_transaction(prev_trans);
			if (ret)
				goto cleanup_transaction;
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
	} else {
		spin_unlock(&fs_info->trans_lock);
	}

	extwriter_counter_dec(cur_trans, trans->type);

	ret = btrfs_start_delalloc_flush(fs_info);
	if (ret)
		goto cleanup_transaction;

	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret)
		goto cleanup_transaction;

	wait_event(cur_trans->writer_wait,
		   extwriter_counter_read(cur_trans) == 0);

	/* some pending stuff might have been added after the previous flush */
	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret)
		goto cleanup_transaction;

	btrfs_wait_delalloc_flush(fs_info);

	btrfs_wait_pending_ordered(cur_trans);

	btrfs_scrub_pause(fs_info);
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before
	 * setting COMMIT_DOING, so make sure to wait for num_writers to
	 * drop to 1 again.
	 */
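	/*
	 * The committing task itself still counts as one writer through
	 * @trans, so num_writers == 1 means every other handle attached to
	 * this transaction has been ended.
	 */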
	spin_lock(&fs_info->trans_lock);
	cur_trans->state = TRANS_STATE_COMMIT_DOING;
	spin_unlock(&fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

	/* ->aborted might be set after the previous check, so check it */
	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
		ret = cur_trans->aborted;
		goto scrub_continue;
	}
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&fs_info->reloc_mutex);

	/*
	 * We needn't worry about the delayed items because we will
	 * deal with them in create_pending_snapshot(), which is the
	 * core function of the snapshot creation.
	 */
	ret = create_pending_snapshots(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * We insert the dir indexes of the snapshots and update the inode
	 * of the snapshots' parents after the snapshot creation, so there
	 * are some delayed items which are not dealt with.  Now deal with
	 * them.
	 *
	 * We needn't worry that this operation will corrupt the snapshots,
	 * because all the trees that are snapshotted will be forced to COW
	 * the nodes and leaves.
	 */
	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/* Record old roots for later qgroup accounting */
	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(fs_info);

	WARN_ON(cur_trans != trans->transaction);

	/*
	 * btrfs_commit_tree_roots is responsible for getting the various
	 * roots consistent with each other.  Every pointer in the tree of
	 * tree roots has to point to the most up to date root for every
	 * subvolume and other tree.  So, we have to keep the tree logging
	 * code from jumping in and changing any of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log writers,
	 * but a little lower down we drop the trans mutex and let new people
	 * in.  By holding the tree_log_mutex from now until after the super
	 * is written, we avoid races with the tree-log code.
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->tree_log_mutex);
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * Since the transaction is done, we can apply the pending changes
	 * before the next transaction.
	 */
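	/*
	 * Pending changes are deferred switches of mount options (see
	 * btrfs_apply_pending_changes() below); applying them only at this
	 * point keeps a single transaction from running with a half-applied
	 * setting.
	 */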
2167 */ 2168 btrfs_apply_pending_changes(fs_info); 2169 2170 /* commit_fs_roots gets rid of all the tree log roots, it is now 2171 * safe to free the root of tree log roots 2172 */ 2173 btrfs_free_log_root_tree(trans, fs_info); 2174 2175 /* 2176 * Since fs roots are all committed, we can get a quite accurate 2177 * new_roots. So let's do quota accounting. 2178 */ 2179 ret = btrfs_qgroup_account_extents(trans, fs_info); 2180 if (ret < 0) { 2181 mutex_unlock(&fs_info->tree_log_mutex); 2182 mutex_unlock(&fs_info->reloc_mutex); 2183 goto scrub_continue; 2184 } 2185 2186 ret = commit_cowonly_roots(trans, fs_info); 2187 if (ret) { 2188 mutex_unlock(&fs_info->tree_log_mutex); 2189 mutex_unlock(&fs_info->reloc_mutex); 2190 goto scrub_continue; 2191 } 2192 2193 /* 2194 * The tasks which save the space cache and inode cache may also 2195 * update ->aborted, check it. 2196 */ 2197 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 2198 ret = cur_trans->aborted; 2199 mutex_unlock(&fs_info->tree_log_mutex); 2200 mutex_unlock(&fs_info->reloc_mutex); 2201 goto scrub_continue; 2202 } 2203 2204 btrfs_prepare_extent_commit(trans, fs_info); 2205 2206 cur_trans = fs_info->running_transaction; 2207 2208 btrfs_set_root_node(&fs_info->tree_root->root_item, 2209 fs_info->tree_root->node); 2210 list_add_tail(&fs_info->tree_root->dirty_list, 2211 &cur_trans->switch_commits); 2212 2213 btrfs_set_root_node(&fs_info->chunk_root->root_item, 2214 fs_info->chunk_root->node); 2215 list_add_tail(&fs_info->chunk_root->dirty_list, 2216 &cur_trans->switch_commits); 2217 2218 switch_commit_roots(cur_trans, fs_info); 2219 2220 assert_qgroups_uptodate(trans); 2221 ASSERT(list_empty(&cur_trans->dirty_bgs)); 2222 ASSERT(list_empty(&cur_trans->io_bgs)); 2223 update_super_roots(fs_info); 2224 2225 btrfs_set_super_log_root(fs_info->super_copy, 0); 2226 btrfs_set_super_log_root_level(fs_info->super_copy, 0); 2227 memcpy(fs_info->super_for_commit, fs_info->super_copy, 2228 sizeof(*fs_info->super_copy)); 2229 2230 btrfs_update_commit_device_size(fs_info); 2231 btrfs_update_commit_device_bytes_used(fs_info, cur_trans); 2232 2233 clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 2234 clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2235 2236 btrfs_trans_release_chunk_metadata(trans); 2237 2238 spin_lock(&fs_info->trans_lock); 2239 cur_trans->state = TRANS_STATE_UNBLOCKED; 2240 fs_info->running_transaction = NULL; 2241 spin_unlock(&fs_info->trans_lock); 2242 mutex_unlock(&fs_info->reloc_mutex); 2243 2244 wake_up(&fs_info->transaction_wait); 2245 2246 ret = btrfs_write_and_wait_transaction(trans, fs_info); 2247 if (ret) { 2248 btrfs_handle_fs_error(fs_info, ret, 2249 "Error while writing out transaction"); 2250 mutex_unlock(&fs_info->tree_log_mutex); 2251 goto scrub_continue; 2252 } 2253 2254 ret = write_ctree_super(trans, fs_info, 0); 2255 if (ret) { 2256 mutex_unlock(&fs_info->tree_log_mutex); 2257 goto scrub_continue; 2258 } 2259 2260 /* 2261 * the super is written, we can safely allow the tree-loggers 2262 * to go about their business 2263 */ 2264 mutex_unlock(&fs_info->tree_log_mutex); 2265 2266 btrfs_finish_extent_commit(trans, fs_info); 2267 2268 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) 2269 btrfs_clear_space_info_full(fs_info); 2270 2271 fs_info->last_trans_committed = cur_trans->transid; 2272 /* 2273 * We needn't acquire the lock here because there is no other task 2274 * which can change it. 
2275 */ 2276 cur_trans->state = TRANS_STATE_COMPLETED; 2277 wake_up(&cur_trans->commit_wait); 2278 2279 spin_lock(&fs_info->trans_lock); 2280 list_del_init(&cur_trans->list); 2281 spin_unlock(&fs_info->trans_lock); 2282 2283 btrfs_put_transaction(cur_trans); 2284 btrfs_put_transaction(cur_trans); 2285 2286 if (trans->type & __TRANS_FREEZABLE) 2287 sb_end_intwrite(fs_info->sb); 2288 2289 trace_btrfs_transaction_commit(trans->root); 2290 2291 btrfs_scrub_continue(fs_info); 2292 2293 if (current->journal_info == trans) 2294 current->journal_info = NULL; 2295 2296 kmem_cache_free(btrfs_trans_handle_cachep, trans); 2297 2298 /* 2299 * If fs has been frozen, we can not handle delayed iputs, otherwise 2300 * it'll result in deadlock about SB_FREEZE_FS. 2301 */ 2302 if (current != fs_info->transaction_kthread && 2303 current != fs_info->cleaner_kthread && !fs_info->fs_frozen) 2304 btrfs_run_delayed_iputs(fs_info); 2305 2306 return ret; 2307 2308 scrub_continue: 2309 btrfs_scrub_continue(fs_info); 2310 cleanup_transaction: 2311 btrfs_trans_release_metadata(trans, fs_info); 2312 btrfs_trans_release_chunk_metadata(trans); 2313 trans->block_rsv = NULL; 2314 btrfs_warn(fs_info, "Skipping commit of aborted transaction."); 2315 if (current->journal_info == trans) 2316 current->journal_info = NULL; 2317 cleanup_transaction(trans, trans->root, ret); 2318 2319 return ret; 2320 } 2321 2322 /* 2323 * return < 0 if error 2324 * 0 if there are no more dead_roots at the time of call 2325 * 1 there are more to be processed, call me again 2326 * 2327 * The return value indicates there are certainly more snapshots to delete, but 2328 * if there comes a new one during processing, it may return 0. We don't mind, 2329 * because btrfs_commit_super will poke cleaner thread and it will process it a 2330 * few seconds later. 2331 */ 2332 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) 2333 { 2334 int ret; 2335 struct btrfs_fs_info *fs_info = root->fs_info; 2336 2337 spin_lock(&fs_info->trans_lock); 2338 if (list_empty(&fs_info->dead_roots)) { 2339 spin_unlock(&fs_info->trans_lock); 2340 return 0; 2341 } 2342 root = list_first_entry(&fs_info->dead_roots, 2343 struct btrfs_root, root_list); 2344 list_del_init(&root->root_list); 2345 spin_unlock(&fs_info->trans_lock); 2346 2347 btrfs_debug(fs_info, "cleaner removing %llu", root->objectid); 2348 2349 btrfs_kill_all_delayed_nodes(root); 2350 2351 if (btrfs_header_backref_rev(root->node) < 2352 BTRFS_MIXED_BACKREF_REV) 2353 ret = btrfs_drop_snapshot(root, NULL, 0, 0); 2354 else 2355 ret = btrfs_drop_snapshot(root, NULL, 1, 0); 2356 2357 return (ret < 0) ? 0 : 1; 2358 } 2359 2360 void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) 2361 { 2362 unsigned long prev; 2363 unsigned long bit; 2364 2365 prev = xchg(&fs_info->pending_changes, 0); 2366 if (!prev) 2367 return; 2368 2369 bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; 2370 if (prev & bit) 2371 btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); 2372 prev &= ~bit; 2373 2374 bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; 2375 if (prev & bit) 2376 btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); 2377 prev &= ~bit; 2378 2379 bit = 1 << BTRFS_PENDING_COMMIT; 2380 if (prev & bit) 2381 btrfs_debug(fs_info, "pending commit done"); 2382 prev &= ~bit; 2383 2384 if (prev) 2385 btrfs_warn(fs_info, 2386 "unknown pending changes left 0x%lx, ignoring", prev); 2387 } 2388