1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/sort.h> 9 #include "ctree.h" 10 #include "delayed-ref.h" 11 #include "transaction.h" 12 #include "qgroup.h" 13 #include "space-info.h" 14 #include "tree-mod-log.h" 15 16 struct kmem_cache *btrfs_delayed_ref_head_cachep; 17 struct kmem_cache *btrfs_delayed_tree_ref_cachep; 18 struct kmem_cache *btrfs_delayed_data_ref_cachep; 19 struct kmem_cache *btrfs_delayed_extent_op_cachep; 20 /* 21 * delayed back reference update tracking. For subvolume trees 22 * we queue up extent allocations and backref maintenance for 23 * delayed processing. This avoids deep call chains where we 24 * add extents in the middle of btrfs_search_slot, and it allows 25 * us to buffer up frequently modified backrefs in an rb tree instead 26 * of hammering updates on the extent allocation tree. 27 */ 28 29 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 30 { 31 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 32 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 33 bool ret = false; 34 u64 reserved; 35 36 spin_lock(&global_rsv->lock); 37 reserved = global_rsv->reserved; 38 spin_unlock(&global_rsv->lock); 39 40 /* 41 * Since the global reserve is just kind of magic we don't really want 42 * to rely on it to save our bacon, so if our size is more than the 43 * delayed_refs_rsv and the global rsv then it's time to think about 44 * bailing. 45 */ 46 spin_lock(&delayed_refs_rsv->lock); 47 reserved += delayed_refs_rsv->reserved; 48 if (delayed_refs_rsv->size >= reserved) 49 ret = true; 50 spin_unlock(&delayed_refs_rsv->lock); 51 return ret; 52 } 53 54 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 55 { 56 u64 num_entries = 57 atomic_read(&trans->transaction->delayed_refs.num_entries); 58 u64 avg_runtime; 59 u64 val; 60 61 smp_mb(); 62 avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 63 val = num_entries * avg_runtime; 64 if (val >= NSEC_PER_SEC) 65 return 1; 66 if (val >= NSEC_PER_SEC / 2) 67 return 2; 68 69 return btrfs_check_space_for_delayed_refs(trans->fs_info); 70 } 71 72 /** 73 * Release a ref head's reservation 74 * 75 * @fs_info: the filesystem 76 * @nr: number of items to drop 77 * 78 * This drops the delayed ref head's count from the delayed refs rsv and frees 79 * any excess reservation we had. 80 */ 81 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 82 { 83 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 84 u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); 85 u64 released = 0; 86 87 /* 88 * We have to check the mount option here because we could be enabling 89 * the free space tree for the first time and don't have the compat_ro 90 * option set yet. 91 * 92 * We need extra reservations if we have the free space tree because 93 * we'll have to modify that tree as well. 94 */ 95 if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) 96 num_bytes *= 2; 97 98 released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 99 if (released) 100 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 101 0, released, 0); 102 } 103 104 /* 105 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv 106 * @trans - the trans that may have generated delayed refs 107 * 108 * This is to be called anytime we may have adjusted trans->delayed_ref_updates, 109 * it'll calculate the additional size and add it to the delayed_refs_rsv. 110 */ 111 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 112 { 113 struct btrfs_fs_info *fs_info = trans->fs_info; 114 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 115 u64 num_bytes; 116 117 if (!trans->delayed_ref_updates) 118 return; 119 120 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 121 trans->delayed_ref_updates); 122 /* 123 * We have to check the mount option here because we could be enabling 124 * the free space tree for the first time and don't have the compat_ro 125 * option set yet. 126 * 127 * We need extra reservations if we have the free space tree because 128 * we'll have to modify that tree as well. 129 */ 130 if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) 131 num_bytes *= 2; 132 133 spin_lock(&delayed_rsv->lock); 134 delayed_rsv->size += num_bytes; 135 delayed_rsv->full = 0; 136 spin_unlock(&delayed_rsv->lock); 137 trans->delayed_ref_updates = 0; 138 } 139 140 /** 141 * Transfer bytes to our delayed refs rsv 142 * 143 * @fs_info: the filesystem 144 * @src: source block rsv to transfer from 145 * @num_bytes: number of bytes to transfer 146 * 147 * This transfers up to the num_bytes amount from the src rsv to the 148 * delayed_refs_rsv. Any extra bytes are returned to the space info. 149 */ 150 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 151 struct btrfs_block_rsv *src, 152 u64 num_bytes) 153 { 154 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 155 u64 to_free = 0; 156 157 spin_lock(&src->lock); 158 src->reserved -= num_bytes; 159 src->size -= num_bytes; 160 spin_unlock(&src->lock); 161 162 spin_lock(&delayed_refs_rsv->lock); 163 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { 164 u64 delta = delayed_refs_rsv->size - 165 delayed_refs_rsv->reserved; 166 if (num_bytes > delta) { 167 to_free = num_bytes - delta; 168 num_bytes = delta; 169 } 170 } else { 171 to_free = num_bytes; 172 num_bytes = 0; 173 } 174 175 if (num_bytes) 176 delayed_refs_rsv->reserved += num_bytes; 177 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) 178 delayed_refs_rsv->full = 1; 179 spin_unlock(&delayed_refs_rsv->lock); 180 181 if (num_bytes) 182 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 183 0, num_bytes, 1); 184 if (to_free) 185 btrfs_space_info_free_bytes_may_use(fs_info, 186 delayed_refs_rsv->space_info, to_free); 187 } 188 189 /** 190 * Refill based on our delayed refs usage 191 * 192 * @fs_info: the filesystem 193 * @flush: control how we can flush for this reservation. 194 * 195 * This will refill the delayed block_rsv up to 1 items size worth of space and 196 * will return -ENOSPC if we can't make the reservation. 197 */ 198 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 199 enum btrfs_reserve_flush_enum flush) 200 { 201 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 202 u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); 203 u64 num_bytes = 0; 204 int ret = -ENOSPC; 205 206 spin_lock(&block_rsv->lock); 207 if (block_rsv->reserved < block_rsv->size) { 208 num_bytes = block_rsv->size - block_rsv->reserved; 209 num_bytes = min(num_bytes, limit); 210 } 211 spin_unlock(&block_rsv->lock); 212 213 if (!num_bytes) 214 return 0; 215 216 ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); 217 if (ret) 218 return ret; 219 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); 220 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 221 0, num_bytes, 1); 222 return 0; 223 } 224 225 /* 226 * compare two delayed tree backrefs with same bytenr and type 227 */ 228 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, 229 struct btrfs_delayed_tree_ref *ref2) 230 { 231 if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { 232 if (ref1->root < ref2->root) 233 return -1; 234 if (ref1->root > ref2->root) 235 return 1; 236 } else { 237 if (ref1->parent < ref2->parent) 238 return -1; 239 if (ref1->parent > ref2->parent) 240 return 1; 241 } 242 return 0; 243 } 244 245 /* 246 * compare two delayed data backrefs with same bytenr and type 247 */ 248 static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, 249 struct btrfs_delayed_data_ref *ref2) 250 { 251 if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { 252 if (ref1->root < ref2->root) 253 return -1; 254 if (ref1->root > ref2->root) 255 return 1; 256 if (ref1->objectid < ref2->objectid) 257 return -1; 258 if (ref1->objectid > ref2->objectid) 259 return 1; 260 if (ref1->offset < ref2->offset) 261 return -1; 262 if (ref1->offset > ref2->offset) 263 return 1; 264 } else { 265 if (ref1->parent < ref2->parent) 266 return -1; 267 if (ref1->parent > ref2->parent) 268 return 1; 269 } 270 return 0; 271 } 272 273 static int comp_refs(struct btrfs_delayed_ref_node *ref1, 274 struct btrfs_delayed_ref_node *ref2, 275 bool check_seq) 276 { 277 int ret = 0; 278 279 if (ref1->type < ref2->type) 280 return -1; 281 if (ref1->type > ref2->type) 282 return 1; 283 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 284 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) 285 ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), 286 btrfs_delayed_node_to_tree_ref(ref2)); 287 else 288 ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), 289 btrfs_delayed_node_to_data_ref(ref2)); 290 if (ret) 291 return ret; 292 if (check_seq) { 293 if (ref1->seq < ref2->seq) 294 return -1; 295 if (ref1->seq > ref2->seq) 296 return 1; 297 } 298 return 0; 299 } 300 301 /* insert a new ref to head ref rbtree */ 302 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, 303 struct rb_node *node) 304 { 305 struct rb_node **p = &root->rb_root.rb_node; 306 struct rb_node *parent_node = NULL; 307 struct btrfs_delayed_ref_head *entry; 308 struct btrfs_delayed_ref_head *ins; 309 u64 bytenr; 310 bool leftmost = true; 311 312 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); 313 bytenr = ins->bytenr; 314 while (*p) { 315 parent_node = *p; 316 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, 317 href_node); 318 319 if (bytenr < entry->bytenr) { 320 p = &(*p)->rb_left; 321 } else if (bytenr > entry->bytenr) { 322 p = &(*p)->rb_right; 323 leftmost = false; 324 } else { 325 return entry; 326 } 327 } 328 329 rb_link_node(node, parent_node, p); 330 rb_insert_color_cached(node, root, leftmost); 331 return NULL; 332 } 333 334 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, 335 struct btrfs_delayed_ref_node *ins) 336 { 337 struct rb_node **p = &root->rb_root.rb_node; 338 struct rb_node *node = &ins->ref_node; 339 struct rb_node *parent_node = NULL; 340 struct btrfs_delayed_ref_node *entry; 341 bool leftmost = true; 342 343 while (*p) { 344 int comp; 345 346 parent_node = *p; 347 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 348 ref_node); 349 comp = comp_refs(ins, entry, true); 350 if (comp < 0) { 351 p = &(*p)->rb_left; 352 } else if (comp > 0) { 353 p = &(*p)->rb_right; 354 leftmost = false; 355 } else { 356 return entry; 357 } 358 } 359 360 rb_link_node(node, parent_node, p); 361 rb_insert_color_cached(node, root, leftmost); 362 return NULL; 363 } 364 365 static struct btrfs_delayed_ref_head *find_first_ref_head( 366 struct btrfs_delayed_ref_root *dr) 367 { 368 struct rb_node *n; 369 struct btrfs_delayed_ref_head *entry; 370 371 n = rb_first_cached(&dr->href_root); 372 if (!n) 373 return NULL; 374 375 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 376 377 return entry; 378 } 379 380 /* 381 * Find a head entry based on bytenr. This returns the delayed ref head if it 382 * was able to find one, or NULL if nothing was in that spot. If return_bigger 383 * is given, the next bigger entry is returned if no exact match is found. 384 */ 385 static struct btrfs_delayed_ref_head *find_ref_head( 386 struct btrfs_delayed_ref_root *dr, u64 bytenr, 387 bool return_bigger) 388 { 389 struct rb_root *root = &dr->href_root.rb_root; 390 struct rb_node *n; 391 struct btrfs_delayed_ref_head *entry; 392 393 n = root->rb_node; 394 entry = NULL; 395 while (n) { 396 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 397 398 if (bytenr < entry->bytenr) 399 n = n->rb_left; 400 else if (bytenr > entry->bytenr) 401 n = n->rb_right; 402 else 403 return entry; 404 } 405 if (entry && return_bigger) { 406 if (bytenr > entry->bytenr) { 407 n = rb_next(&entry->href_node); 408 if (!n) 409 return NULL; 410 entry = rb_entry(n, struct btrfs_delayed_ref_head, 411 href_node); 412 } 413 return entry; 414 } 415 return NULL; 416 } 417 418 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, 419 struct btrfs_delayed_ref_head *head) 420 { 421 lockdep_assert_held(&delayed_refs->lock); 422 if (mutex_trylock(&head->mutex)) 423 return 0; 424 425 refcount_inc(&head->refs); 426 spin_unlock(&delayed_refs->lock); 427 428 mutex_lock(&head->mutex); 429 spin_lock(&delayed_refs->lock); 430 if (RB_EMPTY_NODE(&head->href_node)) { 431 mutex_unlock(&head->mutex); 432 btrfs_put_delayed_ref_head(head); 433 return -EAGAIN; 434 } 435 btrfs_put_delayed_ref_head(head); 436 return 0; 437 } 438 439 static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, 440 struct btrfs_delayed_ref_root *delayed_refs, 441 struct btrfs_delayed_ref_head *head, 442 struct btrfs_delayed_ref_node *ref) 443 { 444 lockdep_assert_held(&head->lock); 445 rb_erase_cached(&ref->ref_node, &head->ref_tree); 446 RB_CLEAR_NODE(&ref->ref_node); 447 if (!list_empty(&ref->add_list)) 448 list_del(&ref->add_list); 449 ref->in_tree = 0; 450 btrfs_put_delayed_ref(ref); 451 atomic_dec(&delayed_refs->num_entries); 452 } 453 454 static bool merge_ref(struct btrfs_trans_handle *trans, 455 struct btrfs_delayed_ref_root *delayed_refs, 456 struct btrfs_delayed_ref_head *head, 457 struct btrfs_delayed_ref_node *ref, 458 u64 seq) 459 { 460 struct btrfs_delayed_ref_node *next; 461 struct rb_node *node = rb_next(&ref->ref_node); 462 bool done = false; 463 464 while (!done && node) { 465 int mod; 466 467 next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 468 node = rb_next(node); 469 if (seq && next->seq >= seq) 470 break; 471 if (comp_refs(ref, next, false)) 472 break; 473 474 if (ref->action == next->action) { 475 mod = next->ref_mod; 476 } else { 477 if (ref->ref_mod < next->ref_mod) { 478 swap(ref, next); 479 done = true; 480 } 481 mod = -next->ref_mod; 482 } 483 484 drop_delayed_ref(trans, delayed_refs, head, next); 485 ref->ref_mod += mod; 486 if (ref->ref_mod == 0) { 487 drop_delayed_ref(trans, delayed_refs, head, ref); 488 done = true; 489 } else { 490 /* 491 * Can't have multiples of the same ref on a tree block. 492 */ 493 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 494 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 495 } 496 } 497 498 return done; 499 } 500 501 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 502 struct btrfs_delayed_ref_root *delayed_refs, 503 struct btrfs_delayed_ref_head *head) 504 { 505 struct btrfs_fs_info *fs_info = trans->fs_info; 506 struct btrfs_delayed_ref_node *ref; 507 struct rb_node *node; 508 u64 seq = 0; 509 510 lockdep_assert_held(&head->lock); 511 512 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 513 return; 514 515 /* We don't have too many refs to merge for data. */ 516 if (head->is_data) 517 return; 518 519 seq = btrfs_tree_mod_log_lowest_seq(fs_info); 520 again: 521 for (node = rb_first_cached(&head->ref_tree); node; 522 node = rb_next(node)) { 523 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 524 if (seq && ref->seq >= seq) 525 continue; 526 if (merge_ref(trans, delayed_refs, head, ref, seq)) 527 goto again; 528 } 529 } 530 531 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) 532 { 533 int ret = 0; 534 u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); 535 536 if (min_seq != 0 && seq >= min_seq) { 537 btrfs_debug(fs_info, 538 "holding back delayed_ref %llu, lowest is %llu", 539 seq, min_seq); 540 ret = 1; 541 } 542 543 return ret; 544 } 545 546 struct btrfs_delayed_ref_head *btrfs_select_ref_head( 547 struct btrfs_delayed_ref_root *delayed_refs) 548 { 549 struct btrfs_delayed_ref_head *head; 550 551 again: 552 head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 553 true); 554 if (!head && delayed_refs->run_delayed_start != 0) { 555 delayed_refs->run_delayed_start = 0; 556 head = find_first_ref_head(delayed_refs); 557 } 558 if (!head) 559 return NULL; 560 561 while (head->processing) { 562 struct rb_node *node; 563 564 node = rb_next(&head->href_node); 565 if (!node) { 566 if (delayed_refs->run_delayed_start == 0) 567 return NULL; 568 delayed_refs->run_delayed_start = 0; 569 goto again; 570 } 571 head = rb_entry(node, struct btrfs_delayed_ref_head, 572 href_node); 573 } 574 575 head->processing = 1; 576 WARN_ON(delayed_refs->num_heads_ready == 0); 577 delayed_refs->num_heads_ready--; 578 delayed_refs->run_delayed_start = head->bytenr + 579 head->num_bytes; 580 return head; 581 } 582 583 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 584 struct btrfs_delayed_ref_head *head) 585 { 586 lockdep_assert_held(&delayed_refs->lock); 587 lockdep_assert_held(&head->lock); 588 589 rb_erase_cached(&head->href_node, &delayed_refs->href_root); 590 RB_CLEAR_NODE(&head->href_node); 591 atomic_dec(&delayed_refs->num_entries); 592 delayed_refs->num_heads--; 593 if (head->processing == 0) 594 delayed_refs->num_heads_ready--; 595 } 596 597 /* 598 * Helper to insert the ref_node to the tail or merge with tail. 599 * 600 * Return 0 for insert. 601 * Return >0 for merge. 602 */ 603 static int insert_delayed_ref(struct btrfs_trans_handle *trans, 604 struct btrfs_delayed_ref_root *root, 605 struct btrfs_delayed_ref_head *href, 606 struct btrfs_delayed_ref_node *ref) 607 { 608 struct btrfs_delayed_ref_node *exist; 609 int mod; 610 int ret = 0; 611 612 spin_lock(&href->lock); 613 exist = tree_insert(&href->ref_tree, ref); 614 if (!exist) 615 goto inserted; 616 617 /* Now we are sure we can merge */ 618 ret = 1; 619 if (exist->action == ref->action) { 620 mod = ref->ref_mod; 621 } else { 622 /* Need to change action */ 623 if (exist->ref_mod < ref->ref_mod) { 624 exist->action = ref->action; 625 mod = -exist->ref_mod; 626 exist->ref_mod = ref->ref_mod; 627 if (ref->action == BTRFS_ADD_DELAYED_REF) 628 list_add_tail(&exist->add_list, 629 &href->ref_add_list); 630 else if (ref->action == BTRFS_DROP_DELAYED_REF) { 631 ASSERT(!list_empty(&exist->add_list)); 632 list_del(&exist->add_list); 633 } else { 634 ASSERT(0); 635 } 636 } else 637 mod = -ref->ref_mod; 638 } 639 exist->ref_mod += mod; 640 641 /* remove existing tail if its ref_mod is zero */ 642 if (exist->ref_mod == 0) 643 drop_delayed_ref(trans, root, href, exist); 644 spin_unlock(&href->lock); 645 return ret; 646 inserted: 647 if (ref->action == BTRFS_ADD_DELAYED_REF) 648 list_add_tail(&ref->add_list, &href->ref_add_list); 649 atomic_inc(&root->num_entries); 650 spin_unlock(&href->lock); 651 return ret; 652 } 653 654 /* 655 * helper function to update the accounting in the head ref 656 * existing and update must have the same bytenr 657 */ 658 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, 659 struct btrfs_delayed_ref_head *existing, 660 struct btrfs_delayed_ref_head *update) 661 { 662 struct btrfs_delayed_ref_root *delayed_refs = 663 &trans->transaction->delayed_refs; 664 struct btrfs_fs_info *fs_info = trans->fs_info; 665 int old_ref_mod; 666 667 BUG_ON(existing->is_data != update->is_data); 668 669 spin_lock(&existing->lock); 670 if (update->must_insert_reserved) { 671 /* if the extent was freed and then 672 * reallocated before the delayed ref 673 * entries were processed, we can end up 674 * with an existing head ref without 675 * the must_insert_reserved flag set. 676 * Set it again here 677 */ 678 existing->must_insert_reserved = update->must_insert_reserved; 679 680 /* 681 * update the num_bytes so we make sure the accounting 682 * is done correctly 683 */ 684 existing->num_bytes = update->num_bytes; 685 686 } 687 688 if (update->extent_op) { 689 if (!existing->extent_op) { 690 existing->extent_op = update->extent_op; 691 } else { 692 if (update->extent_op->update_key) { 693 memcpy(&existing->extent_op->key, 694 &update->extent_op->key, 695 sizeof(update->extent_op->key)); 696 existing->extent_op->update_key = true; 697 } 698 if (update->extent_op->update_flags) { 699 existing->extent_op->flags_to_set |= 700 update->extent_op->flags_to_set; 701 existing->extent_op->update_flags = true; 702 } 703 btrfs_free_delayed_extent_op(update->extent_op); 704 } 705 } 706 /* 707 * update the reference mod on the head to reflect this new operation, 708 * only need the lock for this case cause we could be processing it 709 * currently, for refs we just added we know we're a-ok. 710 */ 711 old_ref_mod = existing->total_ref_mod; 712 existing->ref_mod += update->ref_mod; 713 existing->total_ref_mod += update->ref_mod; 714 715 /* 716 * If we are going to from a positive ref mod to a negative or vice 717 * versa we need to make sure to adjust pending_csums accordingly. 718 */ 719 if (existing->is_data) { 720 u64 csum_leaves = 721 btrfs_csum_bytes_to_leaves(fs_info, 722 existing->num_bytes); 723 724 if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { 725 delayed_refs->pending_csums -= existing->num_bytes; 726 btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); 727 } 728 if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { 729 delayed_refs->pending_csums += existing->num_bytes; 730 trans->delayed_ref_updates += csum_leaves; 731 } 732 } 733 734 spin_unlock(&existing->lock); 735 } 736 737 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, 738 struct btrfs_qgroup_extent_record *qrecord, 739 u64 bytenr, u64 num_bytes, u64 ref_root, 740 u64 reserved, int action, bool is_data, 741 bool is_system) 742 { 743 int count_mod = 1; 744 int must_insert_reserved = 0; 745 746 /* If reserved is provided, it must be a data extent. */ 747 BUG_ON(!is_data && reserved); 748 749 /* 750 * The head node stores the sum of all the mods, so dropping a ref 751 * should drop the sum in the head node by one. 752 */ 753 if (action == BTRFS_UPDATE_DELAYED_HEAD) 754 count_mod = 0; 755 else if (action == BTRFS_DROP_DELAYED_REF) 756 count_mod = -1; 757 758 /* 759 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved 760 * accounting when the extent is finally added, or if a later 761 * modification deletes the delayed ref without ever inserting the 762 * extent into the extent allocation tree. ref->must_insert_reserved 763 * is the flag used to record that accounting mods are required. 764 * 765 * Once we record must_insert_reserved, switch the action to 766 * BTRFS_ADD_DELAYED_REF because other special casing is not required. 767 */ 768 if (action == BTRFS_ADD_DELAYED_EXTENT) 769 must_insert_reserved = 1; 770 else 771 must_insert_reserved = 0; 772 773 refcount_set(&head_ref->refs, 1); 774 head_ref->bytenr = bytenr; 775 head_ref->num_bytes = num_bytes; 776 head_ref->ref_mod = count_mod; 777 head_ref->must_insert_reserved = must_insert_reserved; 778 head_ref->is_data = is_data; 779 head_ref->is_system = is_system; 780 head_ref->ref_tree = RB_ROOT_CACHED; 781 INIT_LIST_HEAD(&head_ref->ref_add_list); 782 RB_CLEAR_NODE(&head_ref->href_node); 783 head_ref->processing = 0; 784 head_ref->total_ref_mod = count_mod; 785 spin_lock_init(&head_ref->lock); 786 mutex_init(&head_ref->mutex); 787 788 if (qrecord) { 789 if (ref_root && reserved) { 790 qrecord->data_rsv = reserved; 791 qrecord->data_rsv_refroot = ref_root; 792 } 793 qrecord->bytenr = bytenr; 794 qrecord->num_bytes = num_bytes; 795 qrecord->old_roots = NULL; 796 } 797 } 798 799 /* 800 * helper function to actually insert a head node into the rbtree. 801 * this does all the dirty work in terms of maintaining the correct 802 * overall modification count. 803 */ 804 static noinline struct btrfs_delayed_ref_head * 805 add_delayed_ref_head(struct btrfs_trans_handle *trans, 806 struct btrfs_delayed_ref_head *head_ref, 807 struct btrfs_qgroup_extent_record *qrecord, 808 int action, int *qrecord_inserted_ret) 809 { 810 struct btrfs_delayed_ref_head *existing; 811 struct btrfs_delayed_ref_root *delayed_refs; 812 int qrecord_inserted = 0; 813 814 delayed_refs = &trans->transaction->delayed_refs; 815 816 /* Record qgroup extent info if provided */ 817 if (qrecord) { 818 if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, 819 delayed_refs, qrecord)) 820 kfree(qrecord); 821 else 822 qrecord_inserted = 1; 823 } 824 825 trace_add_delayed_ref_head(trans->fs_info, head_ref, action); 826 827 existing = htree_insert(&delayed_refs->href_root, 828 &head_ref->href_node); 829 if (existing) { 830 update_existing_head_ref(trans, existing, head_ref); 831 /* 832 * we've updated the existing ref, free the newly 833 * allocated ref 834 */ 835 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 836 head_ref = existing; 837 } else { 838 if (head_ref->is_data && head_ref->ref_mod < 0) { 839 delayed_refs->pending_csums += head_ref->num_bytes; 840 trans->delayed_ref_updates += 841 btrfs_csum_bytes_to_leaves(trans->fs_info, 842 head_ref->num_bytes); 843 } 844 delayed_refs->num_heads++; 845 delayed_refs->num_heads_ready++; 846 atomic_inc(&delayed_refs->num_entries); 847 trans->delayed_ref_updates++; 848 } 849 if (qrecord_inserted_ret) 850 *qrecord_inserted_ret = qrecord_inserted; 851 852 return head_ref; 853 } 854 855 /* 856 * init_delayed_ref_common - Initialize the structure which represents a 857 * modification to a an extent. 858 * 859 * @fs_info: Internal to the mounted filesystem mount structure. 860 * 861 * @ref: The structure which is going to be initialized. 862 * 863 * @bytenr: The logical address of the extent for which a modification is 864 * going to be recorded. 865 * 866 * @num_bytes: Size of the extent whose modification is being recorded. 867 * 868 * @ref_root: The id of the root where this modification has originated, this 869 * can be either one of the well-known metadata trees or the 870 * subvolume id which references this extent. 871 * 872 * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or 873 * BTRFS_ADD_DELAYED_EXTENT 874 * 875 * @ref_type: Holds the type of the extent which is being recorded, can be 876 * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY 877 * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ 878 * BTRFS_EXTENT_DATA_REF_KEY when recording data extent 879 */ 880 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, 881 struct btrfs_delayed_ref_node *ref, 882 u64 bytenr, u64 num_bytes, u64 ref_root, 883 int action, u8 ref_type) 884 { 885 u64 seq = 0; 886 887 if (action == BTRFS_ADD_DELAYED_EXTENT) 888 action = BTRFS_ADD_DELAYED_REF; 889 890 if (is_fstree(ref_root)) 891 seq = atomic64_read(&fs_info->tree_mod_seq); 892 893 refcount_set(&ref->refs, 1); 894 ref->bytenr = bytenr; 895 ref->num_bytes = num_bytes; 896 ref->ref_mod = 1; 897 ref->action = action; 898 ref->is_head = 0; 899 ref->in_tree = 1; 900 ref->seq = seq; 901 ref->type = ref_type; 902 RB_CLEAR_NODE(&ref->ref_node); 903 INIT_LIST_HEAD(&ref->add_list); 904 } 905 906 /* 907 * add a delayed tree ref. This does all of the accounting required 908 * to make sure the delayed ref is eventually processed before this 909 * transaction commits. 910 */ 911 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 912 struct btrfs_ref *generic_ref, 913 struct btrfs_delayed_extent_op *extent_op) 914 { 915 struct btrfs_fs_info *fs_info = trans->fs_info; 916 struct btrfs_delayed_tree_ref *ref; 917 struct btrfs_delayed_ref_head *head_ref; 918 struct btrfs_delayed_ref_root *delayed_refs; 919 struct btrfs_qgroup_extent_record *record = NULL; 920 int qrecord_inserted; 921 bool is_system; 922 int action = generic_ref->action; 923 int level = generic_ref->tree_ref.level; 924 int ret; 925 u64 bytenr = generic_ref->bytenr; 926 u64 num_bytes = generic_ref->len; 927 u64 parent = generic_ref->parent; 928 u8 ref_type; 929 930 is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); 931 932 ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); 933 ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); 934 if (!ref) 935 return -ENOMEM; 936 937 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 938 if (!head_ref) { 939 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 940 return -ENOMEM; 941 } 942 943 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && 944 !generic_ref->skip_qgroup) { 945 record = kzalloc(sizeof(*record), GFP_NOFS); 946 if (!record) { 947 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 948 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 949 return -ENOMEM; 950 } 951 } 952 953 if (parent) 954 ref_type = BTRFS_SHARED_BLOCK_REF_KEY; 955 else 956 ref_type = BTRFS_TREE_BLOCK_REF_KEY; 957 958 init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, 959 generic_ref->tree_ref.owning_root, action, 960 ref_type); 961 ref->root = generic_ref->tree_ref.owning_root; 962 ref->parent = parent; 963 ref->level = level; 964 965 init_delayed_ref_head(head_ref, record, bytenr, num_bytes, 966 generic_ref->tree_ref.owning_root, 0, action, 967 false, is_system); 968 head_ref->extent_op = extent_op; 969 970 delayed_refs = &trans->transaction->delayed_refs; 971 spin_lock(&delayed_refs->lock); 972 973 /* 974 * insert both the head node and the new ref without dropping 975 * the spin lock 976 */ 977 head_ref = add_delayed_ref_head(trans, head_ref, record, 978 action, &qrecord_inserted); 979 980 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); 981 spin_unlock(&delayed_refs->lock); 982 983 /* 984 * Need to update the delayed_refs_rsv with any changes we may have 985 * made. 986 */ 987 btrfs_update_delayed_refs_rsv(trans); 988 989 trace_add_delayed_tree_ref(fs_info, &ref->node, ref, 990 action == BTRFS_ADD_DELAYED_EXTENT ? 991 BTRFS_ADD_DELAYED_REF : action); 992 if (ret > 0) 993 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); 994 995 if (qrecord_inserted) 996 btrfs_qgroup_trace_extent_post(trans, record); 997 998 return 0; 999 } 1000 1001 /* 1002 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 1003 */ 1004 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 1005 struct btrfs_ref *generic_ref, 1006 u64 reserved) 1007 { 1008 struct btrfs_fs_info *fs_info = trans->fs_info; 1009 struct btrfs_delayed_data_ref *ref; 1010 struct btrfs_delayed_ref_head *head_ref; 1011 struct btrfs_delayed_ref_root *delayed_refs; 1012 struct btrfs_qgroup_extent_record *record = NULL; 1013 int qrecord_inserted; 1014 int action = generic_ref->action; 1015 int ret; 1016 u64 bytenr = generic_ref->bytenr; 1017 u64 num_bytes = generic_ref->len; 1018 u64 parent = generic_ref->parent; 1019 u64 ref_root = generic_ref->data_ref.owning_root; 1020 u64 owner = generic_ref->data_ref.ino; 1021 u64 offset = generic_ref->data_ref.offset; 1022 u8 ref_type; 1023 1024 ASSERT(generic_ref->type == BTRFS_REF_DATA && action); 1025 ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); 1026 if (!ref) 1027 return -ENOMEM; 1028 1029 if (parent) 1030 ref_type = BTRFS_SHARED_DATA_REF_KEY; 1031 else 1032 ref_type = BTRFS_EXTENT_DATA_REF_KEY; 1033 init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, 1034 ref_root, action, ref_type); 1035 ref->root = ref_root; 1036 ref->parent = parent; 1037 ref->objectid = owner; 1038 ref->offset = offset; 1039 1040 1041 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 1042 if (!head_ref) { 1043 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 1044 return -ENOMEM; 1045 } 1046 1047 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && 1048 !generic_ref->skip_qgroup) { 1049 record = kzalloc(sizeof(*record), GFP_NOFS); 1050 if (!record) { 1051 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 1052 kmem_cache_free(btrfs_delayed_ref_head_cachep, 1053 head_ref); 1054 return -ENOMEM; 1055 } 1056 } 1057 1058 init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, 1059 reserved, action, true, false); 1060 head_ref->extent_op = NULL; 1061 1062 delayed_refs = &trans->transaction->delayed_refs; 1063 spin_lock(&delayed_refs->lock); 1064 1065 /* 1066 * insert both the head node and the new ref without dropping 1067 * the spin lock 1068 */ 1069 head_ref = add_delayed_ref_head(trans, head_ref, record, 1070 action, &qrecord_inserted); 1071 1072 ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); 1073 spin_unlock(&delayed_refs->lock); 1074 1075 /* 1076 * Need to update the delayed_refs_rsv with any changes we may have 1077 * made. 1078 */ 1079 btrfs_update_delayed_refs_rsv(trans); 1080 1081 trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, 1082 action == BTRFS_ADD_DELAYED_EXTENT ? 1083 BTRFS_ADD_DELAYED_REF : action); 1084 if (ret > 0) 1085 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); 1086 1087 1088 if (qrecord_inserted) 1089 return btrfs_qgroup_trace_extent_post(trans, record); 1090 return 0; 1091 } 1092 1093 int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 1094 u64 bytenr, u64 num_bytes, 1095 struct btrfs_delayed_extent_op *extent_op) 1096 { 1097 struct btrfs_delayed_ref_head *head_ref; 1098 struct btrfs_delayed_ref_root *delayed_refs; 1099 1100 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 1101 if (!head_ref) 1102 return -ENOMEM; 1103 1104 init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, 1105 BTRFS_UPDATE_DELAYED_HEAD, false, false); 1106 head_ref->extent_op = extent_op; 1107 1108 delayed_refs = &trans->transaction->delayed_refs; 1109 spin_lock(&delayed_refs->lock); 1110 1111 add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, 1112 NULL); 1113 1114 spin_unlock(&delayed_refs->lock); 1115 1116 /* 1117 * Need to update the delayed_refs_rsv with any changes we may have 1118 * made. 1119 */ 1120 btrfs_update_delayed_refs_rsv(trans); 1121 return 0; 1122 } 1123 1124 /* 1125 * This does a simple search for the head node for a given extent. Returns the 1126 * head node if found, or NULL if not. 1127 */ 1128 struct btrfs_delayed_ref_head * 1129 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) 1130 { 1131 lockdep_assert_held(&delayed_refs->lock); 1132 1133 return find_ref_head(delayed_refs, bytenr, false); 1134 } 1135 1136 void __cold btrfs_delayed_ref_exit(void) 1137 { 1138 kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 1139 kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); 1140 kmem_cache_destroy(btrfs_delayed_data_ref_cachep); 1141 kmem_cache_destroy(btrfs_delayed_extent_op_cachep); 1142 } 1143 1144 int __init btrfs_delayed_ref_init(void) 1145 { 1146 btrfs_delayed_ref_head_cachep = kmem_cache_create( 1147 "btrfs_delayed_ref_head", 1148 sizeof(struct btrfs_delayed_ref_head), 0, 1149 SLAB_MEM_SPREAD, NULL); 1150 if (!btrfs_delayed_ref_head_cachep) 1151 goto fail; 1152 1153 btrfs_delayed_tree_ref_cachep = kmem_cache_create( 1154 "btrfs_delayed_tree_ref", 1155 sizeof(struct btrfs_delayed_tree_ref), 0, 1156 SLAB_MEM_SPREAD, NULL); 1157 if (!btrfs_delayed_tree_ref_cachep) 1158 goto fail; 1159 1160 btrfs_delayed_data_ref_cachep = kmem_cache_create( 1161 "btrfs_delayed_data_ref", 1162 sizeof(struct btrfs_delayed_data_ref), 0, 1163 SLAB_MEM_SPREAD, NULL); 1164 if (!btrfs_delayed_data_ref_cachep) 1165 goto fail; 1166 1167 btrfs_delayed_extent_op_cachep = kmem_cache_create( 1168 "btrfs_delayed_extent_op", 1169 sizeof(struct btrfs_delayed_extent_op), 0, 1170 SLAB_MEM_SPREAD, NULL); 1171 if (!btrfs_delayed_extent_op_cachep) 1172 goto fail; 1173 1174 return 0; 1175 fail: 1176 btrfs_delayed_ref_exit(); 1177 return -ENOMEM; 1178 } 1179