/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include <linux/sort.h>
#include <linux/ftrace.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * delayed back reference update tracking. For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing. This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 *
 * Right now this code is only used for reference counted trees, but
 * the long term goal is to get rid of the similar code for delayed
 * extent tree modifications.
 */

/*
 * entries in the rb tree are ordered by the byte number of the extent
 * and by the byte number of the parent block.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref,
		      u64 bytenr, u64 parent)
{
	if (bytenr < ref->bytenr)
		return -1;
	if (bytenr > ref->bytenr)
		return 1;
	if (parent < ref->parent)
		return -1;
	if (parent > ref->parent)
		return 1;
	return 0;
}

/*
 * insert a new ref into the rbtree. This returns any existing refs
 * for the same (bytenr,parent) tuple, or NULL if the new node was properly
 * inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
						  u64 bytenr, u64 parent,
						  struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 rb_node);

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			p = &(*p)->rb_left;
		else if (cmp > 0)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
	rb_link_node(node, parent_node, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * find an entry based on (bytenr,parent).
 * This returns the delayed ref if it was able to find one, or NULL if
 * nothing was in that spot. If 'last' is non-NULL it is set to the last
 * entry examined during the search, which gives callers a node near
 * 'bytenr' even when no exact match exists.
 */
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
						  u64 bytenr, u64 parent,
						  struct btrfs_delayed_ref_node **last)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int cmp;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);
		if (last)
			*last = entry;

		cmp = comp_entry(entry, bytenr, parent);
		if (cmp < 0)
			n = n->rb_left;
		else if (cmp > 0)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * lock the mutex on a delayed ref head. The delayed_refs spinlock must
 * already be held. If the trylock fails we drop the spinlock, block on
 * the mutex and retake the spinlock; -EAGAIN is returned if the head was
 * run and removed from the tree while we waited.
 */
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
			   struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	assert_spin_locked(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	atomic_inc(&head->node.refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (!head->node.in_tree) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref(&head->node);
	return 0;
}

/*
 * collect a cluster of up to 32 delayed ref heads, starting at or near
 * 'start', onto the 'cluster' list. Heads that are already on a cluster
 * are skipped. Returns 0 if any heads were added and 1 if none were found.
 */
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
			   struct list_head *cluster, u64 start)
{
	int count = 0;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;

	delayed_refs = &trans->transaction->delayed_refs;
	if (start == 0) {
		node = rb_first(&delayed_refs->root);
	} else {
		ref = NULL;
		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
		if (ref) {
			struct btrfs_delayed_ref_node *tmp;

			node = rb_prev(&ref->rb_node);
			while (node) {
				tmp = rb_entry(node,
					       struct btrfs_delayed_ref_node,
					       rb_node);
				if (tmp->bytenr < start)
					break;
				ref = tmp;
				node = rb_prev(&ref->rb_node);
			}
			node = &ref->rb_node;
		} else
			node = rb_first(&delayed_refs->root);
	}
again:
	while (node && count < 32) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (btrfs_delayed_ref_is_head(ref)) {
			head = btrfs_delayed_node_to_head(ref);
			if (list_empty(&head->cluster)) {
				list_add_tail(&head->cluster, cluster);
				delayed_refs->run_delayed_start =
					head->node.bytenr;
				count++;

				WARN_ON(delayed_refs->num_heads_ready == 0);
				delayed_refs->num_heads_ready--;
			} else if (count) {
				/* the goal of the clustering is to find extents
				 * that are likely to end up in the same extent
				 * leaf on disk. So, we don't want them spread
				 * all over the tree. Stop now if we've hit
				 * a head that was already in use
				 */
				break;
			}
		}
		node = rb_next(node);
	}
	if (count) {
		return 0;
	} else if (start) {
		/*
		 * we've gone to the end of the rbtree without finding any
		 * clusters. start from the beginning and try again
		 */
		start = 0;
		node = rb_first(&delayed_refs->root);
		goto again;
	}
	return 1;
}

/*
 * This checks to see if there are any delayed refs in the
 * btree for a given bytenr. It returns one if it finds any
 * and zero otherwise.
 *
 * If it only finds a head node, it returns 0.
 *
 * The idea is to use this when deciding if you can safely delete an
 * extent from the extent allocation tree. There may be a pending
 * ref in the rbtree that adds or removes references, so as long as this
 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
 * allocation tree.
 */
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct rb_node *prev_node;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref) {
		prev_node = rb_prev(&ref->rb_node);
		if (!prev_node)
			goto out;
		ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
			       rb_node);
		if (ref->bytenr == bytenr)
			ret = 1;
	}
out:
	spin_unlock(&delayed_refs->lock);
	return ret;
}

/*
 * helper function to lookup reference count
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. This way you
 * can check to see what the reference count would be if all of the
 * delayed refs are processed.
 */
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr,
			    u64 num_bytes, u32 *refs)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 num_refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;
	delayed_refs = &trans->transaction->delayed_refs;
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0) {
		/* the delayed ref spinlock is not held yet on this path */
		goto out_free;
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_extent_item);
		num_refs = btrfs_extent_refs(leaf, ei);
	} else {
		num_refs = 0;
		ret = 0;
	}

	spin_lock(&delayed_refs->lock);
	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref) {
		head = btrfs_delayed_node_to_head(ref);
		if (mutex_trylock(&head->mutex)) {
			num_refs += ref->ref_mod;
			mutex_unlock(&head->mutex);
			*refs = num_refs;
			goto out;
		}

		atomic_inc(&ref->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(root->fs_info->extent_root, path);

		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(ref);
		goto again;
	} else {
		*refs = num_refs;
	}
out:
	spin_unlock(&delayed_refs->lock);
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update an extent delayed ref in the
 * rbtree. existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
		    struct btrfs_delayed_ref_root *delayed_refs,
		    struct btrfs_delayed_ref_node *existing,
		    struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref *existing_ref;
	struct btrfs_delayed_ref *ref;

	existing_ref = btrfs_delayed_node_to_ref(existing);
	ref = btrfs_delayed_node_to_ref(update);

	if (ref->pin)
		existing_ref->pin = 1;

	if (ref->action != existing_ref->action) {
		/*
		 * this is effectively undoing either an add or a
		 * drop. We decrement the ref_mod, and if it goes
		 * down to zero we just delete the entry without
		 * ever changing the extent allocation tree.
		 */
		existing->ref_mod--;
		if (existing->ref_mod == 0) {
			rb_erase(&existing->rb_node,
				 &delayed_refs->root);
			existing->in_tree = 0;
			btrfs_put_delayed_ref(existing);
			delayed_refs->num_entries--;
			if (trans->delayed_ref_updates)
				trans->delayed_ref_updates--;
		}
	} else {
		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
			/* if we're adding refs, make sure all the
			 * details match up. The extent could
			 * have been totally freed and reallocated
			 * by a different owner before the delayed
			 * ref entries were removed.
			 */
			existing_ref->owner_objectid = ref->owner_objectid;
			existing_ref->generation = ref->generation;
			existing_ref->root = ref->root;
			existing->num_bytes = update->num_bytes;
		}
		/*
		 * the action on the existing ref matches
		 * the action on the ref we're trying to add.
		 * Bump the ref_mod by one so the backref that
		 * is eventually added/removed has the correct
		 * reference count
		 */
		existing->ref_mod += update->ref_mod;
	}
}

/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
			 struct btrfs_delayed_ref_node *update)
{
	struct btrfs_delayed_ref_head *existing_ref;
	struct btrfs_delayed_ref_head *ref;

	existing_ref = btrfs_delayed_node_to_head(existing);
	ref = btrfs_delayed_node_to_head(update);

	if (ref->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing_ref->must_insert_reserved = ref->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;
	}

	/*
	 * update the reference mod on the head to reflect this new operation
	 */
	existing->ref_mod += update->ref_mod;
}

/*
 * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count in the head node and properly dealing
 * with updating existing nodes as new modifications are queued.
 */
static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			  struct btrfs_delayed_ref_node *ref,
			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			  u64 ref_generation, u64 owner_objectid, int action,
			  int pin)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_ref *full_ref;
	struct btrfs_delayed_ref_head *head_ref = NULL;
	struct btrfs_delayed_ref_root *delayed_refs;
	int count_mod = 1;
	int must_insert_reserved = 0;

	/*
	 * the head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (parent == (u64)-1) {
		if (action == BTRFS_DROP_DELAYED_REF)
			count_mod = -1;
		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
			count_mod = 0;
	}

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
	 * the reserved accounting when the extent is finally added, or
	 * if a later modification deletes the delayed ref without ever
	 * inserting the extent into the extent allocation tree.
	 * ref->must_insert_reserved is the flag used to record
	 * that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT) {
		must_insert_reserved = 1;
		action = BTRFS_ADD_DELAYED_REF;
	} else {
		must_insert_reserved = 0;
	}

	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->parent = parent;
	ref->ref_mod = count_mod;
	ref->in_tree = 1;
	ref->num_bytes = num_bytes;

	if (btrfs_delayed_ref_is_head(ref)) {
		head_ref = btrfs_delayed_node_to_head(ref);
		head_ref->must_insert_reserved = must_insert_reserved;
		INIT_LIST_HEAD(&head_ref->cluster);
		mutex_init(&head_ref->mutex);
	} else {
		full_ref = btrfs_delayed_node_to_ref(ref);
		full_ref->root = ref_root;
		full_ref->generation = ref_generation;
		full_ref->owner_objectid = owner_objectid;
		full_ref->pin = pin;
		full_ref->action = action;
	}

	existing = tree_insert(&delayed_refs->root, bytenr,
			       parent, &ref->rb_node);

	if (existing) {
		if (btrfs_delayed_ref_is_head(ref))
			update_existing_head_ref(existing, ref);
		else
			update_existing_ref(trans, delayed_refs, existing, ref);

		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kfree(ref);
	} else {
		if (btrfs_delayed_ref_is_head(ref)) {
			delayed_refs->num_heads++;
			delayed_refs->num_heads_ready++;
		}
		delayed_refs->num_entries++;
		trans->delayed_ref_updates++;
	}
	return 0;
}

/*
 * add a delayed ref to the tree. This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
			  u64 ref_generation, u64 owner_objectid, int action,
			  int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet.
	 * It will get updated later via an add/drop pair.
	 */
	if (parent == 0)
		parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0, action, pin);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, action, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}

/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	delayed_refs = &trans->transaction->delayed_refs;
	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
	if (ref)
		return btrfs_delayed_node_to_head(ref);
	return NULL;
}

/*
 * add a delayed ref to the tree. This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 *
 * The main point of this call is to add and remove a backreference in a single
 * shot, taking the lock only once, and only searching for the head node once.
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, u64 orig_parent,
			     u64 parent, u64 orig_ref_root, u64 ref_root,
			     u64 orig_ref_generation, u64 ref_generation,
			     u64 owner_objectid, int pin)
{
	struct btrfs_delayed_ref *ref;
	struct btrfs_delayed_ref *old_ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	ref = kmalloc(sizeof(*ref), GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
	if (!old_ref) {
		kfree(ref);
		return -ENOMEM;
	}

	/*
	 * the parent = 0 case comes from cases where we don't actually
	 * know the parent yet. It will get updated later via an add/drop
	 * pair.
	 */
	if (parent == 0)
		parent = bytenr;
	if (orig_parent == 0)
		orig_parent = bytenr;

	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
	if (!head_ref) {
		kfree(ref);
		kfree(old_ref);
		return -ENOMEM;
	}
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
				      (u64)-1, 0, 0, 0,
				      BTRFS_UPDATE_DELAYED_HEAD, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
				      parent, ref_root, ref_generation,
				      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);

	ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
				      orig_parent, orig_ref_root,
				      orig_ref_generation, owner_objectid,
				      BTRFS_DROP_DELAYED_REF, pin);
	BUG_ON(ret);
	spin_unlock(&delayed_refs->lock);
	return 0;
}
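
/*
 * Example usage, as a minimal sketch: a caller that has just allocated an
 * extent and wants the backref recorded lazily would queue a delayed ref
 * along these lines. The bytenr/num_bytes/parent/owner_objectid values here
 * are placeholders; the real call sites live in the extent allocation code.
 *
 *	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
 *				    root->root_key.objectid, trans->transid,
 *				    owner_objectid, BTRFS_ADD_DELAYED_EXTENT, 0);
 *	if (ret)
 *		return ret;
 *
 * This inserts the head node and the ref into the rbtree under the
 * delayed_refs spinlock; the extent allocation tree itself is only updated
 * later, when the transaction code walks the queued delayed refs before
 * commit.
 */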