/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"

/*
 * Delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 *
 * Right now this code is only used for reference counted trees, but
 * the long term goal is to get rid of the similar code for delayed
 * extent tree modifications.
 */

/*
 * entries in the rb tree are ordered by the byte number of the extent
 * and by the byte number of the parent block.
 */
static int comp_entry(struct btrfs_delayed_ref_node *ref,
                      u64 bytenr, u64 parent)
{
        if (bytenr < ref->bytenr)
                return -1;
        if (bytenr > ref->bytenr)
                return 1;
        if (parent < ref->parent)
                return -1;
        if (parent > ref->parent)
                return 1;
        return 0;
}

/*
 * insert a new ref into the rbtree.  This returns the existing ref
 * for the same (bytenr, parent) tuple if there is one, or NULL if the
 * new node was properly inserted.
 */
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
                                                  u64 bytenr, u64 parent,
                                                  struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent_node = NULL;
        struct btrfs_delayed_ref_node *entry;
        int cmp;

        while (*p) {
                parent_node = *p;
                entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
                                 rb_node);

                cmp = comp_entry(entry, bytenr, parent);
                if (cmp < 0)
                        p = &(*p)->rb_left;
                else if (cmp > 0)
                        p = &(*p)->rb_right;
                else
                        return entry;
        }

        entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
        rb_link_node(node, parent_node, p);
        rb_insert_color(node, root);
        return NULL;
}
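
/*
 * Illustrative sketch (not from the original file): a typical caller
 * allocates a node, attempts the insert, and merges into the existing
 * entry when tree_insert() reports a collision.  __btrfs_add_delayed_ref()
 * below follows exactly this pattern:
 *
 *	existing = tree_insert(&delayed_refs->root, bytenr, parent,
 *			       &ref->rb_node);
 *	if (existing) {
 *		update_existing_ref(trans, delayed_refs, existing, ref);
 *		kfree(ref);	// merged, the new node is unused
 *	} else {
 *		delayed_refs->num_entries++;
 *	}
 */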

/*
 * find an entry based on (bytenr, parent).  This returns the delayed
 * ref if it was able to find one, or NULL if nothing was in that spot.
 * If last is non-NULL, it is set to the last entry examined during the
 * search, which callers use as a starting point when there is no exact
 * match.
 */
static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
                                        u64 bytenr, u64 parent,
                                        struct btrfs_delayed_ref_node **last)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
        int cmp;

        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
                if (last)
                        *last = entry;

                cmp = comp_entry(entry, bytenr, parent);
                if (cmp < 0)
                        n = n->rb_left;
                else if (cmp > 0)
                        n = n->rb_right;
                else
                        return entry;
        }
        return NULL;
}

/*
 * lock the mutex on a delayed ref head without deadlocking against
 * delayed_refs->lock.  If the trylock fails we pin the head, drop the
 * spinlock and block on the mutex.  Returns -EAGAIN if the head was run
 * (removed from the tree) while we waited, in which case the caller
 * must retry.
 */
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head)
{
        struct btrfs_delayed_ref_root *delayed_refs;

        delayed_refs = &trans->transaction->delayed_refs;
        assert_spin_locked(&delayed_refs->lock);
        if (mutex_trylock(&head->mutex))
                return 0;

        atomic_inc(&head->node.refs);
        spin_unlock(&delayed_refs->lock);

        mutex_lock(&head->mutex);
        spin_lock(&delayed_refs->lock);
        if (!head->node.in_tree) {
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref(&head->node);
                return -EAGAIN;
        }
        btrfs_put_delayed_ref(&head->node);
        return 0;
}

/*
 * gather a cluster of at most 32 unprocessed delayed ref heads into
 * 'cluster', starting with the first head whose bytenr is >= start.
 * Returns 0 if at least one head was added to the cluster, and 1 if
 * no unclustered heads remain in the tree.
 */
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 start)
{
        int count = 0;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct rb_node *node;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *head;

        delayed_refs = &trans->transaction->delayed_refs;
        if (start == 0) {
                node = rb_first(&delayed_refs->root);
        } else {
                ref = NULL;
                tree_search(&delayed_refs->root, start, (u64)-1, &ref);
                if (ref) {
                        struct btrfs_delayed_ref_node *tmp;

                        /*
                         * the search may have landed past start; walk
                         * back to the first node whose bytenr is still
                         * >= start
                         */
                        node = rb_prev(&ref->rb_node);
                        while (node) {
                                tmp = rb_entry(node,
                                               struct btrfs_delayed_ref_node,
                                               rb_node);
                                if (tmp->bytenr < start)
                                        break;
                                ref = tmp;
                                node = rb_prev(&ref->rb_node);
                        }
                        node = &ref->rb_node;
                } else
                        node = rb_first(&delayed_refs->root);
        }
again:
        while (node && count < 32) {
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                if (btrfs_delayed_ref_is_head(ref)) {
                        head = btrfs_delayed_node_to_head(ref);
                        if (list_empty(&head->cluster)) {
                                list_add_tail(&head->cluster, cluster);
                                delayed_refs->run_delayed_start =
                                        head->node.bytenr;
                                count++;

                                WARN_ON(delayed_refs->num_heads_ready == 0);
                                delayed_refs->num_heads_ready--;
                        } else if (count) {
                                /* the goal of the clustering is to find extents
                                 * that are likely to end up in the same extent
                                 * leaf on disk.  So, we don't want them spread
                                 * all over the tree.  Stop now if we've hit
                                 * a head that was already in use
                                 */
                                break;
                        }
                }
                node = rb_next(node);
        }
        if (count) {
                return 0;
        } else if (start) {
                /*
                 * we've gone to the end of the rbtree without finding any
                 * clusters.  start from the beginning and try again
                 */
                start = 0;
                node = rb_first(&delayed_refs->root);
                goto again;
        }
        return 1;
}
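
/*
 * Usage sketch for the cluster API (illustrative; the real driver lives
 * in the extent allocation code): take delayed_refs->lock, pull one
 * cluster at a time, then process each head with its mutex taken via
 * btrfs_delayed_ref_lock():
 *
 *	LIST_HEAD(cluster);
 *
 *	spin_lock(&delayed_refs->lock);
 *	while (!btrfs_find_ref_cluster(trans, &cluster,
 *				       delayed_refs->run_delayed_start)) {
 *		// lock and run each head on 'cluster', dropping and
 *		// re-taking delayed_refs->lock around the actual work
 *	}
 *	spin_unlock(&delayed_refs->lock);
 */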

/*
 * This checks to see if there are any delayed refs in the
 * rbtree for a given bytenr.  It returns one if it finds any
 * and zero otherwise.
 *
 * If it only finds a head node, it returns 0.
 *
 * The idea is to use this when deciding if you can safely delete an
 * extent from the extent allocation tree.  There may be a pending
 * ref in the rbtree that adds or removes references, so as long as this
 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
 * allocation tree.
 */
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
{
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct rb_node *prev_node;
        int ret = 0;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);

        ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                /*
                 * the head sorts last for a given bytenr, so any node
                 * just before it with the same bytenr is a real delayed
                 * ref for this extent
                 */
                prev_node = rb_prev(&ref->rb_node);
                if (!prev_node)
                        goto out;
                ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
                               rb_node);
                if (ref->bytenr == bytenr)
                        ret = 1;
        }
out:
        spin_unlock(&delayed_refs->lock);
        return ret;
}

/*
 * helper function to look up the reference count of an extent
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  This way you
 * can check to see what the reference count would be if all of the
 * delayed refs are processed.
 */
int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
                            u64 num_bytes, u32 *refs)
{
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_key key;
        u32 num_refs;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;
        delayed_refs = &trans->transaction->delayed_refs;
again:
        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret == 0) {
                leaf = path->nodes[0];
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_extent_item);
                num_refs = btrfs_extent_refs(leaf, ei);
        } else {
                num_refs = 0;
                ret = 0;
        }

        spin_lock(&delayed_refs->lock);
        ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                head = btrfs_delayed_node_to_head(ref);
                if (mutex_trylock(&head->mutex)) {
                        num_refs += ref->ref_mod;
                        mutex_unlock(&head->mutex);
                        *refs = num_refs;
                        goto out;
                }

                /*
                 * someone is running the delayed refs for this extent.
                 * Wait for them to finish by taking and dropping the
                 * head's mutex, then redo the search from scratch.
                 */
                atomic_inc(&ref->refs);
                spin_unlock(&delayed_refs->lock);

                btrfs_release_path(root->fs_info->extent_root, path);

                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref(ref);
                goto again;
        } else {
                *refs = num_refs;
        }
out:
        spin_unlock(&delayed_refs->lock);
out_free:
        btrfs_free_path(path);
        return ret;
}
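
/*
 * Worked example for the lookup above (illustrative numbers): if the
 * BTRFS_EXTENT_ITEM on disk records 3 references and the pending head
 * node carries ref_mod == -2 (two queued drops), *refs comes back as 1,
 * the count the extent will have once all delayed refs are run.
 */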

/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
 *
 * This may free existing if the update cancels out whatever
 * operation it was doing.
 */
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
                    struct btrfs_delayed_ref_root *delayed_refs,
                    struct btrfs_delayed_ref_node *existing,
                    struct btrfs_delayed_ref_node *update)
{
        struct btrfs_delayed_ref *existing_ref;
        struct btrfs_delayed_ref *ref;

        existing_ref = btrfs_delayed_node_to_ref(existing);
        ref = btrfs_delayed_node_to_ref(update);

        if (ref->pin)
                existing_ref->pin = 1;

        if (ref->action != existing_ref->action) {
                /*
                 * this is effectively undoing either an add or a
                 * drop.  We decrement the ref_mod, and if it goes
                 * down to zero we just delete the entry without
                 * ever changing the extent allocation tree.
                 */
                existing->ref_mod--;
                if (existing->ref_mod == 0) {
                        rb_erase(&existing->rb_node,
                                 &delayed_refs->root);
                        existing->in_tree = 0;
                        btrfs_put_delayed_ref(existing);
                        delayed_refs->num_entries--;
                        if (trans->delayed_ref_updates)
                                trans->delayed_ref_updates--;
                }
        } else {
                if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
                        /* if we're adding refs, make sure all the
                         * details match up.  The extent could
                         * have been totally freed and reallocated
                         * by a different owner before the delayed
                         * ref entries were removed.
                         */
                        existing_ref->owner_objectid = ref->owner_objectid;
                        existing_ref->generation = ref->generation;
                        existing_ref->root = ref->root;
                        existing->num_bytes = update->num_bytes;
                }
                /*
                 * the action on the existing ref matches
                 * the action on the ref we're trying to add.
                 * Bump the ref_mod by one so the backref that
                 * is eventually added/removed has the correct
                 * reference count
                 */
                existing->ref_mod += update->ref_mod;
        }
}

/*
 * helper function to update the accounting in the head ref.
 * existing and update must have the same bytenr
 */
static noinline void
update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
                         struct btrfs_delayed_ref_node *update)
{
        struct btrfs_delayed_ref_head *existing_ref;
        struct btrfs_delayed_ref_head *ref;

        existing_ref = btrfs_delayed_node_to_head(existing);
        ref = btrfs_delayed_node_to_head(update);

        if (ref->must_insert_reserved) {
                /* if the extent was freed and then
                 * reallocated before the delayed ref
                 * entries were processed, we can end up
                 * with an existing head ref without
                 * the must_insert_reserved flag set.
                 * Set it again here
                 */
                existing_ref->must_insert_reserved = ref->must_insert_reserved;

                /*
                 * update the num_bytes so we make sure the accounting
                 * is done correctly
                 */
                existing->num_bytes = update->num_bytes;
        }

        /*
         * update the reference mod on the head to reflect this new operation
         */
        existing->ref_mod += update->ref_mod;
}
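
/*
 * Worked example for update_existing_ref() (illustrative): queueing
 * BTRFS_ADD_DELAYED_REF and then BTRFS_DROP_DELAYED_REF for the same
 * (bytenr, parent) pair makes ref_mod go 1 -> 0, so the entry is erased
 * from the rbtree and the extent allocation tree is never touched for
 * that pair.  Two adds in a row instead leave one entry with
 * ref_mod == 2.
 */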

/*
 * helper function to actually insert a delayed ref into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count in the head node and properly dealing
 * with updating existing nodes as new modifications are queued.
 */
static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
                          struct btrfs_delayed_ref_node *ref,
                          u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
                          u64 ref_generation, u64 owner_objectid, int action,
                          int pin)
{
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_ref *full_ref;
        struct btrfs_delayed_ref_head *head_ref = NULL;
        struct btrfs_delayed_ref_root *delayed_refs;
        int count_mod = 1;
        int must_insert_reserved = 0;

        /*
         * the head node stores the sum of all the mods, so dropping a ref
         * should drop the sum in the head node by one.
         */
        if (parent == (u64)-1) {
                if (action == BTRFS_DROP_DELAYED_REF)
                        count_mod = -1;
                else if (action == BTRFS_UPDATE_DELAYED_HEAD)
                        count_mod = 0;
        }

        /*
         * BTRFS_ADD_DELAYED_EXTENT means that we need to update
         * the reserved accounting when the extent is finally added, or
         * if a later modification deletes the delayed ref without ever
         * inserting the extent into the extent allocation tree.
         * ref->must_insert_reserved is the flag used to record
         * that accounting mods are required.
         *
         * Once we record must_insert_reserved, switch the action to
         * BTRFS_ADD_DELAYED_REF because other special casing is not required.
         */
        if (action == BTRFS_ADD_DELAYED_EXTENT) {
                must_insert_reserved = 1;
                action = BTRFS_ADD_DELAYED_REF;
        } else {
                must_insert_reserved = 0;
        }

        delayed_refs = &trans->transaction->delayed_refs;

        /* first set the basic ref node struct up */
        atomic_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->parent = parent;
        ref->ref_mod = count_mod;
        ref->in_tree = 1;
        ref->num_bytes = num_bytes;

        if (btrfs_delayed_ref_is_head(ref)) {
                head_ref = btrfs_delayed_node_to_head(ref);
                head_ref->must_insert_reserved = must_insert_reserved;
                INIT_LIST_HEAD(&head_ref->cluster);
                mutex_init(&head_ref->mutex);
        } else {
                full_ref = btrfs_delayed_node_to_ref(ref);
                full_ref->root = ref_root;
                full_ref->generation = ref_generation;
                full_ref->owner_objectid = owner_objectid;
                full_ref->pin = pin;
                full_ref->action = action;
        }

        existing = tree_insert(&delayed_refs->root, bytenr,
                               parent, &ref->rb_node);

        if (existing) {
                if (btrfs_delayed_ref_is_head(ref))
                        update_existing_head_ref(existing, ref);
                else
                        update_existing_ref(trans, delayed_refs, existing, ref);

                /*
                 * we've updated the existing ref, free the newly
                 * allocated ref
                 */
                kfree(ref);
        } else {
                if (btrfs_delayed_ref_is_head(ref)) {
                        delayed_refs->num_heads++;
                        delayed_refs->num_heads_ready++;
                }
                delayed_refs->num_entries++;
                trans->delayed_ref_updates++;
        }
        return 0;
}
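
/*
 * Note on the encoding used above (a restatement of the invariants, not
 * new mechanism): head nodes are the entries inserted with
 * parent == (u64)-1, which is what btrfs_delayed_ref_is_head() keys off.
 * Since (u64)-1 sorts after any real parent bytenr, the head is always
 * the last node for a given extent in the rbtree, which is why
 * tree_search(root, bytenr, (u64)-1, NULL) finds it directly.
 */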

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
                          u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
                          u64 ref_generation, u64 owner_objectid, int action,
                          int pin)
{
        struct btrfs_delayed_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        int ret;

        ref = kmalloc(sizeof(*ref), GFP_NOFS);
        if (!ref)
                return -ENOMEM;

        /*
         * the parent = 0 case comes from cases where we don't actually
         * know the parent yet.  It will get updated later via an
         * add/drop pair.
         */
        if (parent == 0)
                parent = bytenr;

        head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
        if (!head_ref) {
                kfree(ref);
                return -ENOMEM;
        }
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);

        /*
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
        ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr,
                                      num_bytes, (u64)-1, 0, 0, 0,
                                      action, pin);
        BUG_ON(ret);

        ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
                                      parent, ref_root, ref_generation,
                                      owner_objectid, action, pin);
        BUG_ON(ret);
        spin_unlock(&delayed_refs->lock);
        return 0;
}

/*
 * this does a simple search for the head node for a given extent.
 * It must be called with the delayed ref spinlock held, and it returns
 * the head node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_root *delayed_refs;

        delayed_refs = &trans->transaction->delayed_refs;
        ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
}
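
/*
 * Usage sketch (illustrative): because the caller must already hold
 * delayed_refs->lock, a head that needs to outlive the lock is pinned
 * with a reference before the lock is dropped:
 *
 *	spin_lock(&delayed_refs->lock);
 *	head = btrfs_find_delayed_ref_head(trans, bytenr);
 *	if (head)
 *		atomic_inc(&head->node.refs);
 *	spin_unlock(&delayed_refs->lock);
 *	...
 *	btrfs_put_delayed_ref(&head->node);
 */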

/*
 * add a delayed ref to the tree.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 *
 * The main point of this call is to add and remove a backreference in a
 * single shot, taking the lock only once, and only searching for the head
 * node once.
 *
 * It is the same as doing a ref add and delete in two separate calls.
 */
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                             u64 bytenr, u64 num_bytes, u64 orig_parent,
                             u64 parent, u64 orig_ref_root, u64 ref_root,
                             u64 orig_ref_generation, u64 ref_generation,
                             u64 owner_objectid, int pin)
{
        struct btrfs_delayed_ref *ref;
        struct btrfs_delayed_ref *old_ref;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
        int ret;

        ref = kmalloc(sizeof(*ref), GFP_NOFS);
        if (!ref)
                return -ENOMEM;

        old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
        if (!old_ref) {
                kfree(ref);
                return -ENOMEM;
        }

        /*
         * the parent = 0 case comes from cases where we don't actually
         * know the parent yet.  It will get updated later via an
         * add/drop pair.
         */
        if (parent == 0)
                parent = bytenr;
        if (orig_parent == 0)
                orig_parent = bytenr;

        head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
        if (!head_ref) {
                kfree(ref);
                kfree(old_ref);
                return -ENOMEM;
        }
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);

        /*
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
        ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr,
                                      num_bytes, (u64)-1, 0, 0, 0,
                                      BTRFS_UPDATE_DELAYED_HEAD, 0);
        BUG_ON(ret);

        ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
                                      parent, ref_root, ref_generation,
                                      owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
        BUG_ON(ret);

        ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
                                      orig_parent, orig_ref_root,
                                      orig_ref_generation, owner_objectid,
                                      BTRFS_DROP_DELAYED_REF, pin);
        BUG_ON(ret);
        spin_unlock(&delayed_refs->lock);
        return 0;
}
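
/*
 * Equivalence sketch for btrfs_update_delayed_ref() (illustrative): the
 * call behaves roughly like the two-step sequence below, except that it
 * takes delayed_refs->lock only once and marks the head with
 * BTRFS_UPDATE_DELAYED_HEAD, so the head's ref_mod nets out the same:
 *
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
 *			      ref_generation, owner_objectid,
 *			      BTRFS_ADD_DELAYED_REF, 0);
 *	btrfs_add_delayed_ref(trans, bytenr, num_bytes, orig_parent,
 *			      orig_ref_root, orig_ref_generation,
 *			      owner_objectid, BTRFS_DROP_DELAYED_REF, pin);
 */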