/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include "compat.h"
#include "hash.h"
#include "crc32c.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "ref-cache.h"

#define PENDING_EXTENT_INSERT 0
#define PENDING_EXTENT_DELETE 1
#define PENDING_BACKREF_UPDATE 2

struct pending_extent_op {
	int type;
	u64 bytenr;
	u64 num_bytes;
	u64 parent;
	u64 orig_parent;
	u64 generation;
	u64 orig_generation;
	int level;
	struct list_head list;
	int del;
};

static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root, u64 parent,
					 u64 root_objectid, u64 ref_generation,
					 u64 owner, struct btrfs_key *ins,
					 int ref_mod);
static int update_reserved_extents(struct btrfs_root *root,
				   u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc,
			      int mark_free);
static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					u64 bytenr, u64 num_bytes, u64 parent,
					u64 root_objectid, u64 ref_generation,
					u64 owner_objectid, int pin,
					int ref_to_drop);

static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0,
 * else it will return the block group that contains the given bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		atomic_inc(&ret->count);
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
static int add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size;
	int ret;

	mutex_lock(&info->pinned_mutex);
	while (start < end) {
		ret = find_first_extent_bit(&info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		if (extent_start == start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret);
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret);
	}
	mutex_unlock(&info->pinned_mutex);

	return 0;
}

static int remove_sb_from_cache(struct btrfs_root *root,
				struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr, 0,
				       &logical, &nr, &stripe_len);
		BUG_ON(ret);
		while (nr--) {
			btrfs_remove_free_space(cache, logical[nr],
						stripe_len);
		}
		kfree(logical);
	}
	return 0;
}
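
/*
 * build the in-memory free space cache for a block group by walking the
 * extent tree: every gap between extent items inside the block group is
 * recorded as free space, and the super block mirrors are removed from
 * the cache afterwards.
 */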
static int cache_block_group(struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group)
{
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int slot;
	u64 last;

	if (!block_group)
		return 0;

	root = root->fs_info->extent_root;

	if (block_group->cached)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	/*
	 * we get into deadlocks with paths held by callers of this function.
	 * since the alloc_mutex is protecting things right now, just
	 * skip the locking here
	 */
	path->skip_locking = 1;
	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
	key.objectid = last;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			if (ret == 0)
				continue;
			else
				break;
		}
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid < block_group->key.objectid)
			goto next;

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
			add_new_free_space(block_group, root->fs_info, last,
					   key.objectid);

			last = key.objectid + key.offset;
		}
next:
		path->slots[0]++;
	}

	add_new_free_space(block_group, root->fs_info, last,
			   block_group->key.objectid +
			   block_group->key.offset);

	remove_sb_from_cache(root, block_group);
	block_group->cached = 1;
	ret = 0;
err:
	btrfs_free_path(path);
	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static inline void put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count))
		kfree(cache);
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags == flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

u64 btrfs_find_block_group(struct btrfs_root *root,
			   u64 search_start, u64 search_hint, int owner)
{
	struct btrfs_block_group_cache *cache;
	u64 used;
	u64 last = max(search_hint, search_start);
	u64 group_start = 0;
	int full_search = 0;
	int factor = 9;
	int wrapped = 0;
again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		if (!cache)
			break;

		spin_lock(&cache->lock);
		last = cache->key.objectid + cache->key.offset;
		used = btrfs_block_group_used(&cache->item);

		if ((full_search || !cache->ro) &&
		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
			if (used + cache->pinned + cache->reserved <
			    div_factor(cache->key.offset, factor)) {
				group_start = cache->key.objectid;
				spin_unlock(&cache->lock);
				put_block_group(cache);
				goto found;
			}
		}
		spin_unlock(&cache->lock);
		put_block_group(cache);
		cond_resched();
	}
	if (!wrapped) {
		last = search_start;
		wrapped = 1;
		goto again;
	}
	if (!full_search && factor < 10) {
		last = search_start;
		full_search = 1;
		factor = 10;
		goto again;
	}
found:
	return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	key.objectid = start;
	key.offset = len;
	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure has fields for:
 *
 * - Objectid of the subvolume root
 * - Generation number of the tree holding the reference
 * - objectid of the file holding the reference
 * - number of references held by the parent node (always 1 for tree blocks)
 *
 * Btree leaf may hold multiple references to a file extent. In most cases,
 * these references are from the same file and the corresponding offsets
 * inside the file are close together.
 *
 * When a file extent is allocated the fields are filled in:
 *     (root_key.objectid, trans->transid, inode objectid, 1)
 *
 * When a leaf is cow'd new references are added for every file extent found
 * in the leaf.  It looks similar to the create case, but trans->transid will
 * be different when the block is cow'd.
 *
 *     (root_key.objectid, trans->transid, inode objectid,
 *      number of references in the leaf)
 *
 * When a file extent is removed either during snapshot deletion or
 * file truncation, we find the corresponding back reference and check
 * the following fields:
 *
 *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
 *      inode objectid)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 * - Different generations of the same subvolume
 *
 * When a tree block is created, back references are inserted:
 *
 *     (root->root_key.objectid, trans->transid, level, 1)
 *
 * When a tree block is cow'd, new back references are added for all the
 * blocks it points to.  If the tree block isn't in a reference counted root,
 * the old back references are removed.  These new back references are of
 * the form (trans->transid will have increased since creation):
 *
 *     (root->root_key.objectid, trans->transid, level, 1)
 *
 * When a backref is being deleted, the following fields are checked:
 *
 *     if backref was for a tree root:
 *         (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
 *     else
 *         (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
 *
 * Back Reference Key composition:
 *
 * The key objectid corresponds to the first byte in the extent, the key
 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
 * byte of the parent extent.  If an extent is a tree root, the key offset
 * is set to the key objectid.
 */
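
/*
 * find the back reference item for an extent.  The caller passes the start
 * of the extent, the parent block, and the expected root, generation and
 * owner; the path is positioned at the matching BTRFS_EXTENT_REF_KEY item,
 * or -ENOENT is returned when it doesn't exist.  del is passed through to
 * btrfs_search_slot so the item can be deleted from the same path.
 */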
static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 ref_root, u64 ref_generation,
					  u64 owner_objectid, int del)
{
	struct btrfs_key key;
	struct btrfs_extent_ref *ref;
	struct extent_buffer *leaf;
	u64 ref_objectid;
	int ret;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_REF_KEY;
	key.offset = parent;

	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
	ref_objectid = btrfs_ref_objectid(leaf, ref);
	if (btrfs_ref_root(leaf, ref) != ref_root ||
	    btrfs_ref_generation(leaf, ref) != ref_generation ||
	    (ref_objectid != owner_objectid &&
	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
		ret = -EIO;
		WARN_ON(1);
		goto out;
	}
	ret = 0;
out:
	return ret;
}

static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 ref_root, u64 ref_generation,
					  u64 owner_objectid,
					  int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_ref *ref;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_REF_KEY;
	key.offset = parent;

	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
	if (ret == 0) {
		leaf = path->nodes[0];
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_ref);
		btrfs_set_ref_root(leaf, ref, ref_root);
		btrfs_set_ref_generation(leaf, ref, ref_generation);
		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
	} else if (ret == -EEXIST) {
		u64 existing_owner;

		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
		leaf = path->nodes[0];
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_ref);
		if (btrfs_ref_root(leaf, ref) != ref_root ||
		    btrfs_ref_generation(leaf, ref) != ref_generation) {
			ret = -EIO;
			WARN_ON(1);
			goto out;
		}

		num_refs = btrfs_ref_num_refs(leaf, ref);
		BUG_ON(num_refs == 0);
		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);

		existing_owner = btrfs_ref_objectid(leaf, ref);
		if (existing_owner != owner_objectid &&
		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
			btrfs_set_ref_objectid(leaf, ref,
					       BTRFS_MULTIPLE_OBJECTIDS);
		}
		ret = 0;
	} else {
		goto out;
	}
	btrfs_unlock_up_safe(path, 1);
	btrfs_mark_buffer_dirty(path->nodes[0]);
out:
	btrfs_release_path(root, path);
	return ret;
}

static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int refs_to_drop)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_ref *ref;
	u32 num_refs;
	int ret = 0;

	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
	num_refs = btrfs_ref_num_refs(leaf, ref);
	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;
	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		btrfs_set_ref_num_refs(leaf, ref, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_release_path(root, path);
	return ret;
}

#ifdef BIO_RW_DISCARD
static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
}
#endif
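
/*
 * tell the underlying devices that the blocks for this extent may be
 * discarded.  The logical range is mapped to its physical stripes and a
 * discard is issued per stripe; on kernels without BIO_RW_DISCARD this is
 * a no-op.
 */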
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
				u64 num_bytes)
{
#ifdef BIO_RW_DISCARD
	int ret;
	u64 map_length = num_bytes;
	struct btrfs_multi_bio *multi = NULL;

	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
			      bytenr, &map_length, &multi, 0);
	if (!ret) {
		struct btrfs_bio_stripe *stripe = multi->stripes;
		int i;

		if (map_length > num_bytes)
			map_length = num_bytes;

		for (i = 0; i < multi->num_stripes; i++, stripe++) {
			btrfs_issue_discard(stripe->dev->bdev,
					    stripe->physical,
					    map_length);
		}
		kfree(multi);
	}

	return ret;
#else
	return 0;
#endif
}

static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 bytenr,
				     u64 num_bytes,
				     u64 orig_parent, u64 parent,
				     u64 orig_root, u64 ref_root,
				     u64 orig_generation, u64 ref_generation,
				     u64 owner_objectid)
{
	int ret;
	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;

	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
				       orig_parent, parent, orig_root,
				       ref_root, orig_generation,
				       ref_generation, owner_objectid, pin);
	BUG_ON(ret);
	return ret;
}

int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr,
			    u64 num_bytes, u64 orig_parent, u64 parent,
			    u64 ref_root, u64 ref_generation,
			    u64 owner_objectid)
{
	int ret;
	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
		return 0;

	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
					orig_parent, parent, ref_root,
					ref_root, ref_generation,
					ref_generation, owner_objectid);
	return ret;
}

static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, u64 bytenr,
				  u64 num_bytes,
				  u64 orig_parent, u64 parent,
				  u64 orig_root, u64 ref_root,
				  u64 orig_generation, u64 ref_generation,
				  u64 owner_objectid)
{
	int ret;

	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
				    ref_generation, owner_objectid,
				    BTRFS_ADD_DELAYED_REF, 0);
	BUG_ON(ret);
	return ret;
}

static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 bytenr,
					u64 num_bytes, u64 parent, u64 ref_root,
					u64 ref_generation, u64 owner_objectid,
					int refs_to_add)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_extent_item *item;
	u32 refs;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;
	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	/* first find the extent item and update its reference count */
	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
				path, 0, 1);
	if (ret < 0) {
		btrfs_set_path_blocking(path);
		return ret;
	}

	if (ret > 0) {
		WARN_ON(1);
		btrfs_free_path(path);
		return -EIO;
	}
	l = path->nodes[0];

	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
	if (key.objectid != bytenr) {
		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
		       (unsigned long long)bytenr,
		       (unsigned long long)key.objectid);
		BUG();
	}
	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);

	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);

	refs = btrfs_extent_refs(l, item);
	btrfs_set_extent_refs(l, item, refs + refs_to_add);
	btrfs_unlock_up_safe(path, 1);

	btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_release_path(root->fs_info->extent_root, path);

	path->reada = 1;
	path->leave_spinning = 1;

	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent,
				    ref_root, ref_generation,
				    owner_objectid, refs_to_add);
	BUG_ON(ret);
	btrfs_free_path(path);
	return 0;
}

int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 ref_root, u64 ref_generation,
			 u64 owner_objectid)
{
	int ret;
	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
		return 0;

	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
				     0, ref_root, 0, ref_generation,
				     owner_objectid);
	return ret;
}

static int drop_delayed_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_delayed_ref_node *node)
{
	int ret = 0;
	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);

	BUG_ON(node->ref_mod == 0);
	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
				  node->parent, ref->root, ref->generation,
				  ref->owner_objectid, ref->pin, node->ref_mod);

	return ret;
}

/* helper function to actually process a single delayed ref entry */
static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_delayed_ref_node *node,
					int insert_reserved)
{
	int ret;
	struct btrfs_delayed_ref *ref;

	if (node->parent == (u64)-1) {
		struct btrfs_delayed_ref_head *head;
		/*
		 * we've hit the end of the chain and we were supposed
		 * to insert this extent into the tree.  But, it got
		 * deleted before we ever needed to insert it, so all
		 * we have to do is clean up the accounting
		 */
		if (insert_reserved) {
			update_reserved_extents(root, node->bytenr,
						node->num_bytes, 0);
		}
		head = btrfs_delayed_node_to_head(node);
		mutex_unlock(&head->mutex);
		return 0;
	}

	ref = btrfs_delayed_node_to_ref(node);
	if (ref->action == BTRFS_ADD_DELAYED_REF) {
		if (insert_reserved) {
			struct btrfs_key ins;

			ins.objectid = node->bytenr;
			ins.offset = node->num_bytes;
			ins.type = BTRFS_EXTENT_ITEM_KEY;

			/* record the full extent allocation */
			ret = __btrfs_alloc_reserved_extent(trans, root,
					node->parent, ref->root,
					ref->generation, ref->owner_objectid,
					&ins, node->ref_mod);
			update_reserved_extents(root, node->bytenr,
						node->num_bytes, 0);
		} else {
			/* just add one backref */
			ret = add_extent_ref(trans, root, node->bytenr,
				   node->num_bytes,
				   node->parent, ref->root, ref->generation,
				   ref->owner_objectid, node->ref_mod);
		}
		BUG_ON(ret);
	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
		WARN_ON(insert_reserved);
		ret = drop_delayed_ref(trans, root, node);
	}
	return 0;
}

static noinline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref;
	int action = BTRFS_ADD_DELAYED_REF;
again:
	/*
	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * this prevents ref count from going down to zero when
	 * there still are pending delayed refs.
	 */
	node = rb_prev(&head->node.rb_node);
	while (1) {
		if (!node)
			break;
		ref = rb_entry(node, struct btrfs_delayed_ref_node,
			       rb_node);
		if (ref->bytenr != head->node.bytenr)
			break;
		if (btrfs_delayed_node_to_ref(ref)->action == action)
			return ref;
		node = rb_prev(node);
	}
	if (action == BTRFS_ADD_DELAYED_REF) {
		action = BTRFS_DROP_DELAYED_REF;
		goto again;
	}
	return NULL;
}
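
/*
 * run the delayed refs in the cluster list built by btrfs_find_ref_cluster.
 * Each head ref is locked, the adds for a head are processed before the
 * drops, and the head itself is run last so the reserved extent accounting
 * can be fixed up.  Returns the number of entries processed.
 */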
static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct list_head *cluster)
{
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	int ret;
	int count = 0;
	int must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	while (1) {
		if (!locked_ref) {
			/* pick a new head ref from the cluster list */
			if (list_empty(cluster))
				break;

			locked_ref = list_entry(cluster->next,
				     struct btrfs_delayed_ref_head, cluster);

			/* grab the lock that says we are going to process
			 * all the refs for this head */
			ret = btrfs_delayed_ref_lock(trans, locked_ref);

			/*
			 * we may have dropped the spin lock to get the head
			 * mutex lock, and that might have given someone else
			 * time to free the head.  If that's true, it has been
			 * removed from our list and we can move on.
			 */
			if (ret == -EAGAIN) {
				locked_ref = NULL;
				count++;
				continue;
			}
		}

		/*
		 * record the must insert reserved flag before we
		 * drop the spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		/*
		 * locked_ref is the head node, so we have to go one
		 * node back for any delayed ref updates
		 */
		ref = select_delayed_ref(locked_ref);
		if (!ref) {
			/* All delayed refs have been processed, go ahead
			 * and send the head node to run_one_delayed_ref,
			 * so that any accounting fixes can happen
			 */
			ref = &locked_ref->node;
			list_del_init(&locked_ref->cluster);
			locked_ref = NULL;
		}

		ref->in_tree = 0;
		rb_erase(&ref->rb_node, &delayed_refs->root);
		delayed_refs->num_entries--;
		spin_unlock(&delayed_refs->lock);

		ret = run_one_delayed_ref(trans, root, ref,
					  must_insert_reserved);
		BUG_ON(ret);
		btrfs_put_delayed_ref(ref);

		count++;
		cond_resched();
		spin_lock(&delayed_refs->lock);
	}
	return count;
}

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long count)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct list_head cluster;
	int ret;
	int run_all = count == (unsigned long)-1;
	int run_most = 0;

	if (root == root->fs_info->extent_root)
		root = root->fs_info->tree_root;

	delayed_refs = &trans->transaction->delayed_refs;
	INIT_LIST_HEAD(&cluster);
again:
	spin_lock(&delayed_refs->lock);
	if (count == 0) {
		count = delayed_refs->num_entries * 2;
		run_most = 1;
	}
	while (1) {
		if (!(run_all || run_most) &&
		    delayed_refs->num_heads_ready < 64)
			break;

		/*
		 * go find something we can process in the rbtree.  We start at
		 * the beginning of the tree, and then build a cluster
		 * of refs to process starting at the first one we are able to
		 * lock
		 */
		ret = btrfs_find_ref_cluster(trans, &cluster,
					     delayed_refs->run_delayed_start);
		if (ret)
			break;

		ret = run_clustered_refs(trans, root, &cluster);
		BUG_ON(ret < 0);

		count -= min_t(unsigned long, ret, count);

		if (count == 0)
			break;
	}

	if (run_all) {
		node = rb_first(&delayed_refs->root);
		if (!node)
			goto out;
		count = (unsigned long)-1;

		while (node) {
			ref = rb_entry(node, struct btrfs_delayed_ref_node,
				       rb_node);
			if (btrfs_delayed_ref_is_head(ref)) {
				struct btrfs_delayed_ref_head *head;

				head = btrfs_delayed_node_to_head(ref);
				atomic_inc(&ref->refs);

				spin_unlock(&delayed_refs->lock);
				mutex_lock(&head->mutex);
				mutex_unlock(&head->mutex);

				btrfs_put_delayed_ref(ref);
				cond_resched();
				goto again;
			}
			node = rb_next(node);
		}
		spin_unlock(&delayed_refs->lock);
		schedule_timeout(1);
		goto again;
	}
out:
	spin_unlock(&delayed_refs->lock);
	return 0;
}
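
/*
 * check whether the extent at bytenr has back references belonging to
 * anything other than the given root and inode objectid, or references
 * older than the root's last snapshot.  Returns 1 when such a reference
 * exists (the extent may be shared), 0 when it does not, and a negative
 * errno on failure.
 */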
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, u64 objectid, u64 bytenr)
{
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_ref *ref_item;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 ref_root;
	u64 last_snapshot;
	u32 nritems;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	path = btrfs_alloc_path();
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

	if (found_key.objectid != bytenr ||
	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
	while (1) {
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret == 0)
				continue;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != bytenr)
			break;

		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
			path->slots[0]++;
			continue;
		}

		ref_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_extent_ref);
		ref_root = btrfs_ref_root(leaf, ref_item);
		if ((ref_root != root->root_key.objectid &&
		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
		    objectid != btrfs_ref_objectid(leaf, ref_item)) {
			ret = 1;
			goto out;
		}
		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
			ret = 1;
			goto out;
		}

		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
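
/*
 * build a leaf ref cache entry for a cow'd leaf: every regular (non-inline)
 * file extent in the leaf is recorded in a btrfs_leaf_ref which is added to
 * the root's ref cache.  Only roots with ref_cows set keep a ref cache.
 */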
int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct extent_buffer *buf, u32 nr_extents)
{
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 root_gen;
	u32 nritems;
	int i;
	int level;
	int ret = 0;
	int shared = 0;

	if (!root->ref_cows)
		return 0;

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		shared = 0;
		root_gen = root->root_key.offset;
	} else {
		shared = 1;
		root_gen = trans->transid - 1;
	}

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);

	if (level == 0) {
		struct btrfs_leaf_ref *ref;
		struct btrfs_extent_info *info;

		ref = btrfs_alloc_leaf_ref(root, nr_extents);
		if (!ref) {
			ret = -ENOMEM;
			goto out;
		}

		ref->root_gen = root_gen;
		ref->bytenr = buf->start;
		ref->owner = btrfs_header_owner(buf);
		ref->generation = btrfs_header_generation(buf);
		ref->nritems = nr_extents;
		info = ref->extents;

		for (i = 0; nr_extents > 0 && i < nritems; i++) {
			u64 disk_bytenr;
			btrfs_item_key_to_cpu(buf, &key, i);
			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (disk_bytenr == 0)
				continue;

			info->bytenr = disk_bytenr;
			info->num_bytes =
				btrfs_file_extent_disk_num_bytes(buf, fi);
			info->objectid = key.objectid;
			info->offset = key.offset;
			info++;
		}

		ret = btrfs_add_leaf_ref(root, ref, shared);
		if (ret == -EEXIST && shared) {
			struct btrfs_leaf_ref *old;
			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
			BUG_ON(!old);
			btrfs_remove_leaf_ref(root, old);
			btrfs_free_leaf_ref(root, old);
			ret = btrfs_add_leaf_ref(root, ref, shared);
		}
		WARN_ON(ret);
		btrfs_free_leaf_ref(root, ref);
	}
out:
	return ret;
}

/* when a block goes through cow, we update the reference counts of
 * everything that block points to.  The internal pointers of the block
 * can be in just about any order, and it is likely to have clusters of
 * things that are close together and clusters of things that are not.
 *
 * To help reduce the seeks that come with updating all of these reference
 * counts, sort them by byte number before actual updates are done.
 *
 * struct refsort is used to match byte number to slot in the btree block.
 * we sort based on the byte number and then use the slot to actually
 * find the item.
 *
 * struct refsort is smaller than struct btrfs_item and smaller than
 * struct btrfs_key_ptr.  Since we're currently limited to the page size
 * for a btree block, there's no way for a kmalloc of refsorts for a
 * single node to be bigger than a page.
 */
struct refsort {
	u64 bytenr;
	u32 slot;
};

/*
 * for passing into sort()
 */
static int refsort_cmp(const void *a_void, const void *b_void)
{
	const struct refsort *a = a_void;
	const struct refsort *b = b_void;

	if (a->bytenr < b->bytenr)
		return -1;
	if (a->bytenr > b->bytenr)
		return 1;
	return 0;
}
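
/*
 * update reference counts for everything the new buffer points to after a
 * cow.  Reference counted roots get new backrefs added; other roots have
 * their existing backrefs moved from the old buffer to the new one.  The
 * byte numbers are collected and sorted first so the extent tree updates
 * happen in bytenr order.  The number of references found is returned in
 * *nr_extents.
 */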
noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *orig_buf,
			   struct extent_buffer *buf, u32 *nr_extents)
{
	u64 bytenr;
	u64 ref_root;
	u64 orig_root;
	u64 ref_generation;
	u64 orig_generation;
	struct refsort *sorted;
	u32 nritems;
	u32 nr_file_extents = 0;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int i;
	int level;
	int ret = 0;
	int faili = 0;
	int refi = 0;
	int slot;
	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
			    u64, u64, u64, u64, u64, u64, u64, u64, u64);

	ref_root = btrfs_header_owner(buf);
	ref_generation = btrfs_header_generation(buf);
	orig_root = btrfs_header_owner(orig_buf);
	orig_generation = btrfs_header_generation(orig_buf);

	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
	BUG_ON(!sorted);

	if (root->ref_cows) {
		process_func = __btrfs_inc_extent_ref;
	} else {
		if (level == 0 &&
		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			goto out;
		if (level != 0 &&
		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
			goto out;
		process_func = __btrfs_update_extent_ref;
	}

	/*
	 * we make two passes through the items.  In the first pass we
	 * only record the byte number and slot.  Then we sort based on
	 * byte number and do the actual work based on the sorted results
	 */
	for (i = 0; i < nritems; i++) {
		cond_resched();
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			nr_file_extents++;
			sorted[refi].bytenr = bytenr;
			sorted[refi].slot = i;
			refi++;
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			sorted[refi].bytenr = bytenr;
			sorted[refi].slot = i;
			refi++;
		}
	}
	/*
	 * if refi == 0, we didn't actually put anything into the sorted
	 * array and we're done
	 */
	if (refi == 0)
		goto out;

	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);

	for (i = 0; i < refi; i++) {
		cond_resched();
		slot = sorted[i].slot;
		bytenr = sorted[i].bytenr;

		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, slot);
			fi = btrfs_item_ptr(buf, slot,
					    struct btrfs_file_extent_item);

			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			ret = process_func(trans, root, bytenr,
				   btrfs_file_extent_disk_num_bytes(buf, fi),
				   orig_buf->start, buf->start,
				   orig_root, ref_root,
				   orig_generation, ref_generation,
				   key.objectid);

			if (ret) {
				faili = slot;
				WARN_ON(1);
				goto fail;
			}
		} else {
			ret = process_func(trans, root, bytenr, buf->len,
					   orig_buf->start, buf->start,
					   orig_root, ref_root,
					   orig_generation, ref_generation,
					   level - 1);
			if (ret) {
				faili = slot;
				WARN_ON(1);
				goto fail;
			}
		}
	}
out:
	kfree(sorted);
	if (nr_extents) {
		if (level == 0)
			*nr_extents = nr_file_extents;
		else
			*nr_extents = nritems;
	}
	return 0;
fail:
	kfree(sorted);
	WARN_ON(1);
	return ret;
}

int btrfs_update_ref(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root, struct extent_buffer *orig_buf,
		     struct extent_buffer *buf, int start_slot, int nr)

{
	u64 bytenr;
	u64 ref_root;
	u64 orig_root;
	u64 ref_generation;
	u64 orig_generation;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int i;
	int ret;
	int slot;
	int level;

	BUG_ON(start_slot < 0);
	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));

	ref_root = btrfs_header_owner(buf);
	ref_generation = btrfs_header_generation(buf);
	orig_root = btrfs_header_owner(orig_buf);
	orig_generation = btrfs_header_generation(orig_buf);
	level = btrfs_header_level(buf);

	if (!root->ref_cows) {
		if (level == 0 &&
		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			return 0;
		if (level != 0 &&
		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
			return 0;
	}

	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
		cond_resched();
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, slot);
			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, slot,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;
			ret = __btrfs_update_extent_ref(trans, root, bytenr,
				    btrfs_file_extent_disk_num_bytes(buf, fi),
				    orig_buf->start, buf->start,
				    orig_root, ref_root, orig_generation,
				    ref_generation, key.objectid);
			if (ret)
				goto fail;
		} else {
			bytenr = btrfs_node_blockptr(buf, slot);
			ret = __btrfs_update_extent_ref(trans, root, bytenr,
					    buf->len, orig_buf->start,
					    buf->start, orig_root, ref_root,
					    orig_generation, ref_generation,
					    level - 1);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	WARN_ON(1);
	return -1;
}

static int write_one_cache_group(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_block_group_cache *cache)
{
	int ret;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	unsigned long bi;
	struct extent_buffer *leaf;

	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
	if (ret < 0)
		goto fail;
	BUG_ON(ret);

	leaf = path->nodes[0];
	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(extent_root, path);
fail:
	if (ret)
		return ret;
	return 0;

}

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache, *entry;
	struct rb_node *n;
	int err = 0;
	int werr = 0;
	struct btrfs_path *path;
	u64 last = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		cache = NULL;
		spin_lock(&root->fs_info->block_group_cache_lock);
		for (n = rb_first(&root->fs_info->block_group_cache_tree);
		     n; n = rb_next(n)) {
			entry = rb_entry(n, struct btrfs_block_group_cache,
					 cache_node);
			if (entry->dirty) {
				cache = entry;
				break;
			}
		}
		spin_unlock(&root->fs_info->block_group_cache_lock);

		if (!cache)
			break;

		cache->dirty = 0;
		last += cache->key.offset;

		err = write_one_cache_group(trans, root,
					    path, cache);
		/*
		 * if we fail to write the cache group, we want
		 * to keep it marked dirty in hopes that a later
		 * write will work
		 */
		if (err) {
			werr = err;
			continue;
		}
	}
	btrfs_free_path(path);
	return werr;
}

int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	int readonly = 0;

	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
		put_block_group(block_group);
	return readonly;
}
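
/*
 * update the space_info for the given allocation flags, creating it if it
 * doesn't exist yet.  total_bytes and bytes_used are added to the existing
 * counters and the full flag is cleared.
 */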
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;

	found = __find_space_info(info, flags);
	if (found) {
		spin_lock(&found->lock);
		found->total_bytes += total_bytes;
		found->bytes_used += bytes_used;
		found->full = 0;
		spin_unlock(&found->lock);
		*space_info = found;
		return 0;
	}
	found = kzalloc(sizeof(*found), GFP_NOFS);
	if (!found)
		return -ENOMEM;

	INIT_LIST_HEAD(&found->block_groups);
	init_rwsem(&found->groups_sem);
	spin_lock_init(&found->lock);
	found->flags = flags;
	found->total_bytes = total_bytes;
	found->bytes_used = bytes_used;
	found->bytes_pinned = 0;
	found->bytes_reserved = 0;
	found->bytes_readonly = 0;
	found->bytes_delalloc = 0;
	found->full = 0;
	found->force_alloc = 0;
	*space_info = found;
	list_add_rcu(&found->list, &info->space_info);
	return 0;
}

static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
				   BTRFS_BLOCK_GROUP_RAID1 |
				   BTRFS_BLOCK_GROUP_RAID10 |
				   BTRFS_BLOCK_GROUP_DUP);
	if (extra_flags) {
		if (flags & BTRFS_BLOCK_GROUP_DATA)
			fs_info->avail_data_alloc_bits |= extra_flags;
		if (flags & BTRFS_BLOCK_GROUP_METADATA)
			fs_info->avail_metadata_alloc_bits |= extra_flags;
		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			fs_info->avail_system_alloc_bits |= extra_flags;
	}
}

static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
{
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	if (!cache->ro) {
		cache->space_info->bytes_readonly += cache->key.offset -
					btrfs_block_group_used(&cache->item);
		cache->ro = 1;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);
}

u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
	u64 num_devices = root->fs_info->fs_devices->rw_devices;

	if (num_devices == 1)
		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
	if (num_devices < 4)
		flags &= ~BTRFS_BLOCK_GROUP_RAID10;

	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
		      BTRFS_BLOCK_GROUP_RAID10))) {
		flags &= ~BTRFS_BLOCK_GROUP_DUP;
	}

	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
	}

	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
	     (flags & BTRFS_BLOCK_GROUP_DUP)))
		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
	return flags;
}

static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
{
	struct btrfs_fs_info *info = root->fs_info;
	u64 alloc_profile;

	if (data) {
		alloc_profile = info->avail_data_alloc_bits &
				info->data_alloc_profile;
		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
	} else if (root == root->fs_info->chunk_root) {
		alloc_profile = info->avail_system_alloc_bits &
				info->system_alloc_profile;
		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
	} else {
		alloc_profile = info->avail_metadata_alloc_bits &
				info->metadata_alloc_profile;
		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
	}

	return btrfs_reduce_alloc_profile(root, data);
}

void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
{
	u64 alloc_target;

	alloc_target = btrfs_get_alloc_profile(root, 1);
	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
						       alloc_target);
}

/*
 * for now this just makes sure we have at least 5% of our metadata space free
 * for use.
 */
int btrfs_check_metadata_free_space(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_space_info *meta_sinfo;
	u64 alloc_target, thresh;
	int committed = 0, ret;

	/* get the space info for where the metadata will live */
	alloc_target = btrfs_get_alloc_profile(root, 0);
	meta_sinfo = __find_space_info(info, alloc_target);

again:
	spin_lock(&meta_sinfo->lock);
	if (!meta_sinfo->full)
		thresh = meta_sinfo->total_bytes * 80;
	else
		thresh = meta_sinfo->total_bytes * 95;

	do_div(thresh, 100);

	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
		struct btrfs_trans_handle *trans;
		if (!meta_sinfo->full) {
			meta_sinfo->force_alloc = 1;
			spin_unlock(&meta_sinfo->lock);

			trans = btrfs_start_transaction(root, 1);
			if (!trans)
				return -ENOMEM;

			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
					     2 * 1024 * 1024, alloc_target, 0);
			btrfs_end_transaction(trans, root);
			goto again;
		}
		spin_unlock(&meta_sinfo->lock);

		if (!committed) {
			committed = 1;
			trans = btrfs_join_transaction(root, 1);
			if (!trans)
				return -ENOMEM;
			ret = btrfs_commit_transaction(trans, root);
			if (ret)
				return ret;
			goto again;
		}
		return -ENOSPC;
	}
	spin_unlock(&meta_sinfo->lock);

	return 0;
}

/*
 * This will check the space that the inode allocates from to make sure we have
 * enough space for bytes.
 */
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
				u64 bytes)
{
	struct btrfs_space_info *data_sinfo;
	int ret = 0, committed = 0;

	/* make sure bytes are sectorsize aligned */
	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	data_sinfo = BTRFS_I(inode)->space_info;
again:
	/* make sure we have enough space to handle the data first */
	spin_lock(&data_sinfo->lock);
	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
	    data_sinfo->bytes_may_use < bytes) {
		struct btrfs_trans_handle *trans;

		/*
		 * if we don't have enough free bytes in this space then we
		 * need to alloc a new chunk.
		 */
		if (!data_sinfo->full) {
			u64 alloc_target;

			data_sinfo->force_alloc = 1;
			spin_unlock(&data_sinfo->lock);

			alloc_target = btrfs_get_alloc_profile(root, 1);
			trans = btrfs_start_transaction(root, 1);
			if (!trans)
				return -ENOMEM;

			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
					     bytes + 2 * 1024 * 1024,
					     alloc_target, 0);
			btrfs_end_transaction(trans, root);
			if (ret)
				return ret;
			goto again;
		}
		spin_unlock(&data_sinfo->lock);

		/* commit the current transaction and try again */
		if (!committed) {
			committed = 1;
			trans = btrfs_join_transaction(root, 1);
			if (!trans)
				return -ENOMEM;
			ret = btrfs_commit_transaction(trans, root);
			if (ret)
				return ret;
			goto again;
		}

		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
		       ", %llu bytes_used, %llu bytes_reserved, "
		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
		return -ENOSPC;
	}
	data_sinfo->bytes_may_use += bytes;
	BTRFS_I(inode)->reserved_bytes += bytes;
	spin_unlock(&data_sinfo->lock);

	return btrfs_check_metadata_free_space(root);
}

/*
 * if there was an error for whatever reason after calling
 * btrfs_check_data_free_space, call this so we can cleanup the counters.
 */
void btrfs_free_reserved_data_space(struct btrfs_root *root,
				    struct inode *inode, u64 bytes)
{
	struct btrfs_space_info *data_sinfo;

	/* make sure bytes are sectorsize aligned */
	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	data_sinfo = BTRFS_I(inode)->space_info;
	spin_lock(&data_sinfo->lock);
	data_sinfo->bytes_may_use -= bytes;
	BTRFS_I(inode)->reserved_bytes -= bytes;
	spin_unlock(&data_sinfo->lock);
}
This happens on a weird 1896 * writepage condition, but shouldn't hurt our accounting 1897 */ 1898 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { 1899 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; 1900 BTRFS_I(inode)->reserved_bytes = 0; 1901 } else { 1902 data_sinfo->bytes_may_use -= bytes; 1903 BTRFS_I(inode)->reserved_bytes -= bytes; 1904 } 1905 1906 spin_unlock(&data_sinfo->lock); 1907 } 1908 1909 /* called when we are clearing an delalloc extent from the inode's io_tree */ 1910 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 1911 u64 bytes) 1912 { 1913 struct btrfs_space_info *info; 1914 1915 info = BTRFS_I(inode)->space_info; 1916 1917 spin_lock(&info->lock); 1918 info->bytes_delalloc -= bytes; 1919 spin_unlock(&info->lock); 1920 } 1921 1922 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 1923 struct btrfs_root *extent_root, u64 alloc_bytes, 1924 u64 flags, int force) 1925 { 1926 struct btrfs_space_info *space_info; 1927 u64 thresh; 1928 int ret = 0; 1929 1930 mutex_lock(&extent_root->fs_info->chunk_mutex); 1931 1932 flags = btrfs_reduce_alloc_profile(extent_root, flags); 1933 1934 space_info = __find_space_info(extent_root->fs_info, flags); 1935 if (!space_info) { 1936 ret = update_space_info(extent_root->fs_info, flags, 1937 0, 0, &space_info); 1938 BUG_ON(ret); 1939 } 1940 BUG_ON(!space_info); 1941 1942 spin_lock(&space_info->lock); 1943 if (space_info->force_alloc) { 1944 force = 1; 1945 space_info->force_alloc = 0; 1946 } 1947 if (space_info->full) { 1948 spin_unlock(&space_info->lock); 1949 goto out; 1950 } 1951 1952 thresh = space_info->total_bytes - space_info->bytes_readonly; 1953 thresh = div_factor(thresh, 6); 1954 if (!force && 1955 (space_info->bytes_used + space_info->bytes_pinned + 1956 space_info->bytes_reserved + alloc_bytes) < thresh) { 1957 spin_unlock(&space_info->lock); 1958 goto out; 1959 } 1960 spin_unlock(&space_info->lock); 1961 1962 ret = btrfs_alloc_chunk(trans, extent_root, flags); 1963 if (ret) 1964 space_info->full = 1; 1965 out: 1966 mutex_unlock(&extent_root->fs_info->chunk_mutex); 1967 return ret; 1968 } 1969 1970 static int update_block_group(struct btrfs_trans_handle *trans, 1971 struct btrfs_root *root, 1972 u64 bytenr, u64 num_bytes, int alloc, 1973 int mark_free) 1974 { 1975 struct btrfs_block_group_cache *cache; 1976 struct btrfs_fs_info *info = root->fs_info; 1977 u64 total = num_bytes; 1978 u64 old_val; 1979 u64 byte_in_group; 1980 1981 while (total) { 1982 cache = btrfs_lookup_block_group(info, bytenr); 1983 if (!cache) 1984 return -1; 1985 byte_in_group = bytenr - cache->key.objectid; 1986 WARN_ON(byte_in_group > cache->key.offset); 1987 1988 spin_lock(&cache->space_info->lock); 1989 spin_lock(&cache->lock); 1990 cache->dirty = 1; 1991 old_val = btrfs_block_group_used(&cache->item); 1992 num_bytes = min(total, cache->key.offset - byte_in_group); 1993 if (alloc) { 1994 old_val += num_bytes; 1995 cache->space_info->bytes_used += num_bytes; 1996 if (cache->ro) 1997 cache->space_info->bytes_readonly -= num_bytes; 1998 btrfs_set_block_group_used(&cache->item, old_val); 1999 spin_unlock(&cache->lock); 2000 spin_unlock(&cache->space_info->lock); 2001 } else { 2002 old_val -= num_bytes; 2003 cache->space_info->bytes_used -= num_bytes; 2004 if (cache->ro) 2005 cache->space_info->bytes_readonly += num_bytes; 2006 btrfs_set_block_group_used(&cache->item, old_val); 2007 spin_unlock(&cache->lock); 2008 spin_unlock(&cache->space_info->lock); 2009 if (mark_free) { 2010 int ret; 2011 2012 ret 
= btrfs_discard_extent(root, bytenr, 2013 num_bytes); 2014 WARN_ON(ret); 2015 2016 ret = btrfs_add_free_space(cache, bytenr, 2017 num_bytes); 2018 WARN_ON(ret); 2019 } 2020 } 2021 put_block_group(cache); 2022 total -= num_bytes; 2023 bytenr += num_bytes; 2024 } 2025 return 0; 2026 } 2027 2028 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 2029 { 2030 struct btrfs_block_group_cache *cache; 2031 u64 bytenr; 2032 2033 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 2034 if (!cache) 2035 return 0; 2036 2037 bytenr = cache->key.objectid; 2038 put_block_group(cache); 2039 2040 return bytenr; 2041 } 2042 2043 int btrfs_update_pinned_extents(struct btrfs_root *root, 2044 u64 bytenr, u64 num, int pin) 2045 { 2046 u64 len; 2047 struct btrfs_block_group_cache *cache; 2048 struct btrfs_fs_info *fs_info = root->fs_info; 2049 2050 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); 2051 if (pin) { 2052 set_extent_dirty(&fs_info->pinned_extents, 2053 bytenr, bytenr + num - 1, GFP_NOFS); 2054 } else { 2055 clear_extent_dirty(&fs_info->pinned_extents, 2056 bytenr, bytenr + num - 1, GFP_NOFS); 2057 } 2058 mutex_unlock(&root->fs_info->pinned_mutex); 2059 2060 while (num > 0) { 2061 cache = btrfs_lookup_block_group(fs_info, bytenr); 2062 BUG_ON(!cache); 2063 len = min(num, cache->key.offset - 2064 (bytenr - cache->key.objectid)); 2065 if (pin) { 2066 spin_lock(&cache->space_info->lock); 2067 spin_lock(&cache->lock); 2068 cache->pinned += len; 2069 cache->space_info->bytes_pinned += len; 2070 spin_unlock(&cache->lock); 2071 spin_unlock(&cache->space_info->lock); 2072 fs_info->total_pinned += len; 2073 } else { 2074 spin_lock(&cache->space_info->lock); 2075 spin_lock(&cache->lock); 2076 cache->pinned -= len; 2077 cache->space_info->bytes_pinned -= len; 2078 spin_unlock(&cache->lock); 2079 spin_unlock(&cache->space_info->lock); 2080 fs_info->total_pinned -= len; 2081 if (cache->cached) 2082 btrfs_add_free_space(cache, bytenr, len); 2083 } 2084 put_block_group(cache); 2085 bytenr += len; 2086 num -= len; 2087 } 2088 return 0; 2089 } 2090 2091 static int update_reserved_extents(struct btrfs_root *root, 2092 u64 bytenr, u64 num, int reserve) 2093 { 2094 u64 len; 2095 struct btrfs_block_group_cache *cache; 2096 struct btrfs_fs_info *fs_info = root->fs_info; 2097 2098 while (num > 0) { 2099 cache = btrfs_lookup_block_group(fs_info, bytenr); 2100 BUG_ON(!cache); 2101 len = min(num, cache->key.offset - 2102 (bytenr - cache->key.objectid)); 2103 2104 spin_lock(&cache->space_info->lock); 2105 spin_lock(&cache->lock); 2106 if (reserve) { 2107 cache->reserved += len; 2108 cache->space_info->bytes_reserved += len; 2109 } else { 2110 cache->reserved -= len; 2111 cache->space_info->bytes_reserved -= len; 2112 } 2113 spin_unlock(&cache->lock); 2114 spin_unlock(&cache->space_info->lock); 2115 put_block_group(cache); 2116 bytenr += len; 2117 num -= len; 2118 } 2119 return 0; 2120 } 2121 2122 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) 2123 { 2124 u64 last = 0; 2125 u64 start; 2126 u64 end; 2127 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; 2128 int ret; 2129 2130 mutex_lock(&root->fs_info->pinned_mutex); 2131 while (1) { 2132 ret = find_first_extent_bit(pinned_extents, last, 2133 &start, &end, EXTENT_DIRTY); 2134 if (ret) 2135 break; 2136 set_extent_dirty(copy, start, end, GFP_NOFS); 2137 last = end + 1; 2138 } 2139 mutex_unlock(&root->fs_info->pinned_mutex); 2140 return 0; 2141 } 2142 2143 int 
btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2144 struct btrfs_root *root, 2145 struct extent_io_tree *unpin) 2146 { 2147 u64 start; 2148 u64 end; 2149 int ret; 2150 2151 while (1) { 2152 mutex_lock(&root->fs_info->pinned_mutex); 2153 ret = find_first_extent_bit(unpin, 0, &start, &end, 2154 EXTENT_DIRTY); 2155 if (ret) 2156 break; 2157 2158 ret = btrfs_discard_extent(root, start, end + 1 - start); 2159 2160 /* unlocks the pinned mutex */ 2161 btrfs_update_pinned_extents(root, start, end + 1 - start, 0); 2162 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2163 2164 cond_resched(); 2165 } 2166 mutex_unlock(&root->fs_info->pinned_mutex); 2167 return ret; 2168 } 2169 2170 static int pin_down_bytes(struct btrfs_trans_handle *trans, 2171 struct btrfs_root *root, 2172 struct btrfs_path *path, 2173 u64 bytenr, u64 num_bytes, int is_data, 2174 struct extent_buffer **must_clean) 2175 { 2176 int err = 0; 2177 struct extent_buffer *buf; 2178 2179 if (is_data) 2180 goto pinit; 2181 2182 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 2183 if (!buf) 2184 goto pinit; 2185 2186 /* we can reuse a block if it hasn't been written 2187 * and it is from this transaction. We can't 2188 * reuse anything from the tree log root because 2189 * it has tiny sub-transactions. 2190 */ 2191 if (btrfs_buffer_uptodate(buf, 0) && 2192 btrfs_try_tree_lock(buf)) { 2193 u64 header_owner = btrfs_header_owner(buf); 2194 u64 header_transid = btrfs_header_generation(buf); 2195 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 2196 header_owner != BTRFS_TREE_RELOC_OBJECTID && 2197 header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && 2198 header_transid == trans->transid && 2199 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 2200 *must_clean = buf; 2201 return 1; 2202 } 2203 btrfs_tree_unlock(buf); 2204 } 2205 free_extent_buffer(buf); 2206 pinit: 2207 btrfs_set_path_blocking(path); 2208 mutex_lock(&root->fs_info->pinned_mutex); 2209 /* unlocks the pinned mutex */ 2210 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2211 2212 BUG_ON(err < 0); 2213 return 0; 2214 } 2215 2216 /* 2217 * remove an extent from the root, returns 0 on success 2218 */ 2219 static int __free_extent(struct btrfs_trans_handle *trans, 2220 struct btrfs_root *root, 2221 u64 bytenr, u64 num_bytes, u64 parent, 2222 u64 root_objectid, u64 ref_generation, 2223 u64 owner_objectid, int pin, int mark_free, 2224 int refs_to_drop) 2225 { 2226 struct btrfs_path *path; 2227 struct btrfs_key key; 2228 struct btrfs_fs_info *info = root->fs_info; 2229 struct btrfs_root *extent_root = info->extent_root; 2230 struct extent_buffer *leaf; 2231 int ret; 2232 int extent_slot = 0; 2233 int found_extent = 0; 2234 int num_to_del = 1; 2235 struct btrfs_extent_item *ei; 2236 u32 refs; 2237 2238 key.objectid = bytenr; 2239 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 2240 key.offset = num_bytes; 2241 path = btrfs_alloc_path(); 2242 if (!path) 2243 return -ENOMEM; 2244 2245 path->reada = 1; 2246 path->leave_spinning = 1; 2247 ret = lookup_extent_backref(trans, extent_root, path, 2248 bytenr, parent, root_objectid, 2249 ref_generation, owner_objectid, 1); 2250 if (ret == 0) { 2251 struct btrfs_key found_key; 2252 extent_slot = path->slots[0]; 2253 while (extent_slot > 0) { 2254 extent_slot--; 2255 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2256 extent_slot); 2257 if (found_key.objectid != bytenr) 2258 break; 2259 if (found_key.type == BTRFS_EXTENT_ITEM_KEY && 2260 found_key.offset == num_bytes) { 2261 found_extent = 1; 2262 break; 2263 } 2264 if 
(path->slots[0] - extent_slot > 5) 2265 break; 2266 } 2267 if (!found_extent) { 2268 ret = remove_extent_backref(trans, extent_root, path, 2269 refs_to_drop); 2270 BUG_ON(ret); 2271 btrfs_release_path(extent_root, path); 2272 path->leave_spinning = 1; 2273 ret = btrfs_search_slot(trans, extent_root, 2274 &key, path, -1, 1); 2275 if (ret) { 2276 printk(KERN_ERR "umm, got %d back from search" 2277 ", was looking for %llu\n", ret, 2278 (unsigned long long)bytenr); 2279 btrfs_print_leaf(extent_root, path->nodes[0]); 2280 } 2281 BUG_ON(ret); 2282 extent_slot = path->slots[0]; 2283 } 2284 } else { 2285 btrfs_print_leaf(extent_root, path->nodes[0]); 2286 WARN_ON(1); 2287 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 2288 "parent %llu root %llu gen %llu owner %llu\n", 2289 (unsigned long long)bytenr, 2290 (unsigned long long)parent, 2291 (unsigned long long)root_objectid, 2292 (unsigned long long)ref_generation, 2293 (unsigned long long)owner_objectid); 2294 } 2295 2296 leaf = path->nodes[0]; 2297 ei = btrfs_item_ptr(leaf, extent_slot, 2298 struct btrfs_extent_item); 2299 refs = btrfs_extent_refs(leaf, ei); 2300 2301 /* 2302 * we're not allowed to delete the extent item if there 2303 * are other delayed ref updates pending 2304 */ 2305 2306 BUG_ON(refs < refs_to_drop); 2307 refs -= refs_to_drop; 2308 btrfs_set_extent_refs(leaf, ei, refs); 2309 btrfs_mark_buffer_dirty(leaf); 2310 2311 if (refs == 0 && found_extent && 2312 path->slots[0] == extent_slot + 1) { 2313 struct btrfs_extent_ref *ref; 2314 ref = btrfs_item_ptr(leaf, path->slots[0], 2315 struct btrfs_extent_ref); 2316 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); 2317 /* if the back ref and the extent are next to each other 2318 * they get deleted below in one shot 2319 */ 2320 path->slots[0] = extent_slot; 2321 num_to_del = 2; 2322 } else if (found_extent) { 2323 /* otherwise delete the extent back ref */ 2324 ret = remove_extent_backref(trans, extent_root, path, 2325 refs_to_drop); 2326 BUG_ON(ret); 2327 /* if refs are 0, we need to setup the path for deletion */ 2328 if (refs == 0) { 2329 btrfs_release_path(extent_root, path); 2330 path->leave_spinning = 1; 2331 ret = btrfs_search_slot(trans, extent_root, &key, path, 2332 -1, 1); 2333 BUG_ON(ret); 2334 } 2335 } 2336 2337 if (refs == 0) { 2338 u64 super_used; 2339 u64 root_used; 2340 struct extent_buffer *must_clean = NULL; 2341 2342 if (pin) { 2343 ret = pin_down_bytes(trans, root, path, 2344 bytenr, num_bytes, 2345 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 2346 &must_clean); 2347 if (ret > 0) 2348 mark_free = 1; 2349 BUG_ON(ret < 0); 2350 } 2351 2352 /* block accounting for super block */ 2353 spin_lock(&info->delalloc_lock); 2354 super_used = btrfs_super_bytes_used(&info->super_copy); 2355 btrfs_set_super_bytes_used(&info->super_copy, 2356 super_used - num_bytes); 2357 2358 /* block accounting for root item */ 2359 root_used = btrfs_root_used(&root->root_item); 2360 btrfs_set_root_used(&root->root_item, 2361 root_used - num_bytes); 2362 spin_unlock(&info->delalloc_lock); 2363 2364 /* 2365 * it is going to be very rare for someone to be waiting 2366 * on the block we're freeing. 
del_items might need to 2367 * schedule, so rather than get fancy, just force it 2368 * to blocking here 2369 */ 2370 if (must_clean) 2371 btrfs_set_lock_blocking(must_clean); 2372 2373 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 2374 num_to_del); 2375 BUG_ON(ret); 2376 btrfs_release_path(extent_root, path); 2377 2378 if (must_clean) { 2379 clean_tree_block(NULL, root, must_clean); 2380 btrfs_tree_unlock(must_clean); 2381 free_extent_buffer(must_clean); 2382 } 2383 2384 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 2385 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 2386 BUG_ON(ret); 2387 } else { 2388 invalidate_mapping_pages(info->btree_inode->i_mapping, 2389 bytenr >> PAGE_CACHE_SHIFT, 2390 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 2391 } 2392 2393 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 2394 mark_free); 2395 BUG_ON(ret); 2396 } 2397 btrfs_free_path(path); 2398 return ret; 2399 } 2400 2401 /* 2402 * remove an extent from the root, returns 0 on success 2403 */ 2404 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2405 struct btrfs_root *root, 2406 u64 bytenr, u64 num_bytes, u64 parent, 2407 u64 root_objectid, u64 ref_generation, 2408 u64 owner_objectid, int pin, 2409 int refs_to_drop) 2410 { 2411 WARN_ON(num_bytes < root->sectorsize); 2412 2413 /* 2414 * if metadata always pin 2415 * if data pin when any transaction has committed this 2416 */ 2417 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || 2418 ref_generation != trans->transid) 2419 pin = 1; 2420 2421 if (ref_generation != trans->transid) 2422 pin = 1; 2423 2424 return __free_extent(trans, root, bytenr, num_bytes, parent, 2425 root_objectid, ref_generation, 2426 owner_objectid, pin, pin == 0, refs_to_drop); 2427 } 2428 2429 /* 2430 * when we free an extent, it is possible (and likely) that we free the last 2431 * delayed ref for that extent as well. This searches the delayed ref tree for 2432 * a given extent, and if there are no other delayed refs to be processed, it 2433 * removes it from the tree. 2434 */ 2435 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 2436 struct btrfs_root *root, u64 bytenr) 2437 { 2438 struct btrfs_delayed_ref_head *head; 2439 struct btrfs_delayed_ref_root *delayed_refs; 2440 struct btrfs_delayed_ref_node *ref; 2441 struct rb_node *node; 2442 int ret; 2443 2444 delayed_refs = &trans->transaction->delayed_refs; 2445 spin_lock(&delayed_refs->lock); 2446 head = btrfs_find_delayed_ref_head(trans, bytenr); 2447 if (!head) 2448 goto out; 2449 2450 node = rb_prev(&head->node.rb_node); 2451 if (!node) 2452 goto out; 2453 2454 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2455 2456 /* there are still entries for this ref, we can't drop it */ 2457 if (ref->bytenr == bytenr) 2458 goto out; 2459 2460 /* 2461 * waiting for the lock here would deadlock. If someone else has it 2462 * locked they are already in the process of dropping it anyway 2463 */ 2464 if (!mutex_trylock(&head->mutex)) 2465 goto out; 2466 2467 /* 2468 * at this point we have a head with no other entries. Go 2469 * ahead and process it. 2470 */ 2471 head->node.in_tree = 0; 2472 rb_erase(&head->node.rb_node, &delayed_refs->root); 2473 2474 delayed_refs->num_entries--; 2475 2476 /* 2477 * we don't take a ref on the node because we're removing it from the 2478 * tree, so we just steal the ref the tree was holding. 
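* btrfs_put_delayed_ref() below then drops that stolen reference once run_one_delayed_ref() has processed the head.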
2479 */ 2480 delayed_refs->num_heads--; 2481 if (list_empty(&head->cluster)) 2482 delayed_refs->num_heads_ready--; 2483 2484 list_del_init(&head->cluster); 2485 spin_unlock(&delayed_refs->lock); 2486 2487 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 2488 &head->node, head->must_insert_reserved); 2489 BUG_ON(ret); 2490 btrfs_put_delayed_ref(&head->node); 2491 return 0; 2492 out: 2493 spin_unlock(&delayed_refs->lock); 2494 return 0; 2495 } 2496 2497 int btrfs_free_extent(struct btrfs_trans_handle *trans, 2498 struct btrfs_root *root, 2499 u64 bytenr, u64 num_bytes, u64 parent, 2500 u64 root_objectid, u64 ref_generation, 2501 u64 owner_objectid, int pin) 2502 { 2503 int ret; 2504 2505 /* 2506 * tree log blocks never actually go into the extent allocation 2507 * tree, just update pinning info and exit early. 2508 * 2509 * data extents referenced by the tree log do need to have 2510 * their reference counts bumped. 2511 */ 2512 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && 2513 owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 2514 mutex_lock(&root->fs_info->pinned_mutex); 2515 2516 /* unlocks the pinned mutex */ 2517 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 2518 update_reserved_extents(root, bytenr, num_bytes, 0); 2519 ret = 0; 2520 } else { 2521 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, 2522 root_objectid, ref_generation, 2523 owner_objectid, 2524 BTRFS_DROP_DELAYED_REF, 1); 2525 BUG_ON(ret); 2526 ret = check_ref_cleanup(trans, root, bytenr); 2527 BUG_ON(ret); 2528 } 2529 return ret; 2530 } 2531 2532 static u64 stripe_align(struct btrfs_root *root, u64 val) 2533 { 2534 u64 mask = ((u64)root->stripesize - 1); 2535 u64 ret = (val + mask) & ~mask; 2536 return ret; 2537 } 2538 2539 /* 2540 * walks the btree of allocated extents and finds a hole of a given size. 2541 * The key ins is changed to record the hole: 2542 * ins->objectid == block start 2543 * ins->flags = BTRFS_EXTENT_ITEM_KEY 2544 * ins->offset == number of blocks 2545 * Any available blocks before search_start are skipped. 
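* The search is bounded by search_end; hint_byte and the per-fs last_alloc hints bias where the search starts.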
2546 */ 2547 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 2548 struct btrfs_root *orig_root, 2549 u64 num_bytes, u64 empty_size, 2550 u64 search_start, u64 search_end, 2551 u64 hint_byte, struct btrfs_key *ins, 2552 u64 exclude_start, u64 exclude_nr, 2553 int data) 2554 { 2555 int ret = 0; 2556 struct btrfs_root *root = orig_root->fs_info->extent_root; 2557 u64 total_needed = num_bytes; 2558 u64 *last_ptr = NULL; 2559 u64 last_wanted = 0; 2560 struct btrfs_block_group_cache *block_group = NULL; 2561 int chunk_alloc_done = 0; 2562 int empty_cluster = 2 * 1024 * 1024; 2563 int allowed_chunk_alloc = 0; 2564 struct list_head *head = NULL, *cur = NULL; 2565 int loop = 0; 2566 int extra_loop = 0; 2567 struct btrfs_space_info *space_info; 2568 2569 WARN_ON(num_bytes < root->sectorsize); 2570 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2571 ins->objectid = 0; 2572 ins->offset = 0; 2573 2574 if (orig_root->ref_cows || empty_size) 2575 allowed_chunk_alloc = 1; 2576 2577 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2578 last_ptr = &root->fs_info->last_alloc; 2579 if (!btrfs_test_opt(root, SSD)) 2580 empty_cluster = 64 * 1024; 2581 } 2582 2583 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2584 last_ptr = &root->fs_info->last_data_alloc; 2585 2586 if (last_ptr) { 2587 if (*last_ptr) { 2588 hint_byte = *last_ptr; 2589 last_wanted = *last_ptr; 2590 } else 2591 empty_size += empty_cluster; 2592 } else { 2593 empty_cluster = 0; 2594 } 2595 search_start = max(search_start, first_logical_byte(root, 0)); 2596 search_start = max(search_start, hint_byte); 2597 2598 if (last_wanted && search_start != last_wanted) { 2599 last_wanted = 0; 2600 empty_size += empty_cluster; 2601 } 2602 2603 total_needed += empty_size; 2604 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2605 if (!block_group) 2606 block_group = btrfs_lookup_first_block_group(root->fs_info, 2607 search_start); 2608 space_info = __find_space_info(root->fs_info, data); 2609 2610 down_read(&space_info->groups_sem); 2611 while (1) { 2612 struct btrfs_free_space *free_space; 2613 /* 2614 * the only way this happens is if our hint points to a block 2615 * group that's not of the proper type; while looping this 2616 * should never happen 2617 */ 2618 if (empty_size) 2619 extra_loop = 1; 2620 2621 if (!block_group) 2622 goto new_group_no_lock; 2623 2624 if (unlikely(!block_group->cached)) { 2625 mutex_lock(&block_group->cache_mutex); 2626 ret = cache_block_group(root, block_group); 2627 mutex_unlock(&block_group->cache_mutex); 2628 if (ret) 2629 break; 2630 } 2631 2632 mutex_lock(&block_group->alloc_mutex); 2633 if (unlikely(!block_group_bits(block_group, data))) 2634 goto new_group; 2635 2636 if (unlikely(block_group->ro)) 2637 goto new_group; 2638 2639 free_space = btrfs_find_free_space(block_group, search_start, 2640 total_needed); 2641 if (free_space) { 2642 u64 start = block_group->key.objectid; 2643 u64 end = block_group->key.objectid + 2644 block_group->key.offset; 2645 2646 search_start = stripe_align(root, free_space->offset); 2647 2648 /* move on to the next group */ 2649 if (search_start + num_bytes >= search_end) 2650 goto new_group; 2651 2652 /* move on to the next group */ 2653 if (search_start + num_bytes > end) 2654 goto new_group; 2655 2656 if (last_wanted && search_start != last_wanted) { 2657 total_needed += empty_cluster; 2658 empty_size += empty_cluster; 2659 last_wanted = 0; 2660 /* 2661 * if search_start is still in this block group 2662 * then we just re-search this block 
group 2663 */ 2664 if (search_start >= start && 2665 search_start < end) { 2666 mutex_unlock(&block_group->alloc_mutex); 2667 continue; 2668 } 2669 2670 /* else we go to the next block group */ 2671 goto new_group; 2672 } 2673 2674 if (exclude_nr > 0 && 2675 (search_start + num_bytes > exclude_start && 2676 search_start < exclude_start + exclude_nr)) { 2677 search_start = exclude_start + exclude_nr; 2678 /* 2679 * if search_start is still in this block group 2680 * then we just re-search this block group 2681 */ 2682 if (search_start >= start && 2683 search_start < end) { 2684 mutex_unlock(&block_group->alloc_mutex); 2685 last_wanted = 0; 2686 continue; 2687 } 2688 2689 /* else we go to the next block group */ 2690 goto new_group; 2691 } 2692 2693 ins->objectid = search_start; 2694 ins->offset = num_bytes; 2695 2696 btrfs_remove_free_space_lock(block_group, search_start, 2697 num_bytes); 2698 /* we are all good, let's return */ 2699 mutex_unlock(&block_group->alloc_mutex); 2700 break; 2701 } 2702 new_group: 2703 mutex_unlock(&block_group->alloc_mutex); 2704 put_block_group(block_group); 2705 block_group = NULL; 2706 new_group_no_lock: 2707 /* don't try to compare new allocations against the 2708 * last allocation any more 2709 */ 2710 last_wanted = 0; 2711 2712 /* 2713 * Here's how this works. 2714 * loop == 0: we were searching a block group via a hint 2715 * and didn't find anything, so we start at 2716 * the head of the block groups and keep searching 2717 * loop == 1: we're searching through all of the block groups 2718 * if we hit the head again we have searched 2719 * all of the block groups for this space and we 2720 * need to try and allocate; if we can't, error out. 2721 * loop == 2: we allocated more space and are looping through 2722 * all of the block groups again. 2723 */ 2724 if (loop == 0) { 2725 head = &space_info->block_groups; 2726 cur = head->next; 2727 loop++; 2728 } else if (loop == 1 && cur == head) { 2729 int keep_going; 2730 2731 /* at this point we give up on the empty_size 2732 * allocations and just try to allocate the min 2733 * space. 2734 * 2735 * The extra_loop field was set if an empty_size 2736 * allocation was attempted above, and if it was 2737 * we need to try the loop again without 2738 * the additional empty_size. 
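* keep_going below decides whether we restart the scan from the head of the list; it is forced on when the chunk allocation attempt does not fail.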
2739 */ 2740 total_needed -= empty_size; 2741 empty_size = 0; 2742 keep_going = extra_loop; 2743 loop++; 2744 2745 if (allowed_chunk_alloc && !chunk_alloc_done) { 2746 up_read(&space_info->groups_sem); 2747 ret = do_chunk_alloc(trans, root, num_bytes + 2748 2 * 1024 * 1024, data, 1); 2749 down_read(&space_info->groups_sem); 2750 if (ret < 0) 2751 goto loop_check; 2752 head = &space_info->block_groups; 2753 /* 2754 * we've allocated a new chunk, keep 2755 * trying 2756 */ 2757 keep_going = 1; 2758 chunk_alloc_done = 1; 2759 } else if (!allowed_chunk_alloc) { 2760 space_info->force_alloc = 1; 2761 } 2762 loop_check: 2763 if (keep_going) { 2764 cur = head->next; 2765 extra_loop = 0; 2766 } else { 2767 break; 2768 } 2769 } else if (cur == head) { 2770 break; 2771 } 2772 2773 block_group = list_entry(cur, struct btrfs_block_group_cache, 2774 list); 2775 atomic_inc(&block_group->count); 2776 2777 search_start = block_group->key.objectid; 2778 cur = cur->next; 2779 } 2780 2781 /* we found what we needed */ 2782 if (ins->objectid) { 2783 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 2784 trans->block_group = block_group->key.objectid; 2785 2786 if (last_ptr) 2787 *last_ptr = ins->objectid + ins->offset; 2788 ret = 0; 2789 } else if (!ret) { 2790 printk(KERN_ERR "btrfs searching for %llu bytes, " 2791 "num_bytes %llu, loop %d, allowed_alloc %d\n", 2792 (unsigned long long)total_needed, 2793 (unsigned long long)num_bytes, 2794 loop, allowed_chunk_alloc); 2795 ret = -ENOSPC; 2796 } 2797 if (block_group) 2798 put_block_group(block_group); 2799 2800 up_read(&space_info->groups_sem); 2801 return ret; 2802 } 2803 2804 static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 2805 { 2806 struct btrfs_block_group_cache *cache; 2807 2808 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 2809 (unsigned long long)(info->total_bytes - info->bytes_used - 2810 info->bytes_pinned - info->bytes_reserved), 2811 (info->full) ? 
"" : "not "); 2812 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 2813 " may_use=%llu, used=%llu\n", info->total_bytes, 2814 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, 2815 info->bytes_used); 2816 2817 down_read(&info->groups_sem); 2818 list_for_each_entry(cache, &info->block_groups, list) { 2819 spin_lock(&cache->lock); 2820 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 2821 "%llu pinned %llu reserved\n", 2822 (unsigned long long)cache->key.objectid, 2823 (unsigned long long)cache->key.offset, 2824 (unsigned long long)btrfs_block_group_used(&cache->item), 2825 (unsigned long long)cache->pinned, 2826 (unsigned long long)cache->reserved); 2827 btrfs_dump_free_space(cache, bytes); 2828 spin_unlock(&cache->lock); 2829 } 2830 up_read(&info->groups_sem); 2831 } 2832 2833 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, 2834 struct btrfs_root *root, 2835 u64 num_bytes, u64 min_alloc_size, 2836 u64 empty_size, u64 hint_byte, 2837 u64 search_end, struct btrfs_key *ins, 2838 u64 data) 2839 { 2840 int ret; 2841 u64 search_start = 0; 2842 struct btrfs_fs_info *info = root->fs_info; 2843 2844 data = btrfs_get_alloc_profile(root, data); 2845 again: 2846 /* 2847 * the only place that sets empty_size is btrfs_realloc_node, which 2848 * is not called recursively on allocations 2849 */ 2850 if (empty_size || root->ref_cows) { 2851 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { 2852 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2853 2 * 1024 * 1024, 2854 BTRFS_BLOCK_GROUP_METADATA | 2855 (info->metadata_alloc_profile & 2856 info->avail_metadata_alloc_bits), 0); 2857 } 2858 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2859 num_bytes + 2 * 1024 * 1024, data, 0); 2860 } 2861 2862 WARN_ON(num_bytes < root->sectorsize); 2863 ret = find_free_extent(trans, root, num_bytes, empty_size, 2864 search_start, search_end, hint_byte, ins, 2865 trans->alloc_exclude_start, 2866 trans->alloc_exclude_nr, data); 2867 2868 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 2869 num_bytes = num_bytes >> 1; 2870 num_bytes = num_bytes & ~(root->sectorsize - 1); 2871 num_bytes = max(num_bytes, min_alloc_size); 2872 do_chunk_alloc(trans, root->fs_info->extent_root, 2873 num_bytes, data, 1); 2874 goto again; 2875 } 2876 if (ret) { 2877 struct btrfs_space_info *sinfo; 2878 2879 sinfo = __find_space_info(root->fs_info, data); 2880 printk(KERN_ERR "btrfs allocation failed flags %llu, " 2881 "wanted %llu\n", (unsigned long long)data, 2882 (unsigned long long)num_bytes); 2883 dump_space_info(sinfo, num_bytes); 2884 BUG(); 2885 } 2886 2887 return ret; 2888 } 2889 2890 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 2891 { 2892 struct btrfs_block_group_cache *cache; 2893 int ret = 0; 2894 2895 cache = btrfs_lookup_block_group(root->fs_info, start); 2896 if (!cache) { 2897 printk(KERN_ERR "Unable to find block group for %llu\n", 2898 (unsigned long long)start); 2899 return -ENOSPC; 2900 } 2901 2902 ret = btrfs_discard_extent(root, start, len); 2903 2904 btrfs_add_free_space(cache, start, len); 2905 put_block_group(cache); 2906 update_reserved_extents(root, start, len, 0); 2907 2908 return ret; 2909 } 2910 2911 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 2912 struct btrfs_root *root, 2913 u64 num_bytes, u64 min_alloc_size, 2914 u64 empty_size, u64 hint_byte, 2915 u64 search_end, struct btrfs_key *ins, 2916 u64 data) 2917 { 2918 int ret; 2919 ret = __btrfs_reserve_extent(trans, root, num_bytes, 
min_alloc_size, 2920 empty_size, hint_byte, search_end, ins, 2921 data); 2922 update_reserved_extents(root, ins->objectid, ins->offset, 1); 2923 return ret; 2924 } 2925 2926 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 2927 struct btrfs_root *root, u64 parent, 2928 u64 root_objectid, u64 ref_generation, 2929 u64 owner, struct btrfs_key *ins, 2930 int ref_mod) 2931 { 2932 int ret; 2933 u64 super_used; 2934 u64 root_used; 2935 u64 num_bytes = ins->offset; 2936 u32 sizes[2]; 2937 struct btrfs_fs_info *info = root->fs_info; 2938 struct btrfs_root *extent_root = info->extent_root; 2939 struct btrfs_extent_item *extent_item; 2940 struct btrfs_extent_ref *ref; 2941 struct btrfs_path *path; 2942 struct btrfs_key keys[2]; 2943 2944 if (parent == 0) 2945 parent = ins->objectid; 2946 2947 /* block accounting for super block */ 2948 spin_lock(&info->delalloc_lock); 2949 super_used = btrfs_super_bytes_used(&info->super_copy); 2950 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); 2951 2952 /* block accounting for root item */ 2953 root_used = btrfs_root_used(&root->root_item); 2954 btrfs_set_root_used(&root->root_item, root_used + num_bytes); 2955 spin_unlock(&info->delalloc_lock); 2956 2957 memcpy(&keys[0], ins, sizeof(*ins)); 2958 keys[1].objectid = ins->objectid; 2959 keys[1].type = BTRFS_EXTENT_REF_KEY; 2960 keys[1].offset = parent; 2961 sizes[0] = sizeof(*extent_item); 2962 sizes[1] = sizeof(*ref); 2963 2964 path = btrfs_alloc_path(); 2965 BUG_ON(!path); 2966 2967 path->leave_spinning = 1; 2968 ret = btrfs_insert_empty_items(trans, extent_root, path, keys, 2969 sizes, 2); 2970 BUG_ON(ret); 2971 2972 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2973 struct btrfs_extent_item); 2974 btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); 2975 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 2976 struct btrfs_extent_ref); 2977 2978 btrfs_set_ref_root(path->nodes[0], ref, root_objectid); 2979 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); 2980 btrfs_set_ref_objectid(path->nodes[0], ref, owner); 2981 btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); 2982 2983 btrfs_mark_buffer_dirty(path->nodes[0]); 2984 2985 trans->alloc_exclude_start = 0; 2986 trans->alloc_exclude_nr = 0; 2987 btrfs_free_path(path); 2988 2989 if (ret) 2990 goto out; 2991 2992 ret = update_block_group(trans, root, ins->objectid, 2993 ins->offset, 1, 0); 2994 if (ret) { 2995 printk(KERN_ERR "btrfs update block group failed for %llu " 2996 "%llu\n", (unsigned long long)ins->objectid, 2997 (unsigned long long)ins->offset); 2998 BUG(); 2999 } 3000 out: 3001 return ret; 3002 } 3003 3004 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, 3005 struct btrfs_root *root, u64 parent, 3006 u64 root_objectid, u64 ref_generation, 3007 u64 owner, struct btrfs_key *ins) 3008 { 3009 int ret; 3010 3011 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) 3012 return 0; 3013 3014 ret = btrfs_add_delayed_ref(trans, ins->objectid, 3015 ins->offset, parent, root_objectid, 3016 ref_generation, owner, 3017 BTRFS_ADD_DELAYED_EXTENT, 0); 3018 BUG_ON(ret); 3019 return ret; 3020 } 3021 3022 /* 3023 * this is used by the tree logging recovery code. 
It records that 3024 * an extent has been allocated and makes sure to clear the free 3025 * space cache bits as well 3026 */ 3027 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, 3028 struct btrfs_root *root, u64 parent, 3029 u64 root_objectid, u64 ref_generation, 3030 u64 owner, struct btrfs_key *ins) 3031 { 3032 int ret; 3033 struct btrfs_block_group_cache *block_group; 3034 3035 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 3036 mutex_lock(&block_group->cache_mutex); 3037 cache_block_group(root, block_group); 3038 mutex_unlock(&block_group->cache_mutex); 3039 3040 ret = btrfs_remove_free_space(block_group, ins->objectid, 3041 ins->offset); 3042 BUG_ON(ret); 3043 put_block_group(block_group); 3044 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, 3045 ref_generation, owner, ins, 1); 3046 return ret; 3047 } 3048 3049 /* 3050 * finds a free extent and does all the dirty work required for allocation 3051 * returns the key for the extent through ins, and a tree buffer for 3052 * the first block of the extent through buf. 3053 * 3054 * returns 0 if everything worked, non-zero otherwise. 3055 */ 3056 int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 3057 struct btrfs_root *root, 3058 u64 num_bytes, u64 parent, u64 min_alloc_size, 3059 u64 root_objectid, u64 ref_generation, 3060 u64 owner_objectid, u64 empty_size, u64 hint_byte, 3061 u64 search_end, struct btrfs_key *ins, u64 data) 3062 { 3063 int ret; 3064 ret = __btrfs_reserve_extent(trans, root, num_bytes, 3065 min_alloc_size, empty_size, hint_byte, 3066 search_end, ins, data); 3067 BUG_ON(ret); 3068 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 3069 ret = btrfs_add_delayed_ref(trans, ins->objectid, 3070 ins->offset, parent, root_objectid, 3071 ref_generation, owner_objectid, 3072 BTRFS_ADD_DELAYED_EXTENT, 0); 3073 BUG_ON(ret); 3074 } 3075 update_reserved_extents(root, ins->objectid, ins->offset, 1); 3076 return ret; 3077 } 3078 3079 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3080 struct btrfs_root *root, 3081 u64 bytenr, u32 blocksize, 3082 int level) 3083 { 3084 struct extent_buffer *buf; 3085 3086 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 3087 if (!buf) 3088 return ERR_PTR(-ENOMEM); 3089 btrfs_set_header_generation(buf, trans->transid); 3090 btrfs_set_buffer_lockdep_class(buf, level); 3091 btrfs_tree_lock(buf); 3092 clean_tree_block(trans, root, buf); 3093 3094 btrfs_set_lock_blocking(buf); 3095 btrfs_set_buffer_uptodate(buf); 3096 3097 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3098 set_extent_dirty(&root->dirty_log_pages, buf->start, 3099 buf->start + buf->len - 1, GFP_NOFS); 3100 } else { 3101 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 3102 buf->start + buf->len - 1, GFP_NOFS); 3103 } 3104 trans->blocks_used++; 3105 /* this returns a buffer locked for blocking */ 3106 return buf; 3107 } 3108 3109 /* 3110 * helper function to allocate a block for a given tree 3111 * returns the tree buffer or NULL. 
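* (in practice failures come back as ERR_PTR values rather than NULL)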
3112 */ 3113 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3114 struct btrfs_root *root, 3115 u32 blocksize, u64 parent, 3116 u64 root_objectid, 3117 u64 ref_generation, 3118 int level, 3119 u64 hint, 3120 u64 empty_size) 3121 { 3122 struct btrfs_key ins; 3123 int ret; 3124 struct extent_buffer *buf; 3125 3126 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, 3127 root_objectid, ref_generation, level, 3128 empty_size, hint, (u64)-1, &ins, 0); 3129 if (ret) { 3130 BUG_ON(ret > 0); 3131 return ERR_PTR(ret); 3132 } 3133 3134 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 3135 blocksize, level); 3136 return buf; 3137 } 3138 3139 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 3140 struct btrfs_root *root, struct extent_buffer *leaf) 3141 { 3142 u64 leaf_owner; 3143 u64 leaf_generation; 3144 struct refsort *sorted; 3145 struct btrfs_key key; 3146 struct btrfs_file_extent_item *fi; 3147 int i; 3148 int nritems; 3149 int ret; 3150 int refi = 0; 3151 int slot; 3152 3153 BUG_ON(!btrfs_is_leaf(leaf)); 3154 nritems = btrfs_header_nritems(leaf); 3155 leaf_owner = btrfs_header_owner(leaf); 3156 leaf_generation = btrfs_header_generation(leaf); 3157 3158 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); 3159 /* we do this loop twice. The first time we build a list 3160 * of the extents we have a reference on, then we sort the list 3161 * by bytenr. The second time around we actually do the 3162 * extent freeing. 3163 */ 3164 for (i = 0; i < nritems; i++) { 3165 u64 disk_bytenr; 3166 cond_resched(); 3167 3168 btrfs_item_key_to_cpu(leaf, &key, i); 3169 3170 /* only extents have references, skip everything else */ 3171 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3172 continue; 3173 3174 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3175 3176 /* inline extents live in the btree, they don't have refs */ 3177 if (btrfs_file_extent_type(leaf, fi) == 3178 BTRFS_FILE_EXTENT_INLINE) 3179 continue; 3180 3181 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3182 3183 /* holes don't have refs */ 3184 if (disk_bytenr == 0) 3185 continue; 3186 3187 sorted[refi].bytenr = disk_bytenr; 3188 sorted[refi].slot = i; 3189 refi++; 3190 } 3191 3192 if (refi == 0) 3193 goto out; 3194 3195 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); 3196 3197 for (i = 0; i < refi; i++) { 3198 u64 disk_bytenr; 3199 3200 disk_bytenr = sorted[i].bytenr; 3201 slot = sorted[i].slot; 3202 3203 cond_resched(); 3204 3205 btrfs_item_key_to_cpu(leaf, &key, slot); 3206 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3207 continue; 3208 3209 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 3210 3211 ret = btrfs_free_extent(trans, root, disk_bytenr, 3212 btrfs_file_extent_disk_num_bytes(leaf, fi), 3213 leaf->start, leaf_owner, leaf_generation, 3214 key.objectid, 0); 3215 BUG_ON(ret); 3216 3217 atomic_inc(&root->fs_info->throttle_gen); 3218 wake_up(&root->fs_info->transaction_throttle); 3219 cond_resched(); 3220 } 3221 out: 3222 kfree(sorted); 3223 return 0; 3224 } 3225 3226 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, 3227 struct btrfs_root *root, 3228 struct btrfs_leaf_ref *ref) 3229 { 3230 int i; 3231 int ret; 3232 struct btrfs_extent_info *info; 3233 struct refsort *sorted; 3234 3235 if (ref->nritems == 0) 3236 return 0; 3237 3238 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); 3239 for (i = 0; i < ref->nritems; i++) { 3240 sorted[i].bytenr = ref->extents[i].bytenr; 3241 
sorted[i].slot = i; 3242 } 3243 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); 3244 3245 /* 3246 * the items in the ref were sorted when the ref was inserted 3247 * into the ref cache, so this is already in order 3248 */ 3249 for (i = 0; i < ref->nritems; i++) { 3250 info = ref->extents + sorted[i].slot; 3251 ret = btrfs_free_extent(trans, root, info->bytenr, 3252 info->num_bytes, ref->bytenr, 3253 ref->owner, ref->generation, 3254 info->objectid, 0); 3255 3256 atomic_inc(&root->fs_info->throttle_gen); 3257 wake_up(&root->fs_info->transaction_throttle); 3258 cond_resched(); 3259 3260 BUG_ON(ret); 3261 info++; 3262 } 3263 3264 kfree(sorted); 3265 return 0; 3266 } 3267 3268 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, 3269 struct btrfs_root *root, u64 start, 3270 u64 len, u32 *refs) 3271 { 3272 int ret; 3273 3274 ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); 3275 BUG_ON(ret); 3276 3277 #if 0 /* some debugging code in case we see problems here */ 3278 /* if the refs count is one, it won't get increased again. But 3279 * if the ref count is > 1, someone may be decreasing it at 3280 * the same time we are. 3281 */ 3282 if (*refs != 1) { 3283 struct extent_buffer *eb = NULL; 3284 eb = btrfs_find_create_tree_block(root, start, len); 3285 if (eb) 3286 btrfs_tree_lock(eb); 3287 3288 mutex_lock(&root->fs_info->alloc_mutex); 3289 ret = lookup_extent_ref(NULL, root, start, len, refs); 3290 BUG_ON(ret); 3291 mutex_unlock(&root->fs_info->alloc_mutex); 3292 3293 if (eb) { 3294 btrfs_tree_unlock(eb); 3295 free_extent_buffer(eb); 3296 } 3297 if (*refs == 1) { 3298 printk(KERN_ERR "btrfs block %llu went down to one " 3299 "during drop_snap\n", (unsigned long long)start); 3300 } 3301 3302 } 3303 #endif 3304 3305 cond_resched(); 3306 return ret; 3307 } 3308 3309 /* 3310 * this is used while deleting old snapshots, and it drops the refs 3311 * on a whole subtree starting from a level 1 node. 3312 * 3313 * The idea is to sort all the leaf pointers, and then drop the 3314 * ref on all the leaves in order. Most of the time the leaves 3315 * will have ref cache entries, so no leaf IOs will be required to 3316 * find the extents they have references on. 3317 * 3318 * For each leaf, any references it has are also dropped in order 3319 * 3320 * This ends up dropping the references in something close to optimal 3321 * order for reading and modifying the extent allocation tree. 
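* The level 1 node being processed is path->nodes[1], and path->slots[1] tracks how far through it we have gotten.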
3322 */ 3323 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, 3324 struct btrfs_root *root, 3325 struct btrfs_path *path) 3326 { 3327 u64 bytenr; 3328 u64 root_owner; 3329 u64 root_gen; 3330 struct extent_buffer *eb = path->nodes[1]; 3331 struct extent_buffer *leaf; 3332 struct btrfs_leaf_ref *ref; 3333 struct refsort *sorted = NULL; 3334 int nritems = btrfs_header_nritems(eb); 3335 int ret; 3336 int i; 3337 int refi = 0; 3338 int slot = path->slots[1]; 3339 u32 blocksize = btrfs_level_size(root, 0); 3340 u32 refs; 3341 3342 if (nritems == 0) 3343 goto out; 3344 3345 root_owner = btrfs_header_owner(eb); 3346 root_gen = btrfs_header_generation(eb); 3347 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); 3348 3349 /* 3350 * step one, sort all the leaf pointers so we don't scribble 3351 * randomly into the extent allocation tree 3352 */ 3353 for (i = slot; i < nritems; i++) { 3354 sorted[refi].bytenr = btrfs_node_blockptr(eb, i); 3355 sorted[refi].slot = i; 3356 refi++; 3357 } 3358 3359 /* 3360 * nritems won't be zero, but if we're picking up drop_snapshot 3361 * after a crash, slot might be > 0, so double check things 3362 * just in case. 3363 */ 3364 if (refi == 0) 3365 goto out; 3366 3367 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); 3368 3369 /* 3370 * the first loop frees everything the leaves point to 3371 */ 3372 for (i = 0; i < refi; i++) { 3373 u64 ptr_gen; 3374 3375 bytenr = sorted[i].bytenr; 3376 3377 /* 3378 * check the reference count on this leaf. If it is > 1 3379 * we just decrement it below and don't update any 3380 * of the refs the leaf points to. 3381 */ 3382 ret = drop_snap_lookup_refcount(trans, root, bytenr, 3383 blocksize, &refs); 3384 BUG_ON(ret); 3385 if (refs != 1) 3386 continue; 3387 3388 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); 3389 3390 /* 3391 * the leaf only had one reference, which means the 3392 * only thing pointing to this leaf is the snapshot 3393 * we're deleting. It isn't possible for the reference 3394 * count to increase again later 3395 * 3396 * The reference cache is checked for the leaf, 3397 * and if found we'll be able to drop any refs held by 3398 * the leaf without needing to read it in. 3399 */ 3400 ref = btrfs_lookup_leaf_ref(root, bytenr); 3401 if (ref && ref->generation != ptr_gen) { 3402 btrfs_free_leaf_ref(root, ref); 3403 ref = NULL; 3404 } 3405 if (ref) { 3406 ret = cache_drop_leaf_ref(trans, root, ref); 3407 BUG_ON(ret); 3408 btrfs_remove_leaf_ref(root, ref); 3409 btrfs_free_leaf_ref(root, ref); 3410 } else { 3411 /* 3412 * the leaf wasn't in the reference cache, so 3413 * we have to read it. 3414 */ 3415 leaf = read_tree_block(root, bytenr, blocksize, 3416 ptr_gen); 3417 ret = btrfs_drop_leaf_ref(trans, root, leaf); 3418 BUG_ON(ret); 3419 free_extent_buffer(leaf); 3420 } 3421 atomic_inc(&root->fs_info->throttle_gen); 3422 wake_up(&root->fs_info->transaction_throttle); 3423 cond_resched(); 3424 } 3425 3426 /* 3427 * run through the loop again to free the refs on the leaves. 3428 * This is faster than doing it in the loop above because 3429 * the leaves are likely to be clustered together. We end up 3430 * working in nice chunks on the extent allocation tree. 
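* Each leaf gets a single btrfs_free_extent() call here, using the bytenrs sorted in the first pass.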
3431 */ 3432 for (i = 0; i < refi; i++) { 3433 bytenr = sorted[i].bytenr; 3434 ret = btrfs_free_extent(trans, root, bytenr, 3435 blocksize, eb->start, 3436 root_owner, root_gen, 0, 1); 3437 BUG_ON(ret); 3438 3439 atomic_inc(&root->fs_info->throttle_gen); 3440 wake_up(&root->fs_info->transaction_throttle); 3441 cond_resched(); 3442 } 3443 out: 3444 kfree(sorted); 3445 3446 /* 3447 * update the path to show we've processed the entire level 1 3448 * node. This will get saved into the root's drop_snapshot_progress 3449 * field so these drops are not repeated again if this transaction 3450 * commits. 3451 */ 3452 path->slots[1] = nritems; 3453 return 0; 3454 } 3455 3456 /* 3457 * helper function for drop_snapshot, this walks down the tree dropping ref 3458 * counts as it goes. 3459 */ 3460 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 3461 struct btrfs_root *root, 3462 struct btrfs_path *path, int *level) 3463 { 3464 u64 root_owner; 3465 u64 root_gen; 3466 u64 bytenr; 3467 u64 ptr_gen; 3468 struct extent_buffer *next; 3469 struct extent_buffer *cur; 3470 struct extent_buffer *parent; 3471 u32 blocksize; 3472 int ret; 3473 u32 refs; 3474 3475 WARN_ON(*level < 0); 3476 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3477 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, 3478 path->nodes[*level]->len, &refs); 3479 BUG_ON(ret); 3480 if (refs > 1) 3481 goto out; 3482 3483 /* 3484 * walk down to the last node level and free all the leaves 3485 */ 3486 while (*level >= 0) { 3487 WARN_ON(*level < 0); 3488 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3489 cur = path->nodes[*level]; 3490 3491 if (btrfs_header_level(cur) != *level) 3492 WARN_ON(1); 3493 3494 if (path->slots[*level] >= 3495 btrfs_header_nritems(cur)) 3496 break; 3497 3498 /* the new code goes down to level 1 and does all the 3499 * leaves pointed to that node in bulk. So, this check 3500 * for level 0 will always be false. 3501 * 3502 * But, the disk format allows the drop_snapshot_progress 3503 * field in the root to leave things in a state where 3504 * a leaf will need cleaning up here. If someone crashes 3505 * with the old code and then boots with the new code, 3506 * we might find a leaf here. 3507 */ 3508 if (*level == 0) { 3509 ret = btrfs_drop_leaf_ref(trans, root, cur); 3510 BUG_ON(ret); 3511 break; 3512 } 3513 3514 /* 3515 * once we get to level one, process the whole node 3516 * at once, including everything below it. 3517 */ 3518 if (*level == 1) { 3519 ret = drop_level_one_refs(trans, root, path); 3520 BUG_ON(ret); 3521 break; 3522 } 3523 3524 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3525 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3526 blocksize = btrfs_level_size(root, *level - 1); 3527 3528 ret = drop_snap_lookup_refcount(trans, root, bytenr, 3529 blocksize, &refs); 3530 BUG_ON(ret); 3531 3532 /* 3533 * if there is more than one reference, we don't need 3534 * to read that node to drop any references it has. We 3535 * just drop the ref we hold on that node and move on to the 3536 * next slot in this level. 
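* The free is done against parent->start, the block that actually holds the pointer we are dropping.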
3537 */ 3538 if (refs != 1) { 3539 parent = path->nodes[*level]; 3540 root_owner = btrfs_header_owner(parent); 3541 root_gen = btrfs_header_generation(parent); 3542 path->slots[*level]++; 3543 3544 ret = btrfs_free_extent(trans, root, bytenr, 3545 blocksize, parent->start, 3546 root_owner, root_gen, 3547 *level - 1, 1); 3548 BUG_ON(ret); 3549 3550 atomic_inc(&root->fs_info->throttle_gen); 3551 wake_up(&root->fs_info->transaction_throttle); 3552 cond_resched(); 3553 3554 continue; 3555 } 3556 3557 /* 3558 * we need to keep freeing things in the next level down. 3559 * read the block and loop around to process it 3560 */ 3561 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3562 WARN_ON(*level <= 0); 3563 if (path->nodes[*level-1]) 3564 free_extent_buffer(path->nodes[*level-1]); 3565 path->nodes[*level-1] = next; 3566 *level = btrfs_header_level(next); 3567 path->slots[*level] = 0; 3568 cond_resched(); 3569 } 3570 out: 3571 WARN_ON(*level < 0); 3572 WARN_ON(*level >= BTRFS_MAX_LEVEL); 3573 3574 if (path->nodes[*level] == root->node) { 3575 parent = path->nodes[*level]; 3576 bytenr = path->nodes[*level]->start; 3577 } else { 3578 parent = path->nodes[*level + 1]; 3579 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); 3580 } 3581 3582 blocksize = btrfs_level_size(root, *level); 3583 root_owner = btrfs_header_owner(parent); 3584 root_gen = btrfs_header_generation(parent); 3585 3586 /* 3587 * cleanup and free the reference on the last node 3588 * we processed 3589 */ 3590 ret = btrfs_free_extent(trans, root, bytenr, blocksize, 3591 parent->start, root_owner, root_gen, 3592 *level, 1); 3593 free_extent_buffer(path->nodes[*level]); 3594 path->nodes[*level] = NULL; 3595 3596 *level += 1; 3597 BUG_ON(ret); 3598 3599 cond_resched(); 3600 return 0; 3601 } 3602 3603 /* 3604 * helper function for drop_subtree, this function is similar to 3605 * walk_down_tree. The main difference is that it checks reference 3606 * counts while tree blocks are locked. 
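* It is called from btrfs_drop_subtree(), with the subtree root and its parent already locked by the caller.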
3607 */ 3608 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, 3609 struct btrfs_root *root, 3610 struct btrfs_path *path, int *level) 3611 { 3612 struct extent_buffer *next; 3613 struct extent_buffer *cur; 3614 struct extent_buffer *parent; 3615 u64 bytenr; 3616 u64 ptr_gen; 3617 u32 blocksize; 3618 u32 refs; 3619 int ret; 3620 3621 cur = path->nodes[*level]; 3622 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, 3623 &refs); 3624 BUG_ON(ret); 3625 if (refs > 1) 3626 goto out; 3627 3628 while (*level >= 0) { 3629 cur = path->nodes[*level]; 3630 if (*level == 0) { 3631 ret = btrfs_drop_leaf_ref(trans, root, cur); 3632 BUG_ON(ret); 3633 clean_tree_block(trans, root, cur); 3634 break; 3635 } 3636 if (path->slots[*level] >= btrfs_header_nritems(cur)) { 3637 clean_tree_block(trans, root, cur); 3638 break; 3639 } 3640 3641 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 3642 blocksize = btrfs_level_size(root, *level - 1); 3643 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 3644 3645 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 3646 btrfs_tree_lock(next); 3647 btrfs_set_lock_blocking(next); 3648 3649 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 3650 &refs); 3651 BUG_ON(ret); 3652 if (refs > 1) { 3653 parent = path->nodes[*level]; 3654 ret = btrfs_free_extent(trans, root, bytenr, 3655 blocksize, parent->start, 3656 btrfs_header_owner(parent), 3657 btrfs_header_generation(parent), 3658 *level - 1, 1); 3659 BUG_ON(ret); 3660 path->slots[*level]++; 3661 btrfs_tree_unlock(next); 3662 free_extent_buffer(next); 3663 continue; 3664 } 3665 3666 *level = btrfs_header_level(next); 3667 path->nodes[*level] = next; 3668 path->slots[*level] = 0; 3669 path->locks[*level] = 1; 3670 cond_resched(); 3671 } 3672 out: 3673 parent = path->nodes[*level + 1]; 3674 bytenr = path->nodes[*level]->start; 3675 blocksize = path->nodes[*level]->len; 3676 3677 ret = btrfs_free_extent(trans, root, bytenr, blocksize, 3678 parent->start, btrfs_header_owner(parent), 3679 btrfs_header_generation(parent), *level, 1); 3680 BUG_ON(ret); 3681 3682 if (path->locks[*level]) { 3683 btrfs_tree_unlock(path->nodes[*level]); 3684 path->locks[*level] = 0; 3685 } 3686 free_extent_buffer(path->nodes[*level]); 3687 path->nodes[*level] = NULL; 3688 *level += 1; 3689 cond_resched(); 3690 return 0; 3691 } 3692 3693 /* 3694 * helper for dropping snapshots. This walks back up the tree in the path 3695 * to find the first node higher up where we haven't yet gone through 3696 * all the slots 3697 */ 3698 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 3699 struct btrfs_root *root, 3700 struct btrfs_path *path, 3701 int *level, int max_level) 3702 { 3703 u64 root_owner; 3704 u64 root_gen; 3705 struct btrfs_root_item *root_item = &root->root_item; 3706 int i; 3707 int slot; 3708 int ret; 3709 3710 for (i = *level; i < max_level && path->nodes[i]; i++) { 3711 slot = path->slots[i]; 3712 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 3713 struct extent_buffer *node; 3714 struct btrfs_disk_key disk_key; 3715 3716 /* 3717 * there is more work to do in this level. 
3718 * Update the drop_progress marker to reflect 3719 * the work we've done so far, and then bump 3720 * the slot number 3721 */ 3722 node = path->nodes[i]; 3723 path->slots[i]++; 3724 *level = i; 3725 WARN_ON(*level == 0); 3726 btrfs_node_key(node, &disk_key, path->slots[i]); 3727 memcpy(&root_item->drop_progress, 3728 &disk_key, sizeof(disk_key)); 3729 root_item->drop_level = i; 3730 return 0; 3731 } else { 3732 struct extent_buffer *parent; 3733 3734 /* 3735 * this whole node is done, free our reference 3736 * on it and go up one level 3737 */ 3738 if (path->nodes[*level] == root->node) 3739 parent = path->nodes[*level]; 3740 else 3741 parent = path->nodes[*level + 1]; 3742 3743 root_owner = btrfs_header_owner(parent); 3744 root_gen = btrfs_header_generation(parent); 3745 3746 clean_tree_block(trans, root, path->nodes[*level]); 3747 ret = btrfs_free_extent(trans, root, 3748 path->nodes[*level]->start, 3749 path->nodes[*level]->len, 3750 parent->start, root_owner, 3751 root_gen, *level, 1); 3752 BUG_ON(ret); 3753 if (path->locks[*level]) { 3754 btrfs_tree_unlock(path->nodes[*level]); 3755 path->locks[*level] = 0; 3756 } 3757 free_extent_buffer(path->nodes[*level]); 3758 path->nodes[*level] = NULL; 3759 *level = i + 1; 3760 } 3761 } 3762 return 1; 3763 } 3764 3765 /* 3766 * drop the reference count on the tree rooted at 'snap'. This traverses 3767 * the tree freeing any blocks that have a ref count of zero after being 3768 * decremented. 3769 */ 3770 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 3771 *root) 3772 { 3773 int ret = 0; 3774 int wret; 3775 int level; 3776 struct btrfs_path *path; 3777 int i; 3778 int orig_level; 3779 int update_count; 3780 struct btrfs_root_item *root_item = &root->root_item; 3781 3782 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); 3783 path = btrfs_alloc_path(); 3784 BUG_ON(!path); 3785 3786 level = btrfs_header_level(root->node); 3787 orig_level = level; 3788 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 3789 path->nodes[level] = root->node; 3790 extent_buffer_get(root->node); 3791 path->slots[level] = 0; 3792 } else { 3793 struct btrfs_key key; 3794 struct btrfs_disk_key found_key; 3795 struct extent_buffer *node; 3796 3797 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 3798 level = root_item->drop_level; 3799 path->lowest_level = level; 3800 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3801 if (wret < 0) { 3802 ret = wret; 3803 goto out; 3804 } 3805 node = path->nodes[level]; 3806 btrfs_node_key(node, &found_key, path->slots[level]); 3807 WARN_ON(memcmp(&found_key, &root_item->drop_progress, 3808 sizeof(found_key))); 3809 /* 3810 * unlock our path, this is safe because only this 3811 * function is allowed to delete this snapshot 3812 */ 3813 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 3814 if (path->nodes[i] && path->locks[i]) { 3815 path->locks[i] = 0; 3816 btrfs_tree_unlock(path->nodes[i]); 3817 } 3818 } 3819 } 3820 while (1) { 3821 unsigned long update; 3822 wret = walk_down_tree(trans, root, path, &level); 3823 if (wret > 0) 3824 break; 3825 if (wret < 0) 3826 ret = wret; 3827 3828 wret = walk_up_tree(trans, root, path, &level, 3829 BTRFS_MAX_LEVEL); 3830 if (wret > 0) 3831 break; 3832 if (wret < 0) 3833 ret = wret; 3834 if (trans->transaction->in_commit || 3835 trans->transaction->delayed_refs.flushing) { 3836 ret = -EAGAIN; 3837 break; 3838 } 3839 atomic_inc(&root->fs_info->throttle_gen); 3840 wake_up(&root->fs_info->transaction_throttle); 3841 for (update_count = 0; update_count 
< 16; update_count++) { 3842 update = trans->delayed_ref_updates; 3843 trans->delayed_ref_updates = 0; 3844 if (update) 3845 btrfs_run_delayed_refs(trans, root, update); 3846 else 3847 break; 3848 } 3849 } 3850 for (i = 0; i <= orig_level; i++) { 3851 if (path->nodes[i]) { 3852 free_extent_buffer(path->nodes[i]); 3853 path->nodes[i] = NULL; 3854 } 3855 } 3856 out: 3857 btrfs_free_path(path); 3858 return ret; 3859 } 3860 3861 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 3862 struct btrfs_root *root, 3863 struct extent_buffer *node, 3864 struct extent_buffer *parent) 3865 { 3866 struct btrfs_path *path; 3867 int level; 3868 int parent_level; 3869 int ret = 0; 3870 int wret; 3871 3872 path = btrfs_alloc_path(); 3873 BUG_ON(!path); 3874 3875 btrfs_assert_tree_locked(parent); 3876 parent_level = btrfs_header_level(parent); 3877 extent_buffer_get(parent); 3878 path->nodes[parent_level] = parent; 3879 path->slots[parent_level] = btrfs_header_nritems(parent); 3880 3881 btrfs_assert_tree_locked(node); 3882 level = btrfs_header_level(node); 3883 extent_buffer_get(node); 3884 path->nodes[level] = node; 3885 path->slots[level] = 0; 3886 3887 while (1) { 3888 wret = walk_down_subtree(trans, root, path, &level); 3889 if (wret < 0) 3890 ret = wret; 3891 if (wret != 0) 3892 break; 3893 3894 wret = walk_up_tree(trans, root, path, &level, parent_level); 3895 if (wret < 0) 3896 ret = wret; 3897 if (wret != 0) 3898 break; 3899 } 3900 3901 btrfs_free_path(path); 3902 return ret; 3903 } 3904 3905 static unsigned long calc_ra(unsigned long start, unsigned long last, 3906 unsigned long nr) 3907 { 3908 return min(last, start + nr - 1); 3909 } 3910 3911 static noinline int relocate_inode_pages(struct inode *inode, u64 start, 3912 u64 len) 3913 { 3914 u64 page_start; 3915 u64 page_end; 3916 unsigned long first_index; 3917 unsigned long last_index; 3918 unsigned long i; 3919 struct page *page; 3920 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3921 struct file_ra_state *ra; 3922 struct btrfs_ordered_extent *ordered; 3923 unsigned int total_read = 0; 3924 unsigned int total_dirty = 0; 3925 int ret = 0; 3926 3927 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3928 3929 mutex_lock(&inode->i_mutex); 3930 first_index = start >> PAGE_CACHE_SHIFT; 3931 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 3932 3933 /* make sure the dirty trick played by the caller work */ 3934 ret = invalidate_inode_pages2_range(inode->i_mapping, 3935 first_index, last_index); 3936 if (ret) 3937 goto out_unlock; 3938 3939 file_ra_state_init(ra, inode->i_mapping); 3940 3941 for (i = first_index ; i <= last_index; i++) { 3942 if (total_read % ra->ra_pages == 0) { 3943 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 3944 calc_ra(i, last_index, ra->ra_pages)); 3945 } 3946 total_read++; 3947 again: 3948 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) 3949 BUG_ON(1); 3950 page = grab_cache_page(inode->i_mapping, i); 3951 if (!page) { 3952 ret = -ENOMEM; 3953 goto out_unlock; 3954 } 3955 if (!PageUptodate(page)) { 3956 btrfs_readpage(NULL, page); 3957 lock_page(page); 3958 if (!PageUptodate(page)) { 3959 unlock_page(page); 3960 page_cache_release(page); 3961 ret = -EIO; 3962 goto out_unlock; 3963 } 3964 } 3965 wait_on_page_writeback(page); 3966 3967 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 3968 page_end = page_start + PAGE_CACHE_SIZE - 1; 3969 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3970 3971 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3972 if (ordered) { 3973 unlock_extent(io_tree, 
page_start, page_end, GFP_NOFS); 3974 unlock_page(page); 3975 page_cache_release(page); 3976 btrfs_start_ordered_extent(inode, ordered, 1); 3977 btrfs_put_ordered_extent(ordered); 3978 goto again; 3979 } 3980 set_page_extent_mapped(page); 3981 3982 if (i == first_index) 3983 set_extent_bits(io_tree, page_start, page_end, 3984 EXTENT_BOUNDARY, GFP_NOFS); 3985 btrfs_set_extent_delalloc(inode, page_start, page_end); 3986 3987 set_page_dirty(page); 3988 total_dirty++; 3989 3990 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3991 unlock_page(page); 3992 page_cache_release(page); 3993 } 3994 3995 out_unlock: 3996 kfree(ra); 3997 mutex_unlock(&inode->i_mutex); 3998 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); 3999 return ret; 4000 } 4001 4002 static noinline int relocate_data_extent(struct inode *reloc_inode, 4003 struct btrfs_key *extent_key, 4004 u64 offset) 4005 { 4006 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 4007 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; 4008 struct extent_map *em; 4009 u64 start = extent_key->objectid - offset; 4010 u64 end = start + extent_key->offset - 1; 4011 4012 em = alloc_extent_map(GFP_NOFS); 4013 BUG_ON(!em || IS_ERR(em)); 4014 4015 em->start = start; 4016 em->len = extent_key->offset; 4017 em->block_len = extent_key->offset; 4018 em->block_start = extent_key->objectid; 4019 em->bdev = root->fs_info->fs_devices->latest_bdev; 4020 set_bit(EXTENT_FLAG_PINNED, &em->flags); 4021 4022 /* setup extent map to cheat btrfs_readpage */ 4023 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 4024 while (1) { 4025 int ret; 4026 spin_lock(&em_tree->lock); 4027 ret = add_extent_mapping(em_tree, em); 4028 spin_unlock(&em_tree->lock); 4029 if (ret != -EEXIST) { 4030 free_extent_map(em); 4031 break; 4032 } 4033 btrfs_drop_extent_cache(reloc_inode, start, end, 0); 4034 } 4035 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 4036 4037 return relocate_inode_pages(reloc_inode, start, extent_key->offset); 4038 } 4039 4040 struct btrfs_ref_path { 4041 u64 extent_start; 4042 u64 nodes[BTRFS_MAX_LEVEL]; 4043 u64 root_objectid; 4044 u64 root_generation; 4045 u64 owner_objectid; 4046 u32 num_refs; 4047 int lowest_level; 4048 int current_level; 4049 int shared_level; 4050 4051 struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; 4052 u64 new_nodes[BTRFS_MAX_LEVEL]; 4053 }; 4054 4055 struct disk_extent { 4056 u64 ram_bytes; 4057 u64 disk_bytenr; 4058 u64 disk_num_bytes; 4059 u64 offset; 4060 u64 num_bytes; 4061 u8 compression; 4062 u8 encryption; 4063 u16 other_encoding; 4064 }; 4065 4066 static int is_cowonly_root(u64 root_objectid) 4067 { 4068 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 4069 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 4070 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 4071 root_objectid == BTRFS_DEV_TREE_OBJECTID || 4072 root_objectid == BTRFS_TREE_LOG_OBJECTID || 4073 root_objectid == BTRFS_CSUM_TREE_OBJECTID) 4074 return 1; 4075 return 0; 4076 } 4077 4078 static noinline int __next_ref_path(struct btrfs_trans_handle *trans, 4079 struct btrfs_root *extent_root, 4080 struct btrfs_ref_path *ref_path, 4081 int first_time) 4082 { 4083 struct extent_buffer *leaf; 4084 struct btrfs_path *path; 4085 struct btrfs_extent_ref *ref; 4086 struct btrfs_key key; 4087 struct btrfs_key found_key; 4088 u64 bytenr; 4089 u32 nritems; 4090 int level; 4091 int ret = 1; 4092 4093 path = btrfs_alloc_path(); 4094 if (!path) 4095 return -ENOMEM; 4096 4097 if (first_time) { 4098 
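/* first pass: the level that references this extent is not known yet, so start at -1 (the data extent itself) and discover the level while walking up */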
ref_path->lowest_level = -1; 4099 ref_path->current_level = -1; 4100 ref_path->shared_level = -1; 4101 goto walk_up; 4102 } 4103 walk_down: 4104 level = ref_path->current_level - 1; 4105 while (level >= -1) { 4106 u64 parent; 4107 if (level < ref_path->lowest_level) 4108 break; 4109 4110 if (level >= 0) 4111 bytenr = ref_path->nodes[level]; 4112 else 4113 bytenr = ref_path->extent_start; 4114 BUG_ON(bytenr == 0); 4115 4116 parent = ref_path->nodes[level + 1]; 4117 ref_path->nodes[level + 1] = 0; 4118 ref_path->current_level = level; 4119 BUG_ON(parent == 0); 4120 4121 key.objectid = bytenr; 4122 key.offset = parent + 1; 4123 key.type = BTRFS_EXTENT_REF_KEY; 4124 4125 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 4126 if (ret < 0) 4127 goto out; 4128 BUG_ON(ret == 0); 4129 4130 leaf = path->nodes[0]; 4131 nritems = btrfs_header_nritems(leaf); 4132 if (path->slots[0] >= nritems) { 4133 ret = btrfs_next_leaf(extent_root, path); 4134 if (ret < 0) 4135 goto out; 4136 if (ret > 0) 4137 goto next; 4138 leaf = path->nodes[0]; 4139 } 4140 4141 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4142 if (found_key.objectid == bytenr && 4143 found_key.type == BTRFS_EXTENT_REF_KEY) { 4144 if (level < ref_path->shared_level) 4145 ref_path->shared_level = level; 4146 goto found; 4147 } 4148 next: 4149 level--; 4150 btrfs_release_path(extent_root, path); 4151 cond_resched(); 4152 } 4153 /* reached lowest level */ 4154 ret = 1; 4155 goto out; 4156 walk_up: 4157 level = ref_path->current_level; 4158 while (level < BTRFS_MAX_LEVEL - 1) { 4159 u64 ref_objectid; 4160 4161 if (level >= 0) 4162 bytenr = ref_path->nodes[level]; 4163 else 4164 bytenr = ref_path->extent_start; 4165 4166 BUG_ON(bytenr == 0); 4167 4168 key.objectid = bytenr; 4169 key.offset = 0; 4170 key.type = BTRFS_EXTENT_REF_KEY; 4171 4172 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 4173 if (ret < 0) 4174 goto out; 4175 4176 leaf = path->nodes[0]; 4177 nritems = btrfs_header_nritems(leaf); 4178 if (path->slots[0] >= nritems) { 4179 ret = btrfs_next_leaf(extent_root, path); 4180 if (ret < 0) 4181 goto out; 4182 if (ret > 0) { 4183 /* the extent was freed by someone */ 4184 if (ref_path->lowest_level == level) 4185 goto out; 4186 btrfs_release_path(extent_root, path); 4187 goto walk_down; 4188 } 4189 leaf = path->nodes[0]; 4190 } 4191 4192 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4193 if (found_key.objectid != bytenr || 4194 found_key.type != BTRFS_EXTENT_REF_KEY) { 4195 /* the extent was freed by someone */ 4196 if (ref_path->lowest_level == level) { 4197 ret = 1; 4198 goto out; 4199 } 4200 btrfs_release_path(extent_root, path); 4201 goto walk_down; 4202 } 4203 found: 4204 ref = btrfs_item_ptr(leaf, path->slots[0], 4205 struct btrfs_extent_ref); 4206 ref_objectid = btrfs_ref_objectid(leaf, ref); 4207 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { 4208 if (first_time) { 4209 level = (int)ref_objectid; 4210 BUG_ON(level >= BTRFS_MAX_LEVEL); 4211 ref_path->lowest_level = level; 4212 ref_path->current_level = level; 4213 ref_path->nodes[level] = bytenr; 4214 } else { 4215 WARN_ON(ref_objectid != level); 4216 } 4217 } else { 4218 WARN_ON(level != -1); 4219 } 4220 first_time = 0; 4221 4222 if (ref_path->lowest_level == level) { 4223 ref_path->owner_objectid = ref_objectid; 4224 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); 4225 } 4226 4227 /* 4228 * the block is tree root or the block isn't in reference 4229 * counted tree. 
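 * a backref whose parent equals the extent itself marks a tree root, and the
 * cow-only trees (root, extent, chunk, dev, csum, log) are never reference
 * counted, so the walk can stop here.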
4230 */ 4231 if (found_key.objectid == found_key.offset || 4232 is_cowonly_root(btrfs_ref_root(leaf, ref))) { 4233 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 4234 ref_path->root_generation = 4235 btrfs_ref_generation(leaf, ref); 4236 if (level < 0) { 4237 /* special reference from the tree log */ 4238 ref_path->nodes[0] = found_key.offset; 4239 ref_path->current_level = 0; 4240 } 4241 ret = 0; 4242 goto out; 4243 } 4244 4245 level++; 4246 BUG_ON(ref_path->nodes[level] != 0); 4247 ref_path->nodes[level] = found_key.offset; 4248 ref_path->current_level = level; 4249 4250 /* 4251 * the reference was created in the running transaction, 4252 * no need to continue walking up. 4253 */ 4254 if (btrfs_ref_generation(leaf, ref) == trans->transid) { 4255 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 4256 ref_path->root_generation = 4257 btrfs_ref_generation(leaf, ref); 4258 ret = 0; 4259 goto out; 4260 } 4261 4262 btrfs_release_path(extent_root, path); 4263 cond_resched(); 4264 } 4265 /* reached max tree level, but no tree root found. */ 4266 BUG(); 4267 out: 4268 btrfs_free_path(path); 4269 return ret; 4270 } 4271 4272 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, 4273 struct btrfs_root *extent_root, 4274 struct btrfs_ref_path *ref_path, 4275 u64 extent_start) 4276 { 4277 memset(ref_path, 0, sizeof(*ref_path)); 4278 ref_path->extent_start = extent_start; 4279 4280 return __next_ref_path(trans, extent_root, ref_path, 1); 4281 } 4282 4283 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, 4284 struct btrfs_root *extent_root, 4285 struct btrfs_ref_path *ref_path) 4286 { 4287 return __next_ref_path(trans, extent_root, ref_path, 0); 4288 } 4289 4290 static noinline int get_new_locations(struct inode *reloc_inode, 4291 struct btrfs_key *extent_key, 4292 u64 offset, int no_fragment, 4293 struct disk_extent **extents, 4294 int *nr_extents) 4295 { 4296 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 4297 struct btrfs_path *path; 4298 struct btrfs_file_extent_item *fi; 4299 struct extent_buffer *leaf; 4300 struct disk_extent *exts = *extents; 4301 struct btrfs_key found_key; 4302 u64 cur_pos; 4303 u64 last_byte; 4304 u32 nritems; 4305 int nr = 0; 4306 int max = *nr_extents; 4307 int ret; 4308 4309 WARN_ON(!no_fragment && *extents); 4310 if (!exts) { 4311 max = 1; 4312 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); 4313 if (!exts) 4314 return -ENOMEM; 4315 } 4316 4317 path = btrfs_alloc_path(); 4318 BUG_ON(!path); 4319 4320 cur_pos = extent_key->objectid - offset; 4321 last_byte = extent_key->objectid + extent_key->offset; 4322 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 4323 cur_pos, 0); 4324 if (ret < 0) 4325 goto out; 4326 if (ret > 0) { 4327 ret = -ENOENT; 4328 goto out; 4329 } 4330 4331 while (1) { 4332 leaf = path->nodes[0]; 4333 nritems = btrfs_header_nritems(leaf); 4334 if (path->slots[0] >= nritems) { 4335 ret = btrfs_next_leaf(root, path); 4336 if (ret < 0) 4337 goto out; 4338 if (ret > 0) 4339 break; 4340 leaf = path->nodes[0]; 4341 } 4342 4343 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4344 if (found_key.offset != cur_pos || 4345 found_key.type != BTRFS_EXTENT_DATA_KEY || 4346 found_key.objectid != reloc_inode->i_ino) 4347 break; 4348 4349 fi = btrfs_item_ptr(leaf, path->slots[0], 4350 struct btrfs_file_extent_item); 4351 if (btrfs_file_extent_type(leaf, fi) != 4352 BTRFS_FILE_EXTENT_REG || 4353 btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 4354 break; 4355 4356 if (nr == max) { 4357 struct disk_extent 
*old = exts; 4358 max *= 2; 4359 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 4360 memcpy(exts, old, sizeof(*exts) * nr); 4361 if (old != *extents) 4362 kfree(old); 4363 } 4364 4365 exts[nr].disk_bytenr = 4366 btrfs_file_extent_disk_bytenr(leaf, fi); 4367 exts[nr].disk_num_bytes = 4368 btrfs_file_extent_disk_num_bytes(leaf, fi); 4369 exts[nr].offset = btrfs_file_extent_offset(leaf, fi); 4370 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4371 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 4372 exts[nr].compression = btrfs_file_extent_compression(leaf, fi); 4373 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); 4374 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, 4375 fi); 4376 BUG_ON(exts[nr].offset > 0); 4377 BUG_ON(exts[nr].compression || exts[nr].encryption); 4378 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); 4379 4380 cur_pos += exts[nr].num_bytes; 4381 nr++; 4382 4383 if (cur_pos + offset >= last_byte) 4384 break; 4385 4386 if (no_fragment) { 4387 ret = 1; 4388 goto out; 4389 } 4390 path->slots[0]++; 4391 } 4392 4393 BUG_ON(cur_pos + offset > last_byte); 4394 if (cur_pos + offset < last_byte) { 4395 ret = -ENOENT; 4396 goto out; 4397 } 4398 ret = 0; 4399 out: 4400 btrfs_free_path(path); 4401 if (ret) { 4402 if (exts != *extents) 4403 kfree(exts); 4404 } else { 4405 *extents = exts; 4406 *nr_extents = nr; 4407 } 4408 return ret; 4409 } 4410 4411 static noinline int replace_one_extent(struct btrfs_trans_handle *trans, 4412 struct btrfs_root *root, 4413 struct btrfs_path *path, 4414 struct btrfs_key *extent_key, 4415 struct btrfs_key *leaf_key, 4416 struct btrfs_ref_path *ref_path, 4417 struct disk_extent *new_extents, 4418 int nr_extents) 4419 { 4420 struct extent_buffer *leaf; 4421 struct btrfs_file_extent_item *fi; 4422 struct inode *inode = NULL; 4423 struct btrfs_key key; 4424 u64 lock_start = 0; 4425 u64 lock_end = 0; 4426 u64 num_bytes; 4427 u64 ext_offset; 4428 u64 search_end = (u64)-1; 4429 u32 nritems; 4430 int nr_scaned = 0; 4431 int extent_locked = 0; 4432 int extent_type; 4433 int ret; 4434 4435 memcpy(&key, leaf_key, sizeof(key)); 4436 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4437 if (key.objectid < ref_path->owner_objectid || 4438 (key.objectid == ref_path->owner_objectid && 4439 key.type < BTRFS_EXTENT_DATA_KEY)) { 4440 key.objectid = ref_path->owner_objectid; 4441 key.type = BTRFS_EXTENT_DATA_KEY; 4442 key.offset = 0; 4443 } 4444 } 4445 4446 while (1) { 4447 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4448 if (ret < 0) 4449 goto out; 4450 4451 leaf = path->nodes[0]; 4452 nritems = btrfs_header_nritems(leaf); 4453 next: 4454 if (extent_locked && ret > 0) { 4455 /* 4456 * the file extent item was modified by someone 4457 * before the extent got locked. 
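 * drop the range lock and keep scanning; it will be taken again once a
 * matching file extent item is found.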
4458 */ 4459 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 4460 lock_end, GFP_NOFS); 4461 extent_locked = 0; 4462 } 4463 4464 if (path->slots[0] >= nritems) { 4465 if (++nr_scaned > 2) 4466 break; 4467 4468 BUG_ON(extent_locked); 4469 ret = btrfs_next_leaf(root, path); 4470 if (ret < 0) 4471 goto out; 4472 if (ret > 0) 4473 break; 4474 leaf = path->nodes[0]; 4475 nritems = btrfs_header_nritems(leaf); 4476 } 4477 4478 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4479 4480 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4481 if ((key.objectid > ref_path->owner_objectid) || 4482 (key.objectid == ref_path->owner_objectid && 4483 key.type > BTRFS_EXTENT_DATA_KEY) || 4484 key.offset >= search_end) 4485 break; 4486 } 4487 4488 if (inode && key.objectid != inode->i_ino) { 4489 BUG_ON(extent_locked); 4490 btrfs_release_path(root, path); 4491 mutex_unlock(&inode->i_mutex); 4492 iput(inode); 4493 inode = NULL; 4494 continue; 4495 } 4496 4497 if (key.type != BTRFS_EXTENT_DATA_KEY) { 4498 path->slots[0]++; 4499 ret = 1; 4500 goto next; 4501 } 4502 fi = btrfs_item_ptr(leaf, path->slots[0], 4503 struct btrfs_file_extent_item); 4504 extent_type = btrfs_file_extent_type(leaf, fi); 4505 if ((extent_type != BTRFS_FILE_EXTENT_REG && 4506 extent_type != BTRFS_FILE_EXTENT_PREALLOC) || 4507 (btrfs_file_extent_disk_bytenr(leaf, fi) != 4508 extent_key->objectid)) { 4509 path->slots[0]++; 4510 ret = 1; 4511 goto next; 4512 } 4513 4514 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4515 ext_offset = btrfs_file_extent_offset(leaf, fi); 4516 4517 if (search_end == (u64)-1) { 4518 search_end = key.offset - ext_offset + 4519 btrfs_file_extent_ram_bytes(leaf, fi); 4520 } 4521 4522 if (!extent_locked) { 4523 lock_start = key.offset; 4524 lock_end = lock_start + num_bytes - 1; 4525 } else { 4526 if (lock_start > key.offset || 4527 lock_end + 1 < key.offset + num_bytes) { 4528 unlock_extent(&BTRFS_I(inode)->io_tree, 4529 lock_start, lock_end, GFP_NOFS); 4530 extent_locked = 0; 4531 } 4532 } 4533 4534 if (!inode) { 4535 btrfs_release_path(root, path); 4536 4537 inode = btrfs_iget_locked(root->fs_info->sb, 4538 key.objectid, root); 4539 if (inode->i_state & I_NEW) { 4540 BTRFS_I(inode)->root = root; 4541 BTRFS_I(inode)->location.objectid = 4542 key.objectid; 4543 BTRFS_I(inode)->location.type = 4544 BTRFS_INODE_ITEM_KEY; 4545 BTRFS_I(inode)->location.offset = 0; 4546 btrfs_read_locked_inode(inode); 4547 unlock_new_inode(inode); 4548 } 4549 /* 4550 * some code call btrfs_commit_transaction while 4551 * holding the i_mutex, so we can't use mutex_lock 4552 * here. 
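 * use mutex_trylock instead and skip the inode if the lock cannot be taken.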
4553 */ 4554 if (is_bad_inode(inode) || 4555 !mutex_trylock(&inode->i_mutex)) { 4556 iput(inode); 4557 inode = NULL; 4558 key.offset = (u64)-1; 4559 goto skip; 4560 } 4561 } 4562 4563 if (!extent_locked) { 4564 struct btrfs_ordered_extent *ordered; 4565 4566 btrfs_release_path(root, path); 4567 4568 lock_extent(&BTRFS_I(inode)->io_tree, lock_start, 4569 lock_end, GFP_NOFS); 4570 ordered = btrfs_lookup_first_ordered_extent(inode, 4571 lock_end); 4572 if (ordered && 4573 ordered->file_offset <= lock_end && 4574 ordered->file_offset + ordered->len > lock_start) { 4575 unlock_extent(&BTRFS_I(inode)->io_tree, 4576 lock_start, lock_end, GFP_NOFS); 4577 btrfs_start_ordered_extent(inode, ordered, 1); 4578 btrfs_put_ordered_extent(ordered); 4579 key.offset += num_bytes; 4580 goto skip; 4581 } 4582 if (ordered) 4583 btrfs_put_ordered_extent(ordered); 4584 4585 extent_locked = 1; 4586 continue; 4587 } 4588 4589 if (nr_extents == 1) { 4590 /* update extent pointer in place */ 4591 btrfs_set_file_extent_disk_bytenr(leaf, fi, 4592 new_extents[0].disk_bytenr); 4593 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 4594 new_extents[0].disk_num_bytes); 4595 btrfs_mark_buffer_dirty(leaf); 4596 4597 btrfs_drop_extent_cache(inode, key.offset, 4598 key.offset + num_bytes - 1, 0); 4599 4600 ret = btrfs_inc_extent_ref(trans, root, 4601 new_extents[0].disk_bytenr, 4602 new_extents[0].disk_num_bytes, 4603 leaf->start, 4604 root->root_key.objectid, 4605 trans->transid, 4606 key.objectid); 4607 BUG_ON(ret); 4608 4609 ret = btrfs_free_extent(trans, root, 4610 extent_key->objectid, 4611 extent_key->offset, 4612 leaf->start, 4613 btrfs_header_owner(leaf), 4614 btrfs_header_generation(leaf), 4615 key.objectid, 0); 4616 BUG_ON(ret); 4617 4618 btrfs_release_path(root, path); 4619 key.offset += num_bytes; 4620 } else { 4621 BUG_ON(1); 4622 #if 0 4623 u64 alloc_hint; 4624 u64 extent_len; 4625 int i; 4626 /* 4627 * drop old extent pointer at first, then insert the 4628 * new pointers one bye one 4629 */ 4630 btrfs_release_path(root, path); 4631 ret = btrfs_drop_extents(trans, root, inode, key.offset, 4632 key.offset + num_bytes, 4633 key.offset, &alloc_hint); 4634 BUG_ON(ret); 4635 4636 for (i = 0; i < nr_extents; i++) { 4637 if (ext_offset >= new_extents[i].num_bytes) { 4638 ext_offset -= new_extents[i].num_bytes; 4639 continue; 4640 } 4641 extent_len = min(new_extents[i].num_bytes - 4642 ext_offset, num_bytes); 4643 4644 ret = btrfs_insert_empty_item(trans, root, 4645 path, &key, 4646 sizeof(*fi)); 4647 BUG_ON(ret); 4648 4649 leaf = path->nodes[0]; 4650 fi = btrfs_item_ptr(leaf, path->slots[0], 4651 struct btrfs_file_extent_item); 4652 btrfs_set_file_extent_generation(leaf, fi, 4653 trans->transid); 4654 btrfs_set_file_extent_type(leaf, fi, 4655 BTRFS_FILE_EXTENT_REG); 4656 btrfs_set_file_extent_disk_bytenr(leaf, fi, 4657 new_extents[i].disk_bytenr); 4658 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 4659 new_extents[i].disk_num_bytes); 4660 btrfs_set_file_extent_ram_bytes(leaf, fi, 4661 new_extents[i].ram_bytes); 4662 4663 btrfs_set_file_extent_compression(leaf, fi, 4664 new_extents[i].compression); 4665 btrfs_set_file_extent_encryption(leaf, fi, 4666 new_extents[i].encryption); 4667 btrfs_set_file_extent_other_encoding(leaf, fi, 4668 new_extents[i].other_encoding); 4669 4670 btrfs_set_file_extent_num_bytes(leaf, fi, 4671 extent_len); 4672 ext_offset += new_extents[i].offset; 4673 btrfs_set_file_extent_offset(leaf, fi, 4674 ext_offset); 4675 btrfs_mark_buffer_dirty(leaf); 4676 4677 btrfs_drop_extent_cache(inode, key.offset, 
4678 key.offset + extent_len - 1, 0); 4679 4680 ret = btrfs_inc_extent_ref(trans, root, 4681 new_extents[i].disk_bytenr, 4682 new_extents[i].disk_num_bytes, 4683 leaf->start, 4684 root->root_key.objectid, 4685 trans->transid, key.objectid); 4686 BUG_ON(ret); 4687 btrfs_release_path(root, path); 4688 4689 inode_add_bytes(inode, extent_len); 4690 4691 ext_offset = 0; 4692 num_bytes -= extent_len; 4693 key.offset += extent_len; 4694 4695 if (num_bytes == 0) 4696 break; 4697 } 4698 BUG_ON(i >= nr_extents); 4699 #endif 4700 } 4701 4702 if (extent_locked) { 4703 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 4704 lock_end, GFP_NOFS); 4705 extent_locked = 0; 4706 } 4707 skip: 4708 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 4709 key.offset >= search_end) 4710 break; 4711 4712 cond_resched(); 4713 } 4714 ret = 0; 4715 out: 4716 btrfs_release_path(root, path); 4717 if (inode) { 4718 mutex_unlock(&inode->i_mutex); 4719 if (extent_locked) { 4720 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 4721 lock_end, GFP_NOFS); 4722 } 4723 iput(inode); 4724 } 4725 return ret; 4726 } 4727 4728 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 4729 struct btrfs_root *root, 4730 struct extent_buffer *buf, u64 orig_start) 4731 { 4732 int level; 4733 int ret; 4734 4735 BUG_ON(btrfs_header_generation(buf) != trans->transid); 4736 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 4737 4738 level = btrfs_header_level(buf); 4739 if (level == 0) { 4740 struct btrfs_leaf_ref *ref; 4741 struct btrfs_leaf_ref *orig_ref; 4742 4743 orig_ref = btrfs_lookup_leaf_ref(root, orig_start); 4744 if (!orig_ref) 4745 return -ENOENT; 4746 4747 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); 4748 if (!ref) { 4749 btrfs_free_leaf_ref(root, orig_ref); 4750 return -ENOMEM; 4751 } 4752 4753 ref->nritems = orig_ref->nritems; 4754 memcpy(ref->extents, orig_ref->extents, 4755 sizeof(ref->extents[0]) * ref->nritems); 4756 4757 btrfs_free_leaf_ref(root, orig_ref); 4758 4759 ref->root_gen = trans->transid; 4760 ref->bytenr = buf->start; 4761 ref->owner = btrfs_header_owner(buf); 4762 ref->generation = btrfs_header_generation(buf); 4763 4764 ret = btrfs_add_leaf_ref(root, ref, 0); 4765 WARN_ON(ret); 4766 btrfs_free_leaf_ref(root, ref); 4767 } 4768 return 0; 4769 } 4770 4771 static noinline int invalidate_extent_cache(struct btrfs_root *root, 4772 struct extent_buffer *leaf, 4773 struct btrfs_block_group_cache *group, 4774 struct btrfs_root *target_root) 4775 { 4776 struct btrfs_key key; 4777 struct inode *inode = NULL; 4778 struct btrfs_file_extent_item *fi; 4779 u64 num_bytes; 4780 u64 skip_objectid = 0; 4781 u32 nritems; 4782 u32 i; 4783 4784 nritems = btrfs_header_nritems(leaf); 4785 for (i = 0; i < nritems; i++) { 4786 btrfs_item_key_to_cpu(leaf, &key, i); 4787 if (key.objectid == skip_objectid || 4788 key.type != BTRFS_EXTENT_DATA_KEY) 4789 continue; 4790 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 4791 if (btrfs_file_extent_type(leaf, fi) == 4792 BTRFS_FILE_EXTENT_INLINE) 4793 continue; 4794 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 4795 continue; 4796 if (!inode || inode->i_ino != key.objectid) { 4797 iput(inode); 4798 inode = btrfs_ilookup(target_root->fs_info->sb, 4799 key.objectid, target_root, 1); 4800 } 4801 if (!inode) { 4802 skip_objectid = key.objectid; 4803 continue; 4804 } 4805 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 4806 4807 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 4808 key.offset + num_bytes - 1, GFP_NOFS); 4809 
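/* drop the cached extent mapping so the next read looks up the relocated extent on disk */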
btrfs_drop_extent_cache(inode, key.offset, 4810 key.offset + num_bytes - 1, 1); 4811 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 4812 key.offset + num_bytes - 1, GFP_NOFS); 4813 cond_resched(); 4814 } 4815 iput(inode); 4816 return 0; 4817 } 4818 4819 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, 4820 struct btrfs_root *root, 4821 struct extent_buffer *leaf, 4822 struct btrfs_block_group_cache *group, 4823 struct inode *reloc_inode) 4824 { 4825 struct btrfs_key key; 4826 struct btrfs_key extent_key; 4827 struct btrfs_file_extent_item *fi; 4828 struct btrfs_leaf_ref *ref; 4829 struct disk_extent *new_extent; 4830 u64 bytenr; 4831 u64 num_bytes; 4832 u32 nritems; 4833 u32 i; 4834 int ext_index; 4835 int nr_extent; 4836 int ret; 4837 4838 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 4839 BUG_ON(!new_extent); 4840 4841 ref = btrfs_lookup_leaf_ref(root, leaf->start); 4842 BUG_ON(!ref); 4843 4844 ext_index = -1; 4845 nritems = btrfs_header_nritems(leaf); 4846 for (i = 0; i < nritems; i++) { 4847 btrfs_item_key_to_cpu(leaf, &key, i); 4848 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 4849 continue; 4850 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 4851 if (btrfs_file_extent_type(leaf, fi) == 4852 BTRFS_FILE_EXTENT_INLINE) 4853 continue; 4854 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 4855 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 4856 if (bytenr == 0) 4857 continue; 4858 4859 ext_index++; 4860 if (bytenr >= group->key.objectid + group->key.offset || 4861 bytenr + num_bytes <= group->key.objectid) 4862 continue; 4863 4864 extent_key.objectid = bytenr; 4865 extent_key.offset = num_bytes; 4866 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4867 nr_extent = 1; 4868 ret = get_new_locations(reloc_inode, &extent_key, 4869 group->key.objectid, 1, 4870 &new_extent, &nr_extent); 4871 if (ret > 0) 4872 continue; 4873 BUG_ON(ret < 0); 4874 4875 BUG_ON(ref->extents[ext_index].bytenr != bytenr); 4876 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); 4877 ref->extents[ext_index].bytenr = new_extent->disk_bytenr; 4878 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; 4879 4880 btrfs_set_file_extent_disk_bytenr(leaf, fi, 4881 new_extent->disk_bytenr); 4882 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 4883 new_extent->disk_num_bytes); 4884 btrfs_mark_buffer_dirty(leaf); 4885 4886 ret = btrfs_inc_extent_ref(trans, root, 4887 new_extent->disk_bytenr, 4888 new_extent->disk_num_bytes, 4889 leaf->start, 4890 root->root_key.objectid, 4891 trans->transid, key.objectid); 4892 BUG_ON(ret); 4893 4894 ret = btrfs_free_extent(trans, root, 4895 bytenr, num_bytes, leaf->start, 4896 btrfs_header_owner(leaf), 4897 btrfs_header_generation(leaf), 4898 key.objectid, 0); 4899 BUG_ON(ret); 4900 cond_resched(); 4901 } 4902 kfree(new_extent); 4903 BUG_ON(ext_index + 1 != ref->nritems); 4904 btrfs_free_leaf_ref(root, ref); 4905 return 0; 4906 } 4907 4908 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 4909 struct btrfs_root *root) 4910 { 4911 struct btrfs_root *reloc_root; 4912 int ret; 4913 4914 if (root->reloc_root) { 4915 reloc_root = root->reloc_root; 4916 root->reloc_root = NULL; 4917 list_add(&reloc_root->dead_list, 4918 &root->fs_info->dead_reloc_roots); 4919 4920 btrfs_set_root_bytenr(&reloc_root->root_item, 4921 reloc_root->node->start); 4922 btrfs_set_root_level(&root->root_item, 4923 btrfs_header_level(reloc_root->node)); 4924 memset(&reloc_root->root_item.drop_progress, 0, 4925 sizeof(struct btrfs_disk_key)); 
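/* clear the drop cursor so deletion of the reloc root starts from the beginning */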
4926 reloc_root->root_item.drop_level = 0; 4927 4928 ret = btrfs_update_root(trans, root->fs_info->tree_root, 4929 &reloc_root->root_key, 4930 &reloc_root->root_item); 4931 BUG_ON(ret); 4932 } 4933 return 0; 4934 } 4935 4936 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) 4937 { 4938 struct btrfs_trans_handle *trans; 4939 struct btrfs_root *reloc_root; 4940 struct btrfs_root *prev_root = NULL; 4941 struct list_head dead_roots; 4942 int ret; 4943 unsigned long nr; 4944 4945 INIT_LIST_HEAD(&dead_roots); 4946 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); 4947 4948 while (!list_empty(&dead_roots)) { 4949 reloc_root = list_entry(dead_roots.prev, 4950 struct btrfs_root, dead_list); 4951 list_del_init(&reloc_root->dead_list); 4952 4953 BUG_ON(reloc_root->commit_root != NULL); 4954 while (1) { 4955 trans = btrfs_join_transaction(root, 1); 4956 BUG_ON(!trans); 4957 4958 mutex_lock(&root->fs_info->drop_mutex); 4959 ret = btrfs_drop_snapshot(trans, reloc_root); 4960 if (ret != -EAGAIN) 4961 break; 4962 mutex_unlock(&root->fs_info->drop_mutex); 4963 4964 nr = trans->blocks_used; 4965 ret = btrfs_end_transaction(trans, root); 4966 BUG_ON(ret); 4967 btrfs_btree_balance_dirty(root, nr); 4968 } 4969 4970 free_extent_buffer(reloc_root->node); 4971 4972 ret = btrfs_del_root(trans, root->fs_info->tree_root, 4973 &reloc_root->root_key); 4974 BUG_ON(ret); 4975 mutex_unlock(&root->fs_info->drop_mutex); 4976 4977 nr = trans->blocks_used; 4978 ret = btrfs_end_transaction(trans, root); 4979 BUG_ON(ret); 4980 btrfs_btree_balance_dirty(root, nr); 4981 4982 kfree(prev_root); 4983 prev_root = reloc_root; 4984 } 4985 if (prev_root) { 4986 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); 4987 kfree(prev_root); 4988 } 4989 return 0; 4990 } 4991 4992 int btrfs_add_dead_reloc_root(struct btrfs_root *root) 4993 { 4994 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); 4995 return 0; 4996 } 4997 4998 int btrfs_cleanup_reloc_trees(struct btrfs_root *root) 4999 { 5000 struct btrfs_root *reloc_root; 5001 struct btrfs_trans_handle *trans; 5002 struct btrfs_key location; 5003 int found; 5004 int ret; 5005 5006 mutex_lock(&root->fs_info->tree_reloc_mutex); 5007 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); 5008 BUG_ON(ret); 5009 found = !list_empty(&root->fs_info->dead_reloc_roots); 5010 mutex_unlock(&root->fs_info->tree_reloc_mutex); 5011 5012 if (found) { 5013 trans = btrfs_start_transaction(root, 1); 5014 BUG_ON(!trans); 5015 ret = btrfs_commit_transaction(trans, root); 5016 BUG_ON(ret); 5017 } 5018 5019 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 5020 location.offset = (u64)-1; 5021 location.type = BTRFS_ROOT_ITEM_KEY; 5022 5023 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 5024 BUG_ON(!reloc_root); 5025 btrfs_orphan_cleanup(reloc_root); 5026 return 0; 5027 } 5028 5029 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, 5030 struct btrfs_root *root) 5031 { 5032 struct btrfs_root *reloc_root; 5033 struct extent_buffer *eb; 5034 struct btrfs_root_item *root_item; 5035 struct btrfs_key root_key; 5036 int ret; 5037 5038 BUG_ON(!root->ref_cows); 5039 if (root->reloc_root) 5040 return 0; 5041 5042 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 5043 BUG_ON(!root_item); 5044 5045 ret = btrfs_copy_root(trans, root, root->commit_root, 5046 &eb, BTRFS_TREE_RELOC_OBJECTID); 5047 BUG_ON(ret); 5048 5049 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 5050 root_key.offset = root->root_key.objectid; 5051 root_key.type = 
BTRFS_ROOT_ITEM_KEY; 5052 5053 memcpy(root_item, &root->root_item, sizeof(*root_item)); 5054 btrfs_set_root_refs(root_item, 0); 5055 btrfs_set_root_bytenr(root_item, eb->start); 5056 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 5057 btrfs_set_root_generation(root_item, trans->transid); 5058 5059 btrfs_tree_unlock(eb); 5060 free_extent_buffer(eb); 5061 5062 ret = btrfs_insert_root(trans, root->fs_info->tree_root, 5063 &root_key, root_item); 5064 BUG_ON(ret); 5065 kfree(root_item); 5066 5067 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 5068 &root_key); 5069 BUG_ON(!reloc_root); 5070 reloc_root->last_trans = trans->transid; 5071 reloc_root->commit_root = NULL; 5072 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 5073 5074 root->reloc_root = reloc_root; 5075 return 0; 5076 } 5077 5078 /* 5079 * Core function of space balance. 5080 * 5081 * The idea is to use reloc trees to relocate tree blocks in reference 5082 * counted roots. There is one reloc tree for each subvol, and all 5083 * reloc trees share the same root key objectid. Reloc trees are snapshots 5084 * of the latest committed roots of subvols (root->commit_root). 5085 * 5086 * To relocate a tree block referenced by a subvol, there are two steps. 5087 * COW the block through the subvol's reloc tree, then update the block pointer 5088 * in the subvol to point to the new block. Since all reloc trees share 5089 * the same root key objectid, doing special handling for tree blocks owned 5090 * by them is easy. Once a tree block has been COWed in one reloc tree, 5091 * we can use the resulting new block directly when the same block is 5092 * required to COW again through other reloc trees. In this way, relocated 5093 * tree blocks are shared between reloc trees, so they are also shared 5094 * between subvols.
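 *
 * relocate_one_path() below implements this: it COWs the path leading to the
 * relocated block in the reloc tree, records the new block addresses and keys,
 * and then uses btrfs_merge_path() to make the fs tree point at the new blocks.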
5095 */ 5096 static noinline int relocate_one_path(struct btrfs_trans_handle *trans, 5097 struct btrfs_root *root, 5098 struct btrfs_path *path, 5099 struct btrfs_key *first_key, 5100 struct btrfs_ref_path *ref_path, 5101 struct btrfs_block_group_cache *group, 5102 struct inode *reloc_inode) 5103 { 5104 struct btrfs_root *reloc_root; 5105 struct extent_buffer *eb = NULL; 5106 struct btrfs_key *keys; 5107 u64 *nodes; 5108 int level; 5109 int shared_level; 5110 int lowest_level = 0; 5111 int ret; 5112 5113 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 5114 lowest_level = ref_path->owner_objectid; 5115 5116 if (!root->ref_cows) { 5117 path->lowest_level = lowest_level; 5118 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); 5119 BUG_ON(ret < 0); 5120 path->lowest_level = 0; 5121 btrfs_release_path(root, path); 5122 return 0; 5123 } 5124 5125 mutex_lock(&root->fs_info->tree_reloc_mutex); 5126 ret = init_reloc_tree(trans, root); 5127 BUG_ON(ret); 5128 reloc_root = root->reloc_root; 5129 5130 shared_level = ref_path->shared_level; 5131 ref_path->shared_level = BTRFS_MAX_LEVEL - 1; 5132 5133 keys = ref_path->node_keys; 5134 nodes = ref_path->new_nodes; 5135 memset(&keys[shared_level + 1], 0, 5136 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); 5137 memset(&nodes[shared_level + 1], 0, 5138 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); 5139 5140 if (nodes[lowest_level] == 0) { 5141 path->lowest_level = lowest_level; 5142 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 5143 0, 1); 5144 BUG_ON(ret); 5145 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { 5146 eb = path->nodes[level]; 5147 if (!eb || eb == reloc_root->node) 5148 break; 5149 nodes[level] = eb->start; 5150 if (level == 0) 5151 btrfs_item_key_to_cpu(eb, &keys[level], 0); 5152 else 5153 btrfs_node_key_to_cpu(eb, &keys[level], 0); 5154 } 5155 if (nodes[0] && 5156 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5157 eb = path->nodes[0]; 5158 ret = replace_extents_in_leaf(trans, reloc_root, eb, 5159 group, reloc_inode); 5160 BUG_ON(ret); 5161 } 5162 btrfs_release_path(reloc_root, path); 5163 } else { 5164 ret = btrfs_merge_path(trans, reloc_root, keys, nodes, 5165 lowest_level); 5166 BUG_ON(ret); 5167 } 5168 5169 /* 5170 * replace tree blocks in the fs tree with tree blocks in 5171 * the reloc tree. 
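 * btrfs_merge_path updates the block pointers in the fs tree so they reference
 * the blocks that were COWed through the reloc tree.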
5172 */ 5173 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); 5174 BUG_ON(ret < 0); 5175 5176 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5177 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 5178 0, 0); 5179 BUG_ON(ret); 5180 extent_buffer_get(path->nodes[0]); 5181 eb = path->nodes[0]; 5182 btrfs_release_path(reloc_root, path); 5183 ret = invalidate_extent_cache(reloc_root, eb, group, root); 5184 BUG_ON(ret); 5185 free_extent_buffer(eb); 5186 } 5187 5188 mutex_unlock(&root->fs_info->tree_reloc_mutex); 5189 path->lowest_level = 0; 5190 return 0; 5191 } 5192 5193 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, 5194 struct btrfs_root *root, 5195 struct btrfs_path *path, 5196 struct btrfs_key *first_key, 5197 struct btrfs_ref_path *ref_path) 5198 { 5199 int ret; 5200 5201 ret = relocate_one_path(trans, root, path, first_key, 5202 ref_path, NULL, NULL); 5203 BUG_ON(ret); 5204 5205 return 0; 5206 } 5207 5208 static noinline int del_extent_zero(struct btrfs_trans_handle *trans, 5209 struct btrfs_root *extent_root, 5210 struct btrfs_path *path, 5211 struct btrfs_key *extent_key) 5212 { 5213 int ret; 5214 5215 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); 5216 if (ret) 5217 goto out; 5218 ret = btrfs_del_item(trans, extent_root, path); 5219 out: 5220 btrfs_release_path(extent_root, path); 5221 return ret; 5222 } 5223 5224 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, 5225 struct btrfs_ref_path *ref_path) 5226 { 5227 struct btrfs_key root_key; 5228 5229 root_key.objectid = ref_path->root_objectid; 5230 root_key.type = BTRFS_ROOT_ITEM_KEY; 5231 if (is_cowonly_root(ref_path->root_objectid)) 5232 root_key.offset = 0; 5233 else 5234 root_key.offset = (u64)-1; 5235 5236 return btrfs_read_fs_root_no_name(fs_info, &root_key); 5237 } 5238 5239 static noinline int relocate_one_extent(struct btrfs_root *extent_root, 5240 struct btrfs_path *path, 5241 struct btrfs_key *extent_key, 5242 struct btrfs_block_group_cache *group, 5243 struct inode *reloc_inode, int pass) 5244 { 5245 struct btrfs_trans_handle *trans; 5246 struct btrfs_root *found_root; 5247 struct btrfs_ref_path *ref_path = NULL; 5248 struct disk_extent *new_extents = NULL; 5249 int nr_extents = 0; 5250 int loops; 5251 int ret; 5252 int level; 5253 struct btrfs_key first_key; 5254 u64 prev_block = 0; 5255 5256 5257 trans = btrfs_start_transaction(extent_root, 1); 5258 BUG_ON(!trans); 5259 5260 if (extent_key->objectid == 0) { 5261 ret = del_extent_zero(trans, extent_root, path, extent_key); 5262 goto out; 5263 } 5264 5265 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); 5266 if (!ref_path) { 5267 ret = -ENOMEM; 5268 goto out; 5269 } 5270 5271 for (loops = 0; ; loops++) { 5272 if (loops == 0) { 5273 ret = btrfs_first_ref_path(trans, extent_root, ref_path, 5274 extent_key->objectid); 5275 } else { 5276 ret = btrfs_next_ref_path(trans, extent_root, ref_path); 5277 } 5278 if (ret < 0) 5279 goto out; 5280 if (ret > 0) 5281 break; 5282 5283 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || 5284 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) 5285 continue; 5286 5287 found_root = read_ref_root(extent_root->fs_info, ref_path); 5288 BUG_ON(!found_root); 5289 /* 5290 * for reference counted tree, only process reference paths 5291 * rooted at the latest committed root. 
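 * paths recorded against any other root generation are stale and are skipped.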
5292 */ 5293 if (found_root->ref_cows && 5294 ref_path->root_generation != found_root->root_key.offset) 5295 continue; 5296 5297 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5298 if (pass == 0) { 5299 /* 5300 * copy data extents to new locations 5301 */ 5302 u64 group_start = group->key.objectid; 5303 ret = relocate_data_extent(reloc_inode, 5304 extent_key, 5305 group_start); 5306 if (ret < 0) 5307 goto out; 5308 break; 5309 } 5310 level = 0; 5311 } else { 5312 level = ref_path->owner_objectid; 5313 } 5314 5315 if (prev_block != ref_path->nodes[level]) { 5316 struct extent_buffer *eb; 5317 u64 block_start = ref_path->nodes[level]; 5318 u64 block_size = btrfs_level_size(found_root, level); 5319 5320 eb = read_tree_block(found_root, block_start, 5321 block_size, 0); 5322 btrfs_tree_lock(eb); 5323 BUG_ON(level != btrfs_header_level(eb)); 5324 5325 if (level == 0) 5326 btrfs_item_key_to_cpu(eb, &first_key, 0); 5327 else 5328 btrfs_node_key_to_cpu(eb, &first_key, 0); 5329 5330 btrfs_tree_unlock(eb); 5331 free_extent_buffer(eb); 5332 prev_block = block_start; 5333 } 5334 5335 mutex_lock(&extent_root->fs_info->trans_mutex); 5336 btrfs_record_root_in_trans(found_root); 5337 mutex_unlock(&extent_root->fs_info->trans_mutex); 5338 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5339 /* 5340 * try to update data extent references while 5341 * keeping metadata shared between snapshots. 5342 */ 5343 if (pass == 1) { 5344 ret = relocate_one_path(trans, found_root, 5345 path, &first_key, ref_path, 5346 group, reloc_inode); 5347 if (ret < 0) 5348 goto out; 5349 continue; 5350 } 5351 /* 5352 * use fallback method to process the remaining 5353 * references. 5354 */ 5355 if (!new_extents) { 5356 u64 group_start = group->key.objectid; 5357 new_extents = kmalloc(sizeof(*new_extents), 5358 GFP_NOFS); 5359 nr_extents = 1; 5360 ret = get_new_locations(reloc_inode, 5361 extent_key, 5362 group_start, 1, 5363 &new_extents, 5364 &nr_extents); 5365 if (ret) 5366 goto out; 5367 } 5368 ret = replace_one_extent(trans, found_root, 5369 path, extent_key, 5370 &first_key, ref_path, 5371 new_extents, nr_extents); 5372 } else { 5373 ret = relocate_tree_block(trans, found_root, path, 5374 &first_key, ref_path); 5375 } 5376 if (ret < 0) 5377 goto out; 5378 } 5379 ret = 0; 5380 out: 5381 btrfs_end_transaction(trans, extent_root); 5382 kfree(new_extents); 5383 kfree(ref_path); 5384 return ret; 5385 } 5386 5387 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 5388 { 5389 u64 num_devices; 5390 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 5391 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 5392 5393 num_devices = root->fs_info->fs_devices->rw_devices; 5394 if (num_devices == 1) { 5395 stripped |= BTRFS_BLOCK_GROUP_DUP; 5396 stripped = flags & ~stripped; 5397 5398 /* turn raid0 into single device chunks */ 5399 if (flags & BTRFS_BLOCK_GROUP_RAID0) 5400 return stripped; 5401 5402 /* turn mirroring into duplication */ 5403 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 5404 BTRFS_BLOCK_GROUP_RAID10)) 5405 return stripped | BTRFS_BLOCK_GROUP_DUP; 5406 return flags; 5407 } else { 5408 /* they already had raid on here, just return */ 5409 if (flags & stripped) 5410 return flags; 5411 5412 stripped |= BTRFS_BLOCK_GROUP_DUP; 5413 stripped = flags & ~stripped; 5414 5415 /* switch duplicated blocks with raid1 */ 5416 if (flags & BTRFS_BLOCK_GROUP_DUP) 5417 return stripped | BTRFS_BLOCK_GROUP_RAID1; 5418 5419 /* turn single device chunks into raid0 */ 5420 return stripped | BTRFS_BLOCK_GROUP_RAID0; 
5421 } 5422 return flags; 5423 } 5424 5425 static int __alloc_chunk_for_shrink(struct btrfs_root *root, 5426 struct btrfs_block_group_cache *shrink_block_group, 5427 int force) 5428 { 5429 struct btrfs_trans_handle *trans; 5430 u64 new_alloc_flags; 5431 u64 calc; 5432 5433 spin_lock(&shrink_block_group->lock); 5434 if (btrfs_block_group_used(&shrink_block_group->item) > 0) { 5435 spin_unlock(&shrink_block_group->lock); 5436 5437 trans = btrfs_start_transaction(root, 1); 5438 spin_lock(&shrink_block_group->lock); 5439 5440 new_alloc_flags = update_block_group_flags(root, 5441 shrink_block_group->flags); 5442 if (new_alloc_flags != shrink_block_group->flags) { 5443 calc = 5444 btrfs_block_group_used(&shrink_block_group->item); 5445 } else { 5446 calc = shrink_block_group->key.offset; 5447 } 5448 spin_unlock(&shrink_block_group->lock); 5449 5450 do_chunk_alloc(trans, root->fs_info->extent_root, 5451 calc + 2 * 1024 * 1024, new_alloc_flags, force); 5452 5453 btrfs_end_transaction(trans, root); 5454 } else 5455 spin_unlock(&shrink_block_group->lock); 5456 return 0; 5457 } 5458 5459 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 5460 struct btrfs_root *root, 5461 u64 objectid, u64 size) 5462 { 5463 struct btrfs_path *path; 5464 struct btrfs_inode_item *item; 5465 struct extent_buffer *leaf; 5466 int ret; 5467 5468 path = btrfs_alloc_path(); 5469 if (!path) 5470 return -ENOMEM; 5471 5472 path->leave_spinning = 1; 5473 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 5474 if (ret) 5475 goto out; 5476 5477 leaf = path->nodes[0]; 5478 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 5479 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 5480 btrfs_set_inode_generation(leaf, item, 1); 5481 btrfs_set_inode_size(leaf, item, size); 5482 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 5483 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 5484 btrfs_mark_buffer_dirty(leaf); 5485 btrfs_release_path(root, path); 5486 out: 5487 btrfs_free_path(path); 5488 return ret; 5489 } 5490 5491 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 5492 struct btrfs_block_group_cache *group) 5493 { 5494 struct inode *inode = NULL; 5495 struct btrfs_trans_handle *trans; 5496 struct btrfs_root *root; 5497 struct btrfs_key root_key; 5498 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 5499 int err = 0; 5500 5501 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 5502 root_key.type = BTRFS_ROOT_ITEM_KEY; 5503 root_key.offset = (u64)-1; 5504 root = btrfs_read_fs_root_no_name(fs_info, &root_key); 5505 if (IS_ERR(root)) 5506 return ERR_CAST(root); 5507 5508 trans = btrfs_start_transaction(root, 1); 5509 BUG_ON(!trans); 5510 5511 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 5512 if (err) 5513 goto out; 5514 5515 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 5516 BUG_ON(err); 5517 5518 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, 5519 group->key.offset, 0, group->key.offset, 5520 0, 0, 0); 5521 BUG_ON(err); 5522 5523 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); 5524 if (inode->i_state & I_NEW) { 5525 BTRFS_I(inode)->root = root; 5526 BTRFS_I(inode)->location.objectid = objectid; 5527 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 5528 BTRFS_I(inode)->location.offset = 0; 5529 btrfs_read_locked_inode(inode); 5530 unlock_new_inode(inode); 5531 BUG_ON(is_bad_inode(inode)); 5532 } else { 5533 BUG_ON(1); 5534 } 5535 BTRFS_I(inode)->index_cnt = 
group->key.objectid; 5536 5537 err = btrfs_orphan_add(trans, inode); 5538 out: 5539 btrfs_end_transaction(trans, root); 5540 if (err) { 5541 if (inode) 5542 iput(inode); 5543 inode = ERR_PTR(err); 5544 } 5545 return inode; 5546 } 5547 5548 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 5549 { 5550 5551 struct btrfs_ordered_sum *sums; 5552 struct btrfs_sector_sum *sector_sum; 5553 struct btrfs_ordered_extent *ordered; 5554 struct btrfs_root *root = BTRFS_I(inode)->root; 5555 struct list_head list; 5556 size_t offset; 5557 int ret; 5558 u64 disk_bytenr; 5559 5560 INIT_LIST_HEAD(&list); 5561 5562 ordered = btrfs_lookup_ordered_extent(inode, file_pos); 5563 BUG_ON(ordered->file_offset != file_pos || ordered->len != len); 5564 5565 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; 5566 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, 5567 disk_bytenr + len - 1, &list); 5568 5569 while (!list_empty(&list)) { 5570 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 5571 list_del_init(&sums->list); 5572 5573 sector_sum = sums->sums; 5574 sums->bytenr = ordered->start; 5575 5576 offset = 0; 5577 while (offset < sums->len) { 5578 sector_sum->bytenr += ordered->start - disk_bytenr; 5579 sector_sum++; 5580 offset += root->sectorsize; 5581 } 5582 5583 btrfs_add_ordered_sum(inode, ordered, sums); 5584 } 5585 btrfs_put_ordered_extent(ordered); 5586 return 0; 5587 } 5588 5589 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) 5590 { 5591 struct btrfs_trans_handle *trans; 5592 struct btrfs_path *path; 5593 struct btrfs_fs_info *info = root->fs_info; 5594 struct extent_buffer *leaf; 5595 struct inode *reloc_inode; 5596 struct btrfs_block_group_cache *block_group; 5597 struct btrfs_key key; 5598 u64 skipped; 5599 u64 cur_byte; 5600 u64 total_found; 5601 u32 nritems; 5602 int ret; 5603 int progress; 5604 int pass = 0; 5605 5606 root = root->fs_info->extent_root; 5607 5608 block_group = btrfs_lookup_block_group(info, group_start); 5609 BUG_ON(!block_group); 5610 5611 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", 5612 (unsigned long long)block_group->key.objectid, 5613 (unsigned long long)block_group->flags); 5614 5615 path = btrfs_alloc_path(); 5616 BUG_ON(!path); 5617 5618 reloc_inode = create_reloc_inode(info, block_group); 5619 BUG_ON(IS_ERR(reloc_inode)); 5620 5621 __alloc_chunk_for_shrink(root, block_group, 1); 5622 set_block_group_readonly(block_group); 5623 5624 btrfs_start_delalloc_inodes(info->tree_root); 5625 btrfs_wait_ordered_extents(info->tree_root, 0); 5626 again: 5627 skipped = 0; 5628 total_found = 0; 5629 progress = 0; 5630 key.objectid = block_group->key.objectid; 5631 key.offset = 0; 5632 key.type = 0; 5633 cur_byte = key.objectid; 5634 5635 trans = btrfs_start_transaction(info->tree_root, 1); 5636 btrfs_commit_transaction(trans, info->tree_root); 5637 5638 mutex_lock(&root->fs_info->cleaner_mutex); 5639 btrfs_clean_old_snapshots(info->tree_root); 5640 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 5641 mutex_unlock(&root->fs_info->cleaner_mutex); 5642 5643 trans = btrfs_start_transaction(info->tree_root, 1); 5644 btrfs_commit_transaction(trans, info->tree_root); 5645 5646 while (1) { 5647 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5648 if (ret < 0) 5649 goto out; 5650 next: 5651 leaf = path->nodes[0]; 5652 nritems = btrfs_header_nritems(leaf); 5653 if (path->slots[0] >= nritems) { 5654 ret = btrfs_next_leaf(root, path); 5655 if (ret < 0) 5656 goto out; 5657 if (ret == 1) { 
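/* no more leaves in the extent tree, the scan of this block group is complete */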
5658 ret = 0; 5659 break; 5660 } 5661 leaf = path->nodes[0]; 5662 nritems = btrfs_header_nritems(leaf); 5663 } 5664 5665 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5666 5667 if (key.objectid >= block_group->key.objectid + 5668 block_group->key.offset) 5669 break; 5670 5671 if (progress && need_resched()) { 5672 btrfs_release_path(root, path); 5673 cond_resched(); 5674 progress = 0; 5675 continue; 5676 } 5677 progress = 1; 5678 5679 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || 5680 key.objectid + key.offset <= cur_byte) { 5681 path->slots[0]++; 5682 goto next; 5683 } 5684 5685 total_found++; 5686 cur_byte = key.objectid + key.offset; 5687 btrfs_release_path(root, path); 5688 5689 __alloc_chunk_for_shrink(root, block_group, 0); 5690 ret = relocate_one_extent(root, path, &key, block_group, 5691 reloc_inode, pass); 5692 BUG_ON(ret < 0); 5693 if (ret > 0) 5694 skipped++; 5695 5696 key.objectid = cur_byte; 5697 key.type = 0; 5698 key.offset = 0; 5699 } 5700 5701 btrfs_release_path(root, path); 5702 5703 if (pass == 0) { 5704 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); 5705 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); 5706 } 5707 5708 if (total_found > 0) { 5709 printk(KERN_INFO "btrfs found %llu extents in pass %d\n", 5710 (unsigned long long)total_found, pass); 5711 pass++; 5712 if (total_found == skipped && pass > 2) { 5713 iput(reloc_inode); 5714 reloc_inode = create_reloc_inode(info, block_group); 5715 pass = 0; 5716 } 5717 goto again; 5718 } 5719 5720 /* delete reloc_inode */ 5721 iput(reloc_inode); 5722 5723 /* unpin extents in this range */ 5724 trans = btrfs_start_transaction(info->tree_root, 1); 5725 btrfs_commit_transaction(trans, info->tree_root); 5726 5727 spin_lock(&block_group->lock); 5728 WARN_ON(block_group->pinned > 0); 5729 WARN_ON(block_group->reserved > 0); 5730 WARN_ON(btrfs_block_group_used(&block_group->item) > 0); 5731 spin_unlock(&block_group->lock); 5732 put_block_group(block_group); 5733 ret = 0; 5734 out: 5735 btrfs_free_path(path); 5736 return ret; 5737 } 5738 5739 static int find_first_block_group(struct btrfs_root *root, 5740 struct btrfs_path *path, struct btrfs_key *key) 5741 { 5742 int ret = 0; 5743 struct btrfs_key found_key; 5744 struct extent_buffer *leaf; 5745 int slot; 5746 5747 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 5748 if (ret < 0) 5749 goto out; 5750 5751 while (1) { 5752 slot = path->slots[0]; 5753 leaf = path->nodes[0]; 5754 if (slot >= btrfs_header_nritems(leaf)) { 5755 ret = btrfs_next_leaf(root, path); 5756 if (ret == 0) 5757 continue; 5758 if (ret < 0) 5759 goto out; 5760 break; 5761 } 5762 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5763 5764 if (found_key.objectid >= key->objectid && 5765 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 5766 ret = 0; 5767 goto out; 5768 } 5769 path->slots[0]++; 5770 } 5771 ret = -ENOENT; 5772 out: 5773 return ret; 5774 } 5775 5776 int btrfs_free_block_groups(struct btrfs_fs_info *info) 5777 { 5778 struct btrfs_block_group_cache *block_group; 5779 struct btrfs_space_info *space_info; 5780 struct rb_node *n; 5781 5782 spin_lock(&info->block_group_cache_lock); 5783 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 5784 block_group = rb_entry(n, struct btrfs_block_group_cache, 5785 cache_node); 5786 rb_erase(&block_group->cache_node, 5787 &info->block_group_cache_tree); 5788 spin_unlock(&info->block_group_cache_lock); 5789 5790 btrfs_remove_free_space_cache(block_group); 5791 down_write(&block_group->space_info->groups_sem); 5792 
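/* groups_sem protects the space_info's list of block groups */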
list_del(&block_group->list); 5793 up_write(&block_group->space_info->groups_sem); 5794 5795 WARN_ON(atomic_read(&block_group->count) != 1); 5796 kfree(block_group); 5797 5798 spin_lock(&info->block_group_cache_lock); 5799 } 5800 spin_unlock(&info->block_group_cache_lock); 5801 5802 /* now that all the block groups are freed, go through and 5803 * free all the space_info structs. This is only called during 5804 * the final stages of unmount, and so we know nobody is 5805 * using them. We call synchronize_rcu() once before we start, 5806 * just to be on the safe side. 5807 */ 5808 synchronize_rcu(); 5809 5810 while(!list_empty(&info->space_info)) { 5811 space_info = list_entry(info->space_info.next, 5812 struct btrfs_space_info, 5813 list); 5814 5815 list_del(&space_info->list); 5816 kfree(space_info); 5817 } 5818 return 0; 5819 } 5820 5821 int btrfs_read_block_groups(struct btrfs_root *root) 5822 { 5823 struct btrfs_path *path; 5824 int ret; 5825 struct btrfs_block_group_cache *cache; 5826 struct btrfs_fs_info *info = root->fs_info; 5827 struct btrfs_space_info *space_info; 5828 struct btrfs_key key; 5829 struct btrfs_key found_key; 5830 struct extent_buffer *leaf; 5831 5832 root = info->extent_root; 5833 key.objectid = 0; 5834 key.offset = 0; 5835 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 5836 path = btrfs_alloc_path(); 5837 if (!path) 5838 return -ENOMEM; 5839 5840 while (1) { 5841 ret = find_first_block_group(root, path, &key); 5842 if (ret > 0) { 5843 ret = 0; 5844 goto error; 5845 } 5846 if (ret != 0) 5847 goto error; 5848 5849 leaf = path->nodes[0]; 5850 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5851 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5852 if (!cache) { 5853 ret = -ENOMEM; 5854 break; 5855 } 5856 5857 atomic_set(&cache->count, 1); 5858 spin_lock_init(&cache->lock); 5859 mutex_init(&cache->alloc_mutex); 5860 mutex_init(&cache->cache_mutex); 5861 INIT_LIST_HEAD(&cache->list); 5862 read_extent_buffer(leaf, &cache->item, 5863 btrfs_item_ptr_offset(leaf, path->slots[0]), 5864 sizeof(cache->item)); 5865 memcpy(&cache->key, &found_key, sizeof(found_key)); 5866 5867 key.objectid = found_key.objectid + found_key.offset; 5868 btrfs_release_path(root, path); 5869 cache->flags = btrfs_block_group_flags(&cache->item); 5870 5871 ret = update_space_info(info, cache->flags, found_key.offset, 5872 btrfs_block_group_used(&cache->item), 5873 &space_info); 5874 BUG_ON(ret); 5875 cache->space_info = space_info; 5876 down_write(&space_info->groups_sem); 5877 list_add_tail(&cache->list, &space_info->block_groups); 5878 up_write(&space_info->groups_sem); 5879 5880 ret = btrfs_add_block_group_cache(root->fs_info, cache); 5881 BUG_ON(ret); 5882 5883 set_avail_alloc_bits(root->fs_info, cache->flags); 5884 if (btrfs_chunk_readonly(root, cache->key.objectid)) 5885 set_block_group_readonly(cache); 5886 } 5887 ret = 0; 5888 error: 5889 btrfs_free_path(path); 5890 return ret; 5891 } 5892 5893 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 5894 struct btrfs_root *root, u64 bytes_used, 5895 u64 type, u64 chunk_objectid, u64 chunk_offset, 5896 u64 size) 5897 { 5898 int ret; 5899 struct btrfs_root *extent_root; 5900 struct btrfs_block_group_cache *cache; 5901 5902 extent_root = root->fs_info->extent_root; 5903 5904 root->fs_info->last_trans_log_full_commit = trans->transid; 5905 5906 cache = kzalloc(sizeof(*cache), GFP_NOFS); 5907 if (!cache) 5908 return -ENOMEM; 5909 5910 cache->key.objectid = chunk_offset; 5911 cache->key.offset = size; 5912 cache->key.type = 
BTRFS_BLOCK_GROUP_ITEM_KEY; 5913 atomic_set(&cache->count, 1); 5914 spin_lock_init(&cache->lock); 5915 mutex_init(&cache->alloc_mutex); 5916 mutex_init(&cache->cache_mutex); 5917 INIT_LIST_HEAD(&cache->list); 5918 5919 btrfs_set_block_group_used(&cache->item, bytes_used); 5920 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 5921 cache->flags = type; 5922 btrfs_set_block_group_flags(&cache->item, type); 5923 5924 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 5925 &cache->space_info); 5926 BUG_ON(ret); 5927 down_write(&cache->space_info->groups_sem); 5928 list_add_tail(&cache->list, &cache->space_info->block_groups); 5929 up_write(&cache->space_info->groups_sem); 5930 5931 ret = btrfs_add_block_group_cache(root->fs_info, cache); 5932 BUG_ON(ret); 5933 5934 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 5935 sizeof(cache->item)); 5936 BUG_ON(ret); 5937 5938 set_avail_alloc_bits(extent_root->fs_info, type); 5939 5940 return 0; 5941 } 5942 5943 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 5944 struct btrfs_root *root, u64 group_start) 5945 { 5946 struct btrfs_path *path; 5947 struct btrfs_block_group_cache *block_group; 5948 struct btrfs_key key; 5949 int ret; 5950 5951 root = root->fs_info->extent_root; 5952 5953 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 5954 BUG_ON(!block_group); 5955 BUG_ON(!block_group->ro); 5956 5957 memcpy(&key, &block_group->key, sizeof(key)); 5958 5959 path = btrfs_alloc_path(); 5960 BUG_ON(!path); 5961 5962 spin_lock(&root->fs_info->block_group_cache_lock); 5963 rb_erase(&block_group->cache_node, 5964 &root->fs_info->block_group_cache_tree); 5965 spin_unlock(&root->fs_info->block_group_cache_lock); 5966 btrfs_remove_free_space_cache(block_group); 5967 down_write(&block_group->space_info->groups_sem); 5968 list_del(&block_group->list); 5969 up_write(&block_group->space_info->groups_sem); 5970 5971 spin_lock(&block_group->space_info->lock); 5972 block_group->space_info->total_bytes -= block_group->key.offset; 5973 block_group->space_info->bytes_readonly -= block_group->key.offset; 5974 spin_unlock(&block_group->space_info->lock); 5975 block_group->space_info->full = 0; 5976 5977 put_block_group(block_group); 5978 put_block_group(block_group); 5979 5980 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 5981 if (ret > 0) 5982 ret = -EIO; 5983 if (ret < 0) 5984 goto out; 5985 5986 ret = btrfs_del_item(trans, root, path); 5987 out: 5988 btrfs_free_path(path); 5989 return ret; 5990 } 5991
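/*
 * Illustrative sketch only, not part of this file: the relocation entry
 * points above are normally driven from the chunk removal path in volumes.c,
 * roughly along these lines (the exact caller and argument names here are
 * assumptions, not copied from this tree):
 *
 *	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 *	...
 *	trans = btrfs_start_transaction(root, 1);
 *	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
 *	btrfs_end_transaction(trans, root);
 */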