/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"

struct btrfs_free_space {
	struct rb_node bytes_index;
	struct rb_node offset_index;
	u64 offset;
	u64 bytes;
};

static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset)
			p = &(*p)->rb_left;
		else if (offset > info->offset)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

static int tree_insert_bytes(struct rb_root *root, u64 bytes,
			     struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, bytes_index);

		if (bytes < info->bytes)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

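/*
 * Each free extent is tracked by one struct btrfs_free_space that is linked
 * into two rbtrees at once: the block group's free_space_offset tree, keyed
 * by where the extent starts, and its free_space_bytes tree, keyed by how
 * large the extent is.  As an illustrative example (the variable names here
 * are only a sketch), caching a free extent covering [4096, 12288) boils
 * down to:
 *
 *	info->offset = 4096;
 *	info->bytes = 8192;
 *	tree_insert_offset(&block_group->free_space_offset, info->offset,
 *			   &info->offset_index);
 *	tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
 *			  &info->bytes_index);
 *
 * which is what link_free_space() below does, so a chunk can be found
 * either by address or by size.
 */
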
/*
 * searches the tree for the given offset.
 *
 * fuzzy == 1: this is used for allocations where we are given a hint of where
 * to look for free space.  Because the hint may not be completely on an offset
 * mark, or the hint may no longer point to free space, we need to fudge our
 * results a bit.  So we look for free space starting at or after offset with
 * at least bytes size.  We prefer to find as close to the given offset as we
 * can.  Also, if the offset is within a free space range, then we will return
 * the free space that contains the given offset, which means we can return a
 * free space chunk with an offset before the provided offset.
 *
 * fuzzy == 0: this is just a normal tree search.  Give us the free space that
 * starts at the given offset and is at least bytes in size, and if it's not
 * there return NULL.
 */
static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
						   u64 offset, u64 bytes,
						   int fuzzy)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_free_space *entry, *ret = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, offset_index);

		if (offset < entry->offset) {
			if (fuzzy &&
			    (!ret || entry->offset < ret->offset) &&
			    (bytes <= entry->bytes))
				ret = entry;
			n = n->rb_left;
		} else if (offset > entry->offset) {
			if (fuzzy &&
			    (entry->offset + entry->bytes - 1) >= offset &&
			    bytes <= entry->bytes) {
				ret = entry;
				break;
			}
			n = n->rb_right;
		} else {
			if (bytes > entry->bytes) {
				n = n->rb_right;
				continue;
			}
			ret = entry;
			break;
		}
	}

	return ret;
}

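/*
 * A worked example of the fuzzy search above: with cached entries [0, 8k)
 * and [16k, 32k), a fuzzy lookup of offset 20k for 4k returns the [16k, 32k)
 * entry even though it starts before the hint, because the hint falls inside
 * that entry.  A fuzzy lookup of offset 9k returns [16k, 32k) as the closest
 * entry at or after the hint, while a non-fuzzy lookup of offset 9k returns
 * NULL because no entry starts exactly there.
 */
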
/*
 * return a chunk at least bytes in size, as close to offset as we can get.
 */
static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
						  u64 offset, u64 bytes)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_free_space *entry, *ret = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, bytes_index);

		if (bytes < entry->bytes) {
			/*
			 * We prefer to get a hole size as close to the size we
			 * are asking for so we don't take small slivers out of
			 * huge holes, but we also want to get as close to the
			 * offset as possible so we don't have a whole lot of
			 * fragmentation.
			 */
			if (offset <= entry->offset) {
				if (!ret)
					ret = entry;
				else if (entry->bytes < ret->bytes)
					ret = entry;
				else if (entry->offset < ret->offset)
					ret = entry;
			}
			n = n->rb_left;
		} else if (bytes > entry->bytes) {
			n = n->rb_right;
		} else {
			/*
			 * Ok, we may have multiple chunks of the wanted size,
			 * so we don't want to take the first one we find; we
			 * want to take the one closest to our given offset, so
			 * keep searching just in case there's a better match.
			 */
			n = n->rb_right;
			if (offset > entry->offset)
				continue;
			else if (!ret || entry->offset < ret->offset)
				ret = entry;
		}
	}

	return ret;
}

static void unlink_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &block_group->free_space_offset);
	rb_erase(&info->bytes_index, &block_group->free_space_bytes);
}

static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	BUG_ON(!info->bytes);
	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
				 &info->offset_index);
	if (ret)
		return ret;

	ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
				&info->bytes_index);
	if (ret)
		return ret;

	return ret;
}

int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
			 u64 offset, u64 bytes)
{
	struct btrfs_free_space *right_info;
	struct btrfs_free_space *left_info;
	struct btrfs_free_space *info = NULL;
	int ret = 0;

	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;

	spin_lock(&block_group->tree_lock);

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding; if there is, remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(&block_group->free_space_offset,
					offset + bytes, 0, 0);
	left_info = tree_search_offset(&block_group->free_space_offset,
				       offset - 1, 0, 1);

	if (right_info) {
		unlink_free_space(block_group, right_info);
		info->bytes += right_info->bytes;
		kfree(right_info);
	}

	if (left_info && left_info->offset + left_info->bytes == offset) {
		unlink_free_space(block_group, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kfree(left_info);
	}

	ret = link_free_space(block_group, info);
	if (ret)
		kfree(info);

	spin_unlock(&block_group->tree_lock);

	if (ret) {
		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
		BUG_ON(ret == -EEXIST);
	}

	return ret;
}

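/*
 * Merging example for btrfs_add_free_space() above: if the cache already
 * holds [0, 4k) and [8k, 12k) and we add [4k, 8k), the new entry absorbs the
 * right neighbour (the entry starting at offset + bytes) and then the left
 * neighbour (the entry ending exactly at offset), so the tree ends up with a
 * single [0, 12k) entry instead of three fragments.
 */
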
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&block_group->tree_lock);

	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
				  1);
	if (info && info->offset == offset) {
		if (info->bytes < bytes) {
			printk(KERN_ERR "Found free space at %llu, size %llu, "
			       "trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);
			WARN_ON(1);
			ret = -EINVAL;
			spin_unlock(&block_group->tree_lock);
			goto out;
		}
		unlink_free_space(block_group, info);

		if (info->bytes == bytes) {
			kfree(info);
			spin_unlock(&block_group->tree_lock);
			goto out;
		}

		info->offset += bytes;
		info->bytes -= bytes;

		ret = link_free_space(block_group, info);
		spin_unlock(&block_group->tree_lock);
		BUG_ON(ret);
	} else if (info && info->offset < offset &&
		   info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * we're freeing space in the middle of the info,
		 * this can happen during tree log replay
		 *
		 * first unlink the old info and then
		 * insert it again after the hole we're creating
		 */
		unlink_free_space(block_group, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(block_group, info);
			BUG_ON(ret);
		} else {
			/* the hole we're creating ends at the end
			 * of the info struct, just free the info
			 */
			kfree(info);
		}
		spin_unlock(&block_group->tree_lock);
		/* step two, insert a new info struct to cover anything
		 * before the hole
		 */
		ret = btrfs_add_free_space(block_group, old_start,
					   offset - old_start);
		BUG_ON(ret);
	} else {
		spin_unlock(&block_group->tree_lock);
		if (!info) {
			printk(KERN_ERR "couldn't find space %llu to free\n",
			       (unsigned long long)offset);
			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
			       block_group->cached,
			       (unsigned long long)block_group->key.objectid,
			       (unsigned long long)block_group->key.offset);
			btrfs_dump_free_space(block_group, bytes);
		} else if (info) {
			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
			       "but wanted offset=%llu bytes=%llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)offset,
			       (unsigned long long)bytes);
		}
		WARN_ON(1);
	}
out:
	return ret;
}

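/*
 * Example of the middle-of-the-info case above: with a cached extent
 * [0, 16k), removing [4k, 8k) re-links the tail as a new [8k, 16k) entry and
 * then calls btrfs_add_free_space() for the head [0, 4k), so the original
 * extent is split around the hole.
 */
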
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes)
			count++;
		printk(KERN_ERR "entry offset %llu, bytes %llu\n",
		       (unsigned long long)info->offset,
		       (unsigned long long)info->bytes);
	}
	printk(KERN_INFO "%d blocks of free space at or bigger than bytes\n",
	       count);
}

u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 ret = 0;

	for (n = rb_first(&block_group->free_space_offset); n;
	     n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		ret += info->bytes;
	}

	return ret;
}

/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything.
 */
static int
__btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)
		goto out;

	cluster->window_start = 0;
	node = rb_first(&cluster->root);
	while (node) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
		link_free_space(block_group, entry);
	}
	list_del_init(&cluster->block_group_list);

	btrfs_put_block_group(cluster->block_group);
	cluster->block_group = NULL;
	cluster->root.rb_node = NULL;
out:
	spin_unlock(&cluster->lock);
	return 0;
}

void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *node;
	struct btrfs_free_cluster *cluster;
	struct btrfs_free_cluster *safe;

	spin_lock(&block_group->tree_lock);

	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
				 block_group_list) {
		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
	}

	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, bytes_index);
		unlink_free_space(block_group, info);
		kfree(info);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}
	spin_unlock(&block_group->tree_lock);
}

u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	u64 ret = 0;

	spin_lock(&block_group->tree_lock);
	entry = tree_search_offset(&block_group->free_space_offset, offset,
				   bytes + empty_size, 1);
	if (!entry)
		entry = tree_search_bytes(&block_group->free_space_bytes,
					  offset, bytes + empty_size);
	if (entry) {
		unlink_free_space(block_group, entry);
		ret = entry->offset;
		entry->offset += bytes;
		entry->bytes -= bytes;

		if (!entry->bytes)
			kfree(entry);
		else
			link_free_space(block_group, entry);
	}
	spin_unlock(&block_group->tree_lock);

	return ret;
}

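/*
 * btrfs_find_space_for_alloc() above returns the start of the range it
 * carved out, or 0 if nothing suitable was found.  A hypothetical caller
 * (the names here are only illustrative) would look roughly like:
 *
 *	u64 start;
 *
 *	start = btrfs_find_space_for_alloc(block_group, search_start,
 *					   num_bytes, empty_size);
 *	if (!start)
 *		return -ENOSPC;
 *
 * Note that the search asks for bytes + empty_size, but only bytes is
 * actually removed from the entry that is found.
 */
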
/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster)
{
	int ret;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return 0;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it, don't redo their work */
		spin_unlock(&cluster->lock);
		return 0;
	}
	atomic_inc(&block_group->count);
	spin_unlock(&cluster->lock);

	/* now return any extents the cluster had on it */
	spin_lock(&block_group->tree_lock);
	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&block_group->tree_lock);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
	return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
 * couldn't find anything suitably large, or a logical disk offset if
 * things worked out.
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start)
{
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);

	while (1) {
		if (entry->bytes < bytes || entry->offset < min_start) {
			struct rb_node *node;

			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}
		ret = entry->offset;

		entry->offset += bytes;
		entry->bytes -= bytes;

		if (entry->bytes == 0) {
			rb_erase(&entry->offset_index, &cluster->root);
			kfree(entry);
		}
		break;
	}
out:
	spin_unlock(&cluster->lock);
	return ret;
}

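/*
 * Allocation example for btrfs_alloc_from_cluster() above: if the cluster
 * holds [1M, 1M + 64k) and [2M, 2M + 256k) with max_size == 256k, a request
 * for 128k with min_start == 0 skips the first extent (too small) and
 * returns 2M, shrinking the second extent to [2M + 128k, 2M + 256k).  A
 * request larger than max_size returns 0 immediately.
 */
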
/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -ENOSPC
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	struct btrfs_free_space *next;
	struct btrfs_free_space *last;
	u64 min_bytes;
	u64 window_start;
	u64 window_free;
	u64 max_extent = 0;
	int total_retries = 0;
	int ret;

	/* for metadata, allow allocations with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * we want to do larger allocations when we are
		 * flushing out the delayed refs; it helps prevent
		 * making more work as we go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);

	spin_lock(&block_group->tree_lock);
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}
again:
	min_bytes = min(min_bytes, bytes + empty_size);
	entry = tree_search_bytes(&block_group->free_space_bytes,
				  offset, min_bytes);
	if (!entry) {
		ret = -ENOSPC;
		goto out;
	}
	window_start = entry->offset;
	window_free = entry->bytes;
	last = entry;
	max_extent = entry->bytes;

	while (1) {
		/* our window is just right, let's fill it */
		if (window_free >= bytes + empty_size)
			break;

		node = rb_next(&last->offset_index);
		if (!node) {
			ret = -ENOSPC;
			goto out;
		}
		next = rb_entry(node, struct btrfs_free_space, offset_index);

		/*
		 * we haven't filled the empty size and the window is
		 * very large.  reset and try again
		 */
		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
		    next->offset - window_start > (bytes + empty_size) * 2) {
			entry = next;
			window_start = entry->offset;
			window_free = entry->bytes;
			last = entry;
			max_extent = entry->bytes;
			total_retries++;
			if (total_retries % 64 == 0) {
				if (min_bytes >= (bytes + empty_size)) {
					ret = -ENOSPC;
					goto out;
				}
				/*
				 * grow our allocation a bit, we're not having
				 * much luck
				 */
				min_bytes *= 2;
				goto again;
			}
		} else {
			/* extend the window and track its largest extent */
			last = next;
			window_free += next->bytes;
			if (next->bytes > max_extent)
				max_extent = next->bytes;
		}
	}

	cluster->window_start = entry->offset;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 *
	 * The cluster includes an rbtree, but only uses the offset index
	 * of each free space cache entry.
	 */
	while (1) {
		node = rb_next(&entry->offset_index);
		unlink_free_space(block_group, entry);
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index);
		BUG_ON(ret);

		if (!node || entry == last)
			break;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}
	ret = 0;
	cluster->max_size = max_extent;
	atomic_inc(&block_group->count);
	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
	cluster->block_group = block_group;
out:
	spin_unlock(&cluster->lock);
	spin_unlock(&block_group->tree_lock);

	return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root.rb_node = NULL;
	cluster->max_size = 0;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}

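/*
 * Putting the cluster API together, a caller that wants tightly packed
 * allocations would do something roughly like the following (the local
 * names are only illustrative):
 *
 *	btrfs_init_free_cluster(cluster);
 *	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
 *				       search_start, num_bytes, empty_size);
 *	if (!ret)
 *		start = btrfs_alloc_from_cluster(block_group, cluster,
 *						 num_bytes, search_start);
 *	btrfs_return_cluster_to_free_space(block_group, cluster);
 *
 * where the final call hands any unused extents back to the per-block-group
 * free space cache.
 */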