/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"

/*
 * a single contiguous region of free space, indexed both by starting offset
 * and by size so callers can search either way
 */
struct btrfs_free_space {
        struct rb_node bytes_index;
        struct rb_node offset_index;
        u64 offset;
        u64 bytes;
};

static int tree_insert_offset(struct rb_root *root, u64 offset,
                              struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_free_space *info;

        while (*p) {
                parent = *p;
                info = rb_entry(parent, struct btrfs_free_space, offset_index);

                if (offset < info->offset)
                        p = &(*p)->rb_left;
                else if (offset > info->offset)
                        p = &(*p)->rb_right;
                else
                        return -EEXIST;
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);

        return 0;
}

static int tree_insert_bytes(struct rb_root *root, u64 bytes,
                             struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_free_space *info;

        while (*p) {
                parent = *p;
                info = rb_entry(parent, struct btrfs_free_space, bytes_index);

                if (bytes < info->bytes)
                        p = &(*p)->rb_left;
                else
                        p = &(*p)->rb_right;
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);

        return 0;
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy == 1: this is used for allocations where we are given a hint of where
 * to look for free space. Because the hint may not be completely on an offset
 * mark, or the hint may no longer point to free space, we need to fudge our
 * results a bit. So we look for free space starting at or after offset that is
 * at least bytes in size. We prefer to find as close to the given offset as we
 * can. Also if the offset is within a free space range, then we will return
 * the free space that contains the given offset, which means we can return a
 * free space chunk with an offset before the provided offset.
 *
 * fuzzy == 0: this is just a normal tree search. Give us the free space that
 * starts at the given offset and is at least bytes in size, and if it's not
 * there return NULL.
 */
static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
                                                   u64 offset, u64 bytes,
                                                   int fuzzy)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_free_space *entry, *ret = NULL;

        while (n) {
                entry = rb_entry(n, struct btrfs_free_space, offset_index);

                if (offset < entry->offset) {
                        if (fuzzy &&
                            (!ret || entry->offset < ret->offset) &&
                            (bytes <= entry->bytes))
                                ret = entry;
                        n = n->rb_left;
                } else if (offset > entry->offset) {
                        if (fuzzy &&
                            (entry->offset + entry->bytes - 1) >= offset &&
                            bytes <= entry->bytes) {
                                ret = entry;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        if (bytes > entry->bytes) {
                                n = n->rb_right;
                                continue;
                        }
                        ret = entry;
                        break;
                }
        }

        return ret;
}
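
/*
 * Editor's illustration, kept out of the build: a hypothetical helper showing
 * the calling convention of tree_search_offset() in its exact and fuzzy
 * modes. example_offset_lookup() does not exist anywhere in btrfs; the
 * 'hint' and 'needed' values are placeholders.
 */
#if 0
static void example_offset_lookup(struct btrfs_block_group_cache *block_group,
                                  u64 hint, u64 needed)
{
        struct btrfs_free_space *exact;
        struct btrfs_free_space *fuzzy;

        /* exact: only succeeds if a free extent starts exactly at 'hint' */
        exact = tree_search_offset(&block_group->free_space_offset,
                                   hint, needed, 0);

        /*
         * fuzzy: may also return the extent that merely contains 'hint', or
         * the first extent after 'hint' that is at least 'needed' bytes
         */
        fuzzy = tree_search_offset(&block_group->free_space_offset,
                                   hint, needed, 1);
}
#endif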

/*
 * return a chunk that is at least bytes in size, as close to offset as we can
 * get.
 */
static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
                                                  u64 offset, u64 bytes)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_free_space *entry, *ret = NULL;

        while (n) {
                entry = rb_entry(n, struct btrfs_free_space, bytes_index);

                if (bytes < entry->bytes) {
                        /*
                         * We prefer to get a hole size as close to the size we
                         * are asking for so we don't take small slivers out of
                         * huge holes, but we also want to get as close to the
                         * offset as possible so we don't have a whole lot of
                         * fragmentation.
                         */
                        if (offset <= entry->offset) {
                                if (!ret)
                                        ret = entry;
                                else if (entry->bytes < ret->bytes)
                                        ret = entry;
                                else if (entry->offset < ret->offset)
                                        ret = entry;
                        }
                        n = n->rb_left;
                } else if (bytes > entry->bytes) {
                        n = n->rb_right;
                } else {
                        /*
                         * Ok we may have multiple chunks of the wanted size,
                         * so we don't want to take the first one we find, we
                         * want to take the one closest to our given offset, so
                         * keep searching just in case there's a better match.
                         */
                        n = n->rb_right;
                        if (offset > entry->offset)
                                continue;
                        else if (!ret || entry->offset < ret->offset)
                                ret = entry;
                }
        }

        return ret;
}

static void unlink_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_free_space *info)
{
        rb_erase(&info->offset_index, &block_group->free_space_offset);
        rb_erase(&info->bytes_index, &block_group->free_space_bytes);
}

static int link_free_space(struct btrfs_block_group_cache *block_group,
                           struct btrfs_free_space *info)
{
        int ret = 0;

        BUG_ON(!info->bytes);
        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
                                 &info->offset_index);
        if (ret)
                return ret;

        ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
                                &info->bytes_index);
        if (ret)
                return ret;

        return ret;
}

int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
                         u64 offset, u64 bytes)
{
        struct btrfs_free_space *right_info;
        struct btrfs_free_space *left_info;
        struct btrfs_free_space *info = NULL;
        int ret = 0;

        info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
        if (!info)
                return -ENOMEM;

        info->offset = offset;
        info->bytes = bytes;

        spin_lock(&block_group->tree_lock);

        /*
         * first we want to see if there is free space adjacent to the range we
         * are adding, if there is, remove that struct and add a new one to
         * cover the entire range
         */
        right_info = tree_search_offset(&block_group->free_space_offset,
                                        offset + bytes, 0, 0);
        left_info = tree_search_offset(&block_group->free_space_offset,
                                       offset - 1, 0, 1);

        if (right_info) {
                unlink_free_space(block_group, right_info);
                info->bytes += right_info->bytes;
                kfree(right_info);
        }

        if (left_info && left_info->offset + left_info->bytes == offset) {
                unlink_free_space(block_group, left_info);
                info->offset = left_info->offset;
                info->bytes += left_info->bytes;
                kfree(left_info);
        }

        ret = link_free_space(block_group, info);
        if (ret)
                kfree(info);

        spin_unlock(&block_group->tree_lock);

        if (ret) {
                printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret);
                BUG_ON(ret == -EEXIST);
        }

        return ret;
}
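
/*
 * Editor's illustration, kept out of the build: how btrfs_add_free_space()
 * coalesces with its neighbours. The offsets and sizes are made up and
 * example_coalesce() is not a real btrfs function.
 */
#if 0
static void example_coalesce(struct btrfs_block_group_cache *block_group)
{
        /* two separate holes: [0, 4096) and [8192, 12288) */
        btrfs_add_free_space(block_group, 0, 4096);
        btrfs_add_free_space(block_group, 8192, 4096);

        /*
         * freeing [4096, 8192) touches both neighbours, so all three ranges
         * collapse into a single [0, 12288) btrfs_free_space entry
         */
        btrfs_add_free_space(block_group, 4096, 4096);
}
#endif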

int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                            u64 offset, u64 bytes)
{
        struct btrfs_free_space *info;
        int ret = 0;

        spin_lock(&block_group->tree_lock);

        info = tree_search_offset(&block_group->free_space_offset, offset, 0,
                                  1);
        if (info && info->offset == offset) {
                if (info->bytes < bytes) {
                        printk(KERN_ERR "Found free space at %llu, size %llu, "
                               "trying to use %llu\n",
                               (unsigned long long)info->offset,
                               (unsigned long long)info->bytes,
                               (unsigned long long)bytes);
                        WARN_ON(1);
                        ret = -EINVAL;
                        spin_unlock(&block_group->tree_lock);
                        goto out;
                }
                unlink_free_space(block_group, info);

                if (info->bytes == bytes) {
                        kfree(info);
                        spin_unlock(&block_group->tree_lock);
                        goto out;
                }

                info->offset += bytes;
                info->bytes -= bytes;

                ret = link_free_space(block_group, info);
                spin_unlock(&block_group->tree_lock);
                BUG_ON(ret);
        } else if (info && info->offset < offset &&
                   info->offset + info->bytes >= offset + bytes) {
                u64 old_start = info->offset;
                /*
                 * we're freeing space in the middle of the info,
                 * this can happen during tree log replay
                 *
                 * first unlink the old info and then
                 * insert it again after the hole we're creating
                 */
                unlink_free_space(block_group, info);
                if (offset + bytes < info->offset + info->bytes) {
                        u64 old_end = info->offset + info->bytes;

                        info->offset = offset + bytes;
                        info->bytes = old_end - info->offset;
                        ret = link_free_space(block_group, info);
                        BUG_ON(ret);
                } else {
                        /* the hole we're creating ends at the end
                         * of the info struct, just free the info
                         */
                        kfree(info);
                }
                spin_unlock(&block_group->tree_lock);
                /* step two, insert a new info struct to cover anything
                 * before the hole
                 */
                ret = btrfs_add_free_space(block_group, old_start,
                                           offset - old_start);
                BUG_ON(ret);
        } else {
                spin_unlock(&block_group->tree_lock);
                if (!info) {
                        printk(KERN_ERR "couldn't find space %llu to free\n",
                               (unsigned long long)offset);
                        printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
                               block_group->cached,
                               (unsigned long long)block_group->key.objectid,
                               (unsigned long long)block_group->key.offset);
                        btrfs_dump_free_space(block_group, bytes);
                } else if (info) {
                        printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
                               "but wanted offset=%llu bytes=%llu\n",
                               (unsigned long long)info->offset,
                               (unsigned long long)info->bytes,
                               (unsigned long long)offset,
                               (unsigned long long)bytes);
                }
                WARN_ON(1);
        }
out:
        return ret;
}

void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                           u64 bytes)
{
        struct btrfs_free_space *info;
        struct rb_node *n;
        int count = 0;

        for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
                printk(KERN_ERR "entry offset %llu, bytes %llu\n",
                       (unsigned long long)info->offset,
                       (unsigned long long)info->bytes);
        }
        printk(KERN_INFO "%d blocks of free space at or bigger than the "
               "requested size\n", count);
}

u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_space *info;
        struct rb_node *n;
        u64 ret = 0;

        for (n = rb_first(&block_group->free_space_offset); n;
             n = rb_next(n)) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                ret += info->bytes;
        }

        return ret;
}
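
/*
 * Editor's illustration, kept out of the build: removing space from the
 * middle of an entry splits it in two, which is the tree log replay case
 * handled above. example_split() is not a real btrfs function.
 */
#if 0
static void example_split(struct btrfs_block_group_cache *block_group)
{
        /* one big free extent covering [0, 16384) */
        btrfs_add_free_space(block_group, 0, 16384);

        /*
         * carving [4096, 8192) out of the middle leaves two entries behind:
         * [0, 4096) and [8192, 16384)
         */
        btrfs_remove_free_space(block_group, 4096, 4096);
}
#endif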

/*
 * for a given cluster, put all of its extents back into the free
 * space cache. If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already. In that case, we just return without changing anything
 */
static int
__btrfs_return_cluster_to_free_space(
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster)
{
        struct btrfs_free_space *entry;
        struct rb_node *node;

        spin_lock(&cluster->lock);
        if (cluster->block_group != block_group)
                goto out;

        cluster->window_start = 0;
        node = rb_first(&cluster->root);
        while (node) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
                rb_erase(&entry->offset_index, &cluster->root);
                link_free_space(block_group, entry);
        }
        list_del_init(&cluster->block_group_list);

        btrfs_put_block_group(cluster->block_group);
        cluster->block_group = NULL;
        cluster->root.rb_node = NULL;
out:
        spin_unlock(&cluster->lock);
        return 0;
}

void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_space *info;
        struct rb_node *node;
        struct btrfs_free_cluster *cluster;
        struct btrfs_free_cluster *safe;

        spin_lock(&block_group->tree_lock);

        list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
                                 block_group_list) {

                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);
        }

        while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
                info = rb_entry(node, struct btrfs_free_space, bytes_index);
                unlink_free_space(block_group, info);
                kfree(info);
                if (need_resched()) {
                        spin_unlock(&block_group->tree_lock);
                        cond_resched();
                        spin_lock(&block_group->tree_lock);
                }
        }
        spin_unlock(&block_group->tree_lock);
}

u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
                               u64 offset, u64 bytes, u64 empty_size)
{
        struct btrfs_free_space *entry = NULL;
        u64 ret = 0;

        spin_lock(&block_group->tree_lock);
        entry = tree_search_offset(&block_group->free_space_offset, offset,
                                   bytes + empty_size, 1);
        if (!entry)
                entry = tree_search_bytes(&block_group->free_space_bytes,
                                          offset, bytes + empty_size);
        if (entry) {
                unlink_free_space(block_group, entry);
                ret = entry->offset;
                entry->offset += bytes;
                entry->bytes -= bytes;

                if (!entry->bytes)
                        kfree(entry);
                else
                        link_free_space(block_group, entry);
        }
        spin_unlock(&block_group->tree_lock);

        return ret;
}
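
/*
 * Editor's illustration, kept out of the build: a minimal allocation through
 * btrfs_find_space_for_alloc(). example_alloc() is hypothetical; a return
 * value of 0 is how this file signals that nothing suitable was found.
 */
#if 0
static void example_alloc(struct btrfs_block_group_cache *block_group)
{
        u64 start;

        /* ask for 4k anywhere at or after the start of the block group */
        start = btrfs_find_space_for_alloc(block_group,
                                           block_group->key.objectid,
                                           4096, 0);
        if (!start)
                return;         /* no free space large enough */

        /* 'start' is the logical offset of the 4k region we just claimed */
}
#endif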

/*
 * given a cluster, put all of its extents back into the free space
 * cache. If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
                               struct btrfs_block_group_cache *block_group,
                               struct btrfs_free_cluster *cluster)
{
        int ret;

        /* first, get a safe pointer to the block group */
        spin_lock(&cluster->lock);
        if (!block_group) {
                block_group = cluster->block_group;
                if (!block_group) {
                        spin_unlock(&cluster->lock);
                        return 0;
                }
        } else if (cluster->block_group != block_group) {
                /* someone else has already freed it, don't redo their work */
                spin_unlock(&cluster->lock);
                return 0;
        }
        atomic_inc(&block_group->count);
        spin_unlock(&cluster->lock);

        /* now return any extents the cluster had on it */
        spin_lock(&block_group->tree_lock);
        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&block_group->tree_lock);

        /* finally drop our ref */
        btrfs_put_block_group(block_group);
        return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster, u64 bytes,
                             u64 min_start)
{
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        u64 ret = 0;

        spin_lock(&cluster->lock);
        if (bytes > cluster->max_size)
                goto out;

        if (cluster->block_group != block_group)
                goto out;

        node = rb_first(&cluster->root);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_free_space, offset_index);

        while (1) {
                if (entry->bytes < bytes || entry->offset < min_start) {
                        struct rb_node *node;

                        node = rb_next(&entry->offset_index);
                        if (!node)
                                break;
                        entry = rb_entry(node, struct btrfs_free_space,
                                         offset_index);
                        continue;
                }
                ret = entry->offset;

                entry->offset += bytes;
                entry->bytes -= bytes;

                if (entry->bytes == 0) {
                        rb_erase(&entry->offset_index, &cluster->root);
                        kfree(entry);
                }
                break;
        }
out:
        spin_unlock(&cluster->lock);
        return ret;
}
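
/*
 * Editor's illustration, kept out of the build: allocating from a cluster and
 * falling back to the plain free space cache. This loosely mirrors how the
 * extent allocator is expected to combine the two paths;
 * example_cluster_alloc() itself is hypothetical.
 */
#if 0
static void example_cluster_alloc(struct btrfs_block_group_cache *block_group,
                                  struct btrfs_free_cluster *cluster)
{
        u64 start;

        /* 0 means the cluster had nothing big enough at or past min_start */
        start = btrfs_alloc_from_cluster(block_group, cluster, 4096,
                                         block_group->key.objectid);
        if (!start)
                start = btrfs_find_space_for_alloc(block_group,
                                                   block_group->key.objectid,
                                                   4096, 0);
}
#endif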

/*
 * here we try to find a cluster of blocks in a block group. The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -ENOSPC
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_free_cluster *cluster,
                             u64 offset, u64 bytes, u64 empty_size)
{
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        struct btrfs_free_space *next;
        struct btrfs_free_space *last;
        u64 min_bytes;
        u64 window_start;
        u64 window_free;
        u64 max_extent = 0;
        int total_retries = 0;
        int ret;

        /* for metadata, allow allocations with more holes */
        if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
                /*
                 * we want to do larger allocations when we are
                 * flushing out the delayed refs, it helps prevent
                 * making more work as we go along.
                 */
                if (trans->transaction->delayed_refs.flushing)
                        min_bytes = max(bytes, (bytes + empty_size) >> 1);
                else
                        min_bytes = max(bytes, (bytes + empty_size) >> 4);
        } else
                min_bytes = max(bytes, (bytes + empty_size) >> 2);

        spin_lock(&block_group->tree_lock);
        spin_lock(&cluster->lock);

        /* someone already found a cluster, hooray */
        if (cluster->block_group) {
                ret = 0;
                goto out;
        }
again:
        min_bytes = min(min_bytes, bytes + empty_size);
        entry = tree_search_bytes(&block_group->free_space_bytes,
                                  offset, min_bytes);
        if (!entry) {
                ret = -ENOSPC;
                goto out;
        }
        window_start = entry->offset;
        window_free = entry->bytes;
        last = entry;
        max_extent = entry->bytes;

        while (1) {
                /* our window is just right, let's fill it */
                if (window_free >= bytes + empty_size)
                        break;

                node = rb_next(&last->offset_index);
                if (!node) {
                        ret = -ENOSPC;
                        goto out;
                }
                next = rb_entry(node, struct btrfs_free_space, offset_index);

                /*
                 * we haven't filled the empty size and the window is
                 * very large. reset and try again
                 */
                if (next->offset - window_start > (bytes + empty_size) * 2) {
                        entry = next;
                        window_start = entry->offset;
                        window_free = entry->bytes;
                        last = entry;
                        max_extent = entry->bytes;
                        total_retries++;
                        if (total_retries % 256 == 0) {
                                if (min_bytes >= (bytes + empty_size)) {
                                        ret = -ENOSPC;
                                        goto out;
                                }
                                /*
                                 * grow our allocation a bit, we're not having
                                 * much luck
                                 */
                                min_bytes *= 2;
                                goto again;
                        }
                } else {
                        last = next;
                        window_free += next->bytes;
                        /* track the largest single extent in the window */
                        if (next->bytes > max_extent)
                                max_extent = next->bytes;
                }
        }

        cluster->window_start = entry->offset;

        /*
         * now we've found our entries, pull them out of the free space
         * cache and put them into the cluster rbtree
         *
         * The cluster includes an rbtree, but only uses the offset index
         * of each free space cache entry.
         */
        while (1) {
                node = rb_next(&entry->offset_index);
                unlink_free_space(block_group, entry);
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index);
                BUG_ON(ret);

                if (!node || entry == last)
                        break;

                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }
        ret = 0;
        cluster->max_size = max_extent;
        atomic_inc(&block_group->count);
        list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
        cluster->block_group = block_group;
out:
        spin_unlock(&cluster->lock);
        spin_unlock(&block_group->tree_lock);

        return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
        spin_lock_init(&cluster->lock);
        spin_lock_init(&cluster->refill_lock);
        cluster->root.rb_node = NULL;
        cluster->max_size = 0;
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;
}
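
/*
 * Editor's illustration, kept out of the build: the expected life cycle of a
 * cluster. example_cluster_lifecycle() is hypothetical, the sizes are
 * placeholders, and taking refill_lock around the refill is an assumption
 * based on the lock initialized in btrfs_init_free_cluster() above.
 */
#if 0
static void example_cluster_lifecycle(struct btrfs_trans_handle *trans,
                                      struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_cluster cluster;
        u64 start = 0;

        btrfs_init_free_cluster(&cluster);

        spin_lock(&cluster.refill_lock);
        if (btrfs_find_space_cluster(trans, block_group, &cluster,
                                     block_group->key.objectid,
                                     4096, 64 * 1024) == 0)
                start = btrfs_alloc_from_cluster(block_group, &cluster, 4096,
                                                 block_group->key.objectid);
        spin_unlock(&cluster.refill_lock);

        /* hand any extents still held by the cluster back to the cache */
        btrfs_return_cluster_to_free_space(block_group, &cluster);
}
#endif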