/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"

#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)

static void recalculate_thresholds(struct btrfs_block_group_cache
				   *block_group);
static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info);

struct inode *lookup_free_space_inode(struct btrfs_root *root,
				      struct btrfs_block_group_cache
				      *block_group, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	int ret;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(root, path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(root, path);

	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
	if (!inode)
		return ERR_PTR(-ENOENT);
	if (IS_ERR(inode))
		return inode;
	if (is_bad_inode(inode)) {
		iput(inode);
		return ERR_PTR(-ENOENT);
	}

	spin_lock(&block_group->lock);
	if (!root->fs_info->closing) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}
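/*
 * Create the inode that backs this block group's free space cache and
 * insert the matching free space header item.  The inode is created with
 * NODATASUM/NOCOMPRESS/PREALLOC set so that cache writeout stays simple.
 */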
int create_free_space_inode(struct btrfs_root *root,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	u64 objectid;
	int ret;

	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
	if (ret < 0)
		return ret;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item,
				    block_group->key.objectid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(root, path);
		return ret;
	}
	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	return 0;
}

int btrfs_truncate_free_space_cache(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct inode *inode)
{
	loff_t oldsize;
	int ret = 0;

	trans->block_rsv = root->orphan_block_rsv;
	ret = btrfs_block_rsv_check(trans, root,
				    root->orphan_block_rsv,
				    0, 5);
	if (ret)
		return ret;

	oldsize = i_size_read(inode);
	btrfs_i_size_write(inode, 0);
	truncate_pagecache(inode, oldsize, 0);

	/*
	 * We don't need an orphan item because truncating the free space cache
	 * will never be split across transactions.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode,
					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret) {
		WARN_ON(1);
		return ret;
	}

	return btrfs_update_inode(trans, root, inode);
}

static int readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);

	return 0;
}
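/*
 * Load the on-disk free space cache for a block group into the in-memory
 * rbtree.  Returns 1 if the cache was loaded, 0 if it was missing, stale
 * or unreadable and the free space has to be rebuilt from the extent tree.
 */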
int load_free_space_cache(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_group_cache *block_group)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct page *page;
	struct btrfs_path *path;
	u32 *checksums = NULL, *crc;
	char *disk_crcs = NULL;
	struct btrfs_key key;
	struct list_head bitmaps;
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u32 cur_crc = ~(u32)0;
	pgoff_t index = 0;
	unsigned long first_page_offset;
	int num_checksums;
	int ret = 0;

	/*
	 * If we're unmounting then just return, since this does a search on the
	 * normal root and not the commit root and we could deadlock.
	 */
	smp_mb();
	if (fs_info->closing)
		return 0;

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	INIT_LIST_HEAD(&bitmaps);

	path = btrfs_alloc_path();
	if (!path)
		return 0;

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode)) {
		btrfs_free_path(path);
		goto out;
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret) {
		btrfs_free_path(path);
		goto out;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_free_path(path);

	if (BTRFS_I(inode)->generation != generation) {
		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
		       " not match free space cache generation (%llu) for "
		       "block group %llu\n",
		       (unsigned long long)BTRFS_I(inode)->generation,
		       (unsigned long long)generation,
		       (unsigned long long)block_group->key.objectid);
		goto free_cache;
	}

	if (!num_entries)
		goto out;

	/* Setup everything for doing checksumming */
	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
	if (!checksums)
		goto out;
	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
	if (!disk_crcs)
		goto out;

	ret = readahead_cache(inode);
	if (ret) {
		ret = 0;
		goto out;
	}
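	/*
	 * Cache file layout (mirrored by btrfs_write_out_cache below): page 0
	 * starts with one u32 crc per page followed by a u64 generation, and
	 * packed btrfs_free_space_entry records fill the remainder of every
	 * page.  After all the plain entries, each bitmap entry is followed by
	 * one full page holding its bitmap, in the order the entries appear.
	 */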
	while (1) {
		struct btrfs_free_space_entry *entry;
		struct btrfs_free_space *e;
		void *addr;
		unsigned long offset = 0;
		unsigned long start_offset = 0;
		int need_loop = 0;

		if (!num_entries && !num_bitmaps)
			break;

		if (index == 0) {
			start_offset = first_page_offset;
			offset = start_offset;
		}

		page = grab_cache_page(inode->i_mapping, index);
		if (!page) {
			ret = 0;
			goto free_cache;
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				page_cache_release(page);
				printk(KERN_ERR "btrfs: error reading free "
				       "space cache: %llu\n",
				       (unsigned long long)
				       block_group->key.objectid);
				goto free_cache;
			}
		}
		addr = kmap(page);

		if (index == 0) {
			u64 *gen;

			memcpy(disk_crcs, addr, first_page_offset);
			gen = addr + (sizeof(u32) * num_checksums);
			if (*gen != BTRFS_I(inode)->generation) {
				printk(KERN_ERR "btrfs: space cache generation"
				       " (%llu) does not match inode (%llu) "
				       "for block group %llu\n",
				       (unsigned long long)*gen,
				       (unsigned long long)
				       BTRFS_I(inode)->generation,
				       (unsigned long long)
				       block_group->key.objectid);
				kunmap(page);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}
			crc = (u32 *)disk_crcs;
		}
		entry = addr + start_offset;

		/* First lets check our crc before we do anything fun */
		cur_crc = ~(u32)0;
		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
					  PAGE_CACHE_SIZE - start_offset);
		btrfs_csum_final(cur_crc, (char *)&cur_crc);
		if (cur_crc != *crc) {
			printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
			       "block group %llu\n", index,
			       (unsigned long long)block_group->key.objectid);
			kunmap(page);
			unlock_page(page);
			page_cache_release(page);
			goto free_cache;
		}
		crc++;

		while (1) {
			if (!num_entries)
				break;

			need_loop = 1;
			e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
			if (!e) {
				kunmap(page);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}

			e->offset = le64_to_cpu(entry->offset);
			e->bytes = le64_to_cpu(entry->bytes);
			if (!e->bytes) {
				kunmap(page);
				kfree(e);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}

			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				spin_unlock(&block_group->tree_lock);
				BUG_ON(ret);
			} else {
				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
				if (!e->bitmap) {
					kunmap(page);
					kfree(e);
					unlock_page(page);
					page_cache_release(page);
					goto free_cache;
				}
				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				block_group->total_bitmaps++;
				recalculate_thresholds(block_group);
				spin_unlock(&block_group->tree_lock);
				list_add_tail(&e->list, &bitmaps);
			}

			num_entries--;
			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				break;
			entry++;
		}

		/*
		 * We read an entry out of this page, we need to move on to the
		 * next page.
		 */
		if (need_loop) {
			kunmap(page);
			goto next;
		}

		/*
		 * We add the bitmaps at the end of the entries in order that
		 * the bitmap entries are added to the cache.
		 */
		e = list_entry(bitmaps.next, struct btrfs_free_space, list);
		list_del_init(&e->list);
		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
		kunmap(page);
		num_bitmaps--;
next:
		unlock_page(page);
		page_cache_release(page);
		index++;
	}

	ret = 1;
out:
	kfree(checksums);
	kfree(disk_crcs);
	iput(inode);
	return ret;

free_cache:
	/* This cache is bogus, make sure it gets cleared */
	spin_lock(&block_group->lock);
	block_group->disk_cache_state = BTRFS_DC_CLEAR;
	spin_unlock(&block_group->lock);
	btrfs_remove_free_space_cache(block_group);
	goto out;
}
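/*
 * Write the in-memory free space tree for this block group out to its
 * cache inode.  Returns 1 if the cache was written, 0 if anything went
 * wrong (the block group is then flagged BTRFS_DC_ERROR so the stale
 * on-disk cache is not trusted later).
 */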
int btrfs_write_out_cache(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_block_group_cache *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode;
	struct rb_node *node;
	struct list_head *pos, *n;
	struct page *page;
	struct extent_state *cached_state = NULL;
	struct list_head bitmap_list;
	struct btrfs_key key;
	u64 bytes = 0;
	u32 *crc, *checksums;
	pgoff_t index = 0, last_index = 0;
	unsigned long first_page_offset;
	int num_checksums;
	int entries = 0;
	int bitmaps = 0;
	int ret = 0;

	root = root->fs_info->tree_root;

	INIT_LIST_HEAD(&bitmap_list);

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode))
		return 0;

	if (!i_size_read(inode)) {
		iput(inode);
		return 0;
	}

	node = rb_first(&block_group->free_space_offset);
	if (!node) {
		iput(inode);
		return 0;
	}

	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
	filemap_write_and_wait(inode->i_mapping);
	btrfs_wait_ordered_range(inode, inode->i_size &
				 ~(root->sectorsize - 1), (u64)-1);

	/* We need a checksum per page. */
	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
	crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
	if (!crc) {
		iput(inode);
		return 0;
	}

	/* Since the first page has all of our checksums and our generation we
	 * need to calculate the offset into the page that we can start writing
	 * our entries.
	 */
	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);

	/*
	 * Lock all pages first so we can lock the extent safely.
	 *
	 * NOTE: Because we hold the ref the entire time we're going to write to
	 * the page find_get_page should never fail, so we don't do a check
	 * after find_get_page at this point. Just putting this here so people
	 * know and don't freak out.
	 */
	while (index <= last_index) {
		page = grab_cache_page(inode->i_mapping, index);
		if (!page) {
			pgoff_t i = 0;

			while (i < index) {
				page = find_get_page(inode->i_mapping, i);
				unlock_page(page);
				page_cache_release(page);
				page_cache_release(page);
				i++;
			}
			goto out_free;
		}
		index++;
	}

	index = 0;
	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 0, &cached_state, GFP_NOFS);

	/* Write out the extent entries */
	do {
		struct btrfs_free_space_entry *entry;
		void *addr;
		unsigned long offset = 0;
		unsigned long start_offset = 0;

		if (index == 0) {
			start_offset = first_page_offset;
			offset = start_offset;
		}

		page = find_get_page(inode->i_mapping, index);

		addr = kmap(page);
		entry = addr + start_offset;

		memset(addr, 0, PAGE_CACHE_SIZE);
		while (1) {
			struct btrfs_free_space *e;

			e = rb_entry(node, struct btrfs_free_space, offset_index);
			entries++;

			entry->offset = cpu_to_le64(e->offset);
			entry->bytes = cpu_to_le64(e->bytes);
			if (e->bitmap) {
				entry->type = BTRFS_FREE_SPACE_BITMAP;
				list_add_tail(&e->list, &bitmap_list);
				bitmaps++;
			} else {
				entry->type = BTRFS_FREE_SPACE_EXTENT;
			}
			node = rb_next(node);
			if (!node)
				break;
			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				break;
			entry++;
		}
		*crc = ~(u32)0;
		*crc = btrfs_csum_data(root, addr + start_offset, *crc,
				       PAGE_CACHE_SIZE - start_offset);
		kunmap(page);

		btrfs_csum_final(*crc, (char *)crc);
		crc++;

		bytes += PAGE_CACHE_SIZE;

		ClearPageChecked(page);
		set_page_extent_mapped(page);
		SetPageUptodate(page);
		set_page_dirty(page);

		/*
		 * We need to release our reference we got for grab_cache_page,
		 * except for the first page which will hold our checksums, we
		 * do that below.
		 */
		if (index != 0) {
			unlock_page(page);
			page_cache_release(page);
		}

		page_cache_release(page);

		index++;
	} while (node);

	/* Write out the bitmaps */
	list_for_each_safe(pos, n, &bitmap_list) {
		void *addr;
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);

		page = find_get_page(inode->i_mapping, index);

		addr = kmap(page);
		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
		*crc = ~(u32)0;
		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
		kunmap(page);
		btrfs_csum_final(*crc, (char *)crc);
		crc++;
		bytes += PAGE_CACHE_SIZE;

		ClearPageChecked(page);
		set_page_extent_mapped(page);
		SetPageUptodate(page);
		set_page_dirty(page);
		unlock_page(page);
		page_cache_release(page);
		page_cache_release(page);
		list_del_init(&entry->list);
		index++;
	}

	/* Zero out the rest of the pages just to make sure */
	while (index <= last_index) {
		void *addr;

		page = find_get_page(inode->i_mapping, index);

		addr = kmap(page);
		memset(addr, 0, PAGE_CACHE_SIZE);
		kunmap(page);
		ClearPageChecked(page);
		set_page_extent_mapped(page);
		SetPageUptodate(page);
		set_page_dirty(page);
		unlock_page(page);
		page_cache_release(page);
		page_cache_release(page);
		bytes += PAGE_CACHE_SIZE;
		index++;
	}

	btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);

	/* Write the checksums and trans id to the first page */
	{
		void *addr;
		u64 *gen;

		page = find_get_page(inode->i_mapping, 0);

		addr = kmap(page);
		memcpy(addr, checksums, sizeof(u32) * num_checksums);
		gen = addr + (sizeof(u32) * num_checksums);
		*gen = trans->transid;
		kunmap(page);
		ClearPageChecked(page);
		set_page_extent_mapped(page);
		SetPageUptodate(page);
		set_page_dirty(page);
		unlock_page(page);
		page_cache_release(page);
		page_cache_release(page);
	}
	BTRFS_I(inode)->generation = trans->transid;

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);

	filemap_write_and_wait(inode->i_mapping);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
	if (ret < 0) {
		ret = 0;
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
		goto out_free;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		BUG_ON(!path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != block_group->key.objectid) {
			ret = 0;
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
					 EXTENT_DIRTY | EXTENT_DELALLOC |
					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
					 GFP_NOFS);
			btrfs_release_path(root, path);
			goto out_free;
		}
	}
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	ret = 1;

out_free:
	if (ret == 0) {
		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);
		BTRFS_I(inode)->generation = 0;
	}
	kfree(checksums);
	btrfs_update_inode(trans, root, inode);
	iput(inode);
	return ret;
}
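/*
 * Helpers to translate between byte offsets/lengths and bit positions
 * inside a bitmap entry.  Each bit in a bitmap covers one sector, and a
 * single bitmap spans BITS_PER_BITMAP * sectorsize bytes of the block
 * group, aligned to that size relative to the block group start.
 */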
static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
					  u64 offset)
{
	BUG_ON(offset < bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div64_u64(offset, sectorsize));
}

static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
{
	return (unsigned long)(div64_u64(bytes, sectorsize));
}

static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
	bitmap_start = offset - block_group->key.objectid;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += block_group->key.objectid;

	return bitmap_start;
}
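/*
 * Insert a free space entry into the offset-sorted rbtree.  'bitmap' tells
 * us which kind of entry is being linked so that an extent entry and a
 * bitmap entry sharing the same offset sort consistently (extent first).
 */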
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap. So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically. If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				WARN_ON(info->bitmap);
				p = &(*p)->rb_right;
			} else {
				WARN_ON(!info->bitmap);
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_block_group_cache *block_group,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = block_group->free_space_offset.rb_node;
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
	while (1) {
		if (!n) {
			entry = NULL;
			break;
		}

		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = &entry->offset_index;
			while (1) {
				n = rb_prev(n);
				if (!n)
					break;
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap) {
					if (prev->offset + prev->bytes > offset)
						entry = prev;
					break;
				}
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			BUG_ON(entry->offset > offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = &entry->offset_index;
		while (1) {
			n = rb_prev(n);
			if (!n)
				break;
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap) {
				if (prev->offset + prev->bytes > offset)
					return prev;
				break;
			}
		}
		if (entry->offset + BITS_PER_BITMAP *
		    block_group->sectorsize > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    block_group->sectorsize > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
}

static inline void
__unlink_free_space(struct btrfs_block_group_cache *block_group,
		    struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &block_group->free_space_offset);
	block_group->free_extents--;
}

static void unlink_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info)
{
	__unlink_free_space(block_group, info);
	block_group->free_space -= info->bytes;
}
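/*
 * Link an entry into the rbtree and update the free_space/free_extents
 * accounting.  Callers take block_group->tree_lock around this.
 */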
static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	BUG_ON(!info->bitmap && !info->bytes);
	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));
	if (ret)
		return ret;

	block_group->free_space += info->bytes;
	block_group->free_extents++;
	return ret;
}

static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->key.offset;

	/*
	 * The goal is to keep the total amount of memory used per 1gb of space
	 * at or below 32k, so we need to adjust how much memory we allow to be
	 * used by extent based free space tracking
	 */
	if (size < 1024 * 1024 * 1024)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG *
			div64_u64(size, 1024 * 1024 * 1024);

	/*
	 * we want to account for 1 more bitmap than what we have so we can make
	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
	 * we add more bitmaps.
	 */
	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		block_group->extents_thresh = 0;
		return;
	}

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));

	block_group->extents_thresh =
		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}

static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		clear_bit(i, info->bitmap);

	info->bytes -= bytes;
	block_group->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		set_bit(i, info->bitmap);

	info->bytes += bytes;
	block_group->free_space += bytes;
}

static int search_bitmap(struct btrfs_block_group_cache *block_group,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes)
{
	unsigned long found_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;

	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, block_group->sectorsize);

	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		if ((next_zero - i) >= bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * block_group->sectorsize) +
			bitmap_info->offset;
		*bytes = (u64)(found_bits) * block_group->sectorsize;
		return 0;
	}

	return -1;
}
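/*
 * Find the first entry (extent or bitmap) that could satisfy an allocation
 * of *bytes starting at or after *offset.  On success *offset and *bytes
 * are updated to describe the free space that was found.
 */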
static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
						*block_group, u64 *offset,
						u64 *bytes, int debug)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret;

	if (!block_group->free_space_offset.rb_node)
		return NULL;

	entry = tree_search_offset(block_group,
				   offset_to_bitmap(block_group, *offset),
				   0, 1);
	if (!entry)
		return NULL;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes)
			continue;

		if (entry->bitmap) {
			ret = search_bitmap(block_group, entry, offset, bytes);
			if (!ret)
				return entry;
			continue;
		}

		*offset = entry->offset;
		*bytes = entry->bytes;
		return entry;
	}

	return NULL;
}

static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info, u64 offset)
{
	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
	int max_bitmaps = (int)div64_u64(block_group->key.offset +
					 bytes_per_bg - 1, bytes_per_bg);
	BUG_ON(block_group->total_bitmaps >= max_bitmaps);

	info->offset = offset_to_bitmap(block_group, offset);
	info->bytes = 0;
	link_free_space(block_group, info);
	block_group->total_bitmaps++;

	recalculate_thresholds(block_group);
}

static void free_bitmap(struct btrfs_block_group_cache *block_group,
			struct btrfs_free_space *bitmap_info)
{
	unlink_free_space(block_group, bitmap_info);
	kfree(bitmap_info->bitmap);
	kfree(bitmap_info);
	block_group->total_bitmaps--;
	recalculate_thresholds(block_group);
}
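/*
 * Clear *bytes worth of space starting at *offset out of a bitmap entry,
 * walking into the next bitmap if the range spans more than one.  Returns
 * -EAGAIN if a non-bitmap entry is hit so the caller can retry against the
 * extent entries, and frees any bitmap that ends up empty.
 */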
static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
				       struct btrfs_free_space *bitmap_info,
				       u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset +
		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;

	/*
	 * XXX - this can go away after a few releases.
	 *
	 * since the only user of btrfs_remove_free_space is the tree logging
	 * stuff, and the only way to test that is under crash conditions, we
	 * want to have this debug stuff here just in case something's not
	 * working. Search the bitmap for the space we are trying to use to
	 * make sure it's actually there. If it's not there then we need to stop
	 * because something has gone wrong.
	 */
	search_start = *offset;
	search_bytes = *bytes;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(block_group, bitmap_info, &search_start,
			    &search_bytes);
	BUG_ON(ret < 0 || search_start != *offset);

	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
		bitmap_clear_bits(block_group, bitmap_info, *offset,
				  end - *offset + 1);
		*bytes -= end - *offset + 1;
		*offset = end + 1;
	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
		*bytes = 0;
	}

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(block_group, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = *bytes;
		ret = search_bitmap(block_group, bitmap_info, &search_start,
				    &search_bytes);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(block_group, bitmap_info);

	return 0;
}
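/*
 * Try to fold a new free space range into a bitmap instead of adding an
 * extent entry.  Returns 0 if the range should be linked as an extent
 * entry, 1 if it was accounted in a bitmap, or a negative errno.  Called
 * with tree_lock held; the lock is dropped while allocating memory.
 */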
static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	int added = 0;
	u64 bytes, offset, end;
	int ret;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (block_group->free_extents < block_group->extents_thresh &&
	    info->bytes > block_group->sectorsize * 4)
		return 0;

	/*
	 * some block groups are so tiny they can't be enveloped by a bitmap, so
	 * don't even bother to create a bitmap for this
	 */
	if (BITS_PER_BITMAP * block_group->sectorsize >
	    block_group->key.offset)
		return 0;

	bytes = info->bytes;
	offset = info->offset;

again:
	bitmap_info = tree_search_offset(block_group,
					 offset_to_bitmap(block_group, offset),
					 1, 0);
	if (!bitmap_info) {
		BUG_ON(added);
		goto new_bitmap;
	}

	end = bitmap_info->offset +
		(u64)(BITS_PER_BITMAP * block_group->sectorsize);

	if (offset >= bitmap_info->offset && offset + bytes > end) {
		bitmap_set_bits(block_group, bitmap_info, offset,
				end - offset);
		bytes -= end - offset;
		offset = end;
		added = 0;
	} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
		bitmap_set_bits(block_group, bitmap_info, offset, bytes);
		bytes = 0;
	} else {
		BUG();
	}

	if (!bytes) {
		ret = 1;
		goto out;
	} else
		goto again;

new_bitmap:
	if (info && info->bitmap) {
		add_new_bitmap(block_group, info, offset);
		added = 1;
		info = NULL;
		goto again;
	} else {
		spin_unlock(&block_group->tree_lock);

		/* no pre-allocated info, allocate a new one */
		if (!info) {
			info = kzalloc(sizeof(struct btrfs_free_space),
				       GFP_NOFS);
			if (!info) {
				spin_lock(&block_group->tree_lock);
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
		spin_lock(&block_group->tree_lock);
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
		if (info->bitmap)
			kfree(info->bitmap);
		kfree(info);
	}

	return ret;
}

bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
			  struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else
		left_info = tree_search_offset(block_group, offset - 1, 0, 0);

	if (right_info && !right_info->bitmap) {
		if (update_stat)
			unlink_free_space(block_group, right_info);
		else
			__unlink_free_space(block_group, right_info);
		info->bytes += right_info->bytes;
		kfree(right_info);
		merged = true;
	}

	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset) {
		if (update_stat)
			unlink_free_space(block_group, left_info);
		else
			__unlink_free_space(block_group, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kfree(left_info);
		merged = true;
	}

	return merged;
}

int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
			 u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	int ret = 0;

	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;

	spin_lock(&block_group->tree_lock);

	if (try_merge_free_space(block_group, info, true))
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent, so we know we're going to have to allocate a new extent,
	 * but before we do that see if we need to drop this into a bitmap
	 */
	ret = insert_into_bitmap(block_group, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	ret = link_free_space(block_group, info);
	if (ret)
		kfree(info);
out:
	spin_unlock(&block_group->tree_lock);

	if (ret) {
		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
		BUG_ON(ret == -EEXIST);
	}

	return ret;
}
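/*
 * Remove [offset, offset + bytes) from the free space tree, splitting an
 * extent entry or clearing bitmap bits as needed.  As the XXX comment in
 * remove_from_bitmap notes, tree-log replay is currently the main caller.
 */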
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	struct btrfs_free_space *next_info = NULL;
	int ret = 0;

	spin_lock(&block_group->tree_lock);

again:
	info = tree_search_offset(block_group, offset, 0, 0);
	if (!info) {
		/*
		 * oops didn't find an extent that matched the space we wanted
		 * to remove, look for a bitmap instead
		 */
		info = tree_search_offset(block_group,
					  offset_to_bitmap(block_group, offset),
					  1, 0);
		if (!info) {
			WARN_ON(1);
			goto out_lock;
		}
	}

	if (info->bytes < bytes && rb_next(&info->offset_index)) {
		u64 end;
		next_info = rb_entry(rb_next(&info->offset_index),
				     struct btrfs_free_space,
				     offset_index);

		if (next_info->bitmap)
			end = next_info->offset + BITS_PER_BITMAP *
				block_group->sectorsize - 1;
		else
			end = next_info->offset + next_info->bytes;

		if (next_info->bytes < bytes ||
		    next_info->offset > offset || offset > end) {
			printk(KERN_CRIT "Found free space at %llu, size %llu,"
			       " trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);
			WARN_ON(1);
			ret = -EINVAL;
			goto out_lock;
		}

		info = next_info;
	}

	if (info->bytes == bytes) {
		unlink_free_space(block_group, info);
		if (info->bitmap) {
			kfree(info->bitmap);
			block_group->total_bitmaps--;
		}
		kfree(info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset == offset) {
		unlink_free_space(block_group, info);
		info->offset += bytes;
		info->bytes -= bytes;
		link_free_space(block_group, info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset <= offset &&
	    info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * we're freeing space in the middle of the info,
		 * this can happen during tree log replay
		 *
		 * first unlink the old info and then
		 * insert it again after the hole we're creating
		 */
		unlink_free_space(block_group, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(block_group, info);
			WARN_ON(ret);
			if (ret)
				goto out_lock;
		} else {
			/* the hole we're creating ends at the end
			 * of the info struct, just free the info
			 */
			kfree(info);
		}
		spin_unlock(&block_group->tree_lock);

		/* step two, insert a new info struct to cover
		 * anything before the hole
		 */
		ret = btrfs_add_free_space(block_group, old_start,
					   offset - old_start);
		WARN_ON(ret);
		goto out;
	}

	ret = remove_from_bitmap(block_group, info, &offset, &bytes);
	if (ret == -EAGAIN)
		goto again;
	BUG_ON(ret);
out_lock:
	spin_unlock(&block_group->tree_lock);
out:
	return ret;
}

void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes)
			count++;
		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
		       (unsigned long long)info->offset,
		       (unsigned long long)info->bytes,
		       (info->bitmap) ? "yes" : "no");
	}
	printk(KERN_INFO "block group has cluster?: %s\n",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
	       "\n", count);
}

u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 ret = 0;

	for (n = rb_first(&block_group->free_space_offset); n;
	     n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		ret += info->bytes;
	}

	return ret;
}

/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
static int
__btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	bool bitmap;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)
		goto out;

	bitmap = cluster->points_to_bitmap;
	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);
	cluster->points_to_bitmap = false;

	if (bitmap)
		goto out;

	node = rb_first(&cluster->root);
	while (node) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
		BUG_ON(entry->bitmap);
		try_merge_free_space(block_group, entry, false);
		tree_insert_offset(&block_group->free_space_offset,
				   entry->offset, &entry->offset_index, 0);
	}
	cluster->root = RB_ROOT;

out:
	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
	return 0;
}

void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *node;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&block_group->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}

	while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		unlink_free_space(block_group, info);
		if (info->bitmap)
			kfree(info->bitmap);
		kfree(info);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}

	spin_unlock(&block_group->tree_lock);
}
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;
	u64 ret = 0;

	spin_lock(&block_group->tree_lock);
	entry = find_free_space(block_group, &offset, &bytes_search, 0);
	if (!entry)
		goto out;

	ret = offset;
	if (entry->bitmap) {
		bitmap_clear_bits(block_group, entry, offset, bytes);
		if (!entry->bytes)
			free_bitmap(block_group, entry);
	} else {
		unlink_free_space(block_group, entry);
		entry->offset += bytes;
		entry->bytes -= bytes;
		if (!entry->bytes)
			kfree(entry);
		else
			link_free_space(block_group, entry);
	}

out:
	spin_unlock(&block_group->tree_lock);

	return ret;
}

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster)
{
	int ret;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return 0;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it don't redo their work */
		spin_unlock(&cluster->lock);
		return 0;
	}
	atomic_inc(&block_group->count);
	spin_unlock(&cluster->lock);

	/* now return any extents the cluster had on it */
	spin_lock(&block_group->tree_lock);
	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&block_group->tree_lock);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
	return ret;
}

static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
				   struct btrfs_free_cluster *cluster,
				   u64 bytes, u64 min_start)
{
	struct btrfs_free_space *entry;
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	spin_lock(&block_group->tree_lock);
	spin_lock(&cluster->lock);

	if (!cluster->points_to_bitmap)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	/*
	 * search_start is the beginning of the bitmap, but at some point it may
	 * be a good idea to point to the actual start of the free area in the
	 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
	 * to 1 to make sure we get the bitmap entry
	 */
	entry = tree_search_offset(block_group,
				   offset_to_bitmap(block_group, search_start),
				   1, 0);
	if (!entry || !entry->bitmap)
		goto out;

	search_start = min_start;
	search_bytes = bytes;

	err = search_bitmap(block_group, entry, &search_start,
			    &search_bytes);
	if (err)
		goto out;

	ret = search_start;
	bitmap_clear_bits(block_group, entry, ret, bytes);
	if (entry->bytes == 0)
		free_bitmap(block_group, entry);
out:
	spin_unlock(&cluster->lock);
	spin_unlock(&block_group->tree_lock);

	return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start)
{
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

	if (cluster->points_to_bitmap)
		return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
					       min_start);

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);

	while (1) {
		if (entry->bytes < bytes || entry->offset < min_start) {
			struct rb_node *node;

			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}
		ret = entry->offset;

		entry->offset += bytes;
		entry->bytes -= bytes;

		if (entry->bytes == 0)
			rb_erase(&entry->offset_index, &cluster->root);
		break;
	}
out:
	spin_unlock(&cluster->lock);

	if (!ret)
		return 0;

	spin_lock(&block_group->tree_lock);

	block_group->free_space -= bytes;
	if (entry->bytes == 0) {
		block_group->free_extents--;
		kfree(entry);
	}

	spin_unlock(&block_group->tree_lock);

	return ret;
}

static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
{
	unsigned long next_zero;
	unsigned long i;
	unsigned long search_bits;
	unsigned long total_bits;
	unsigned long found_bits;
	unsigned long start = 0;
	unsigned long total_found = 0;
	bool found = false;

	i = offset_to_bit(entry->offset, block_group->sectorsize,
			  max_t(u64, offset, entry->offset));
	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
	total_bits = bytes_to_bits(bytes, block_group->sectorsize);

again:
	found_bits = 0;
	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
		if (next_zero - i >= search_bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (!found_bits)
		return -1;

	if (!found) {
		start = i;
		found = true;
	}

	total_found += found_bits;

	if (cluster->max_size < found_bits * block_group->sectorsize)
		cluster->max_size = found_bits * block_group->sectorsize;

	if (total_found < total_bits) {
		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
		if (i - start > total_bits * 2) {
			total_found = 0;
			cluster->max_size = 0;
			found = false;
		}
		goto again;
	}

	cluster->window_start = start * block_group->sectorsize +
		entry->offset;
	cluster->points_to_bitmap = true;

	return 0;
}

/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -enospc
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	struct btrfs_free_space *next;
	struct btrfs_free_space *last = NULL;
	u64 min_bytes;
	u64 window_start;
	u64 window_free;
	u64 max_extent = 0;
	bool found_bitmap = false;
	int ret;

	/* for metadata, allow allocates with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * we want to do larger allocations when we are
		 * flushing out the delayed refs, it helps prevent
		 * making more work as we go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);

	spin_lock(&block_group->tree_lock);
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}
again:
	entry = tree_search_offset(block_group, offset, found_bitmap, 1);
	if (!entry) {
		ret = -ENOSPC;
		goto out;
	}

	/*
	 * If found_bitmap is true, we exhausted our search for extent entries,
	 * and we just want to search all of the bitmaps that we can find, and
	 * ignore any extent entries we find.
	 */
	while (entry->bitmap || found_bitmap ||
	       (!entry->bitmap && entry->bytes < min_bytes)) {
		struct rb_node *node = rb_next(&entry->offset_index);

		if (entry->bitmap && entry->bytes > bytes + empty_size) {
			ret = btrfs_bitmap_cluster(block_group, entry, cluster,
						   offset, bytes + empty_size,
						   min_bytes);
			if (!ret)
				goto got_it;
		}

		if (!node) {
			ret = -ENOSPC;
			goto out;
		}
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	/*
	 * We already searched all the extent entries from the passed in offset
	 * to the end and didn't find enough space for the cluster, and we also
	 * didn't find any bitmaps that met our criteria, just go ahead and exit
	 */
	if (found_bitmap) {
		ret = -ENOSPC;
		goto out;
	}

	cluster->points_to_bitmap = false;
	window_start = entry->offset;
	window_free = entry->bytes;
	last = entry;
	max_extent = entry->bytes;

	while (1) {
		/* our window is just right, lets fill it */
		if (window_free >= bytes + empty_size)
			break;

		node = rb_next(&last->offset_index);
		if (!node) {
			if (found_bitmap)
				goto again;
			ret = -ENOSPC;
			goto out;
		}
		next = rb_entry(node, struct btrfs_free_space, offset_index);

		/*
		 * we found a bitmap, so if this search doesn't result in a
		 * cluster, we know to go and search again for the bitmaps and
		 * start looking for space there
		 */
		if (next->bitmap) {
			if (!found_bitmap)
				offset = next->offset;
			found_bitmap = true;
			last = next;
			continue;
		}

		/*
		 * we haven't filled the empty size and the window is
		 * very large.  reset and try again
		 */
		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
		    next->offset - window_start > (bytes + empty_size) * 2) {
			entry = next;
			window_start = entry->offset;
			window_free = entry->bytes;
			last = entry;
			max_extent = entry->bytes;
		} else {
			last = next;
			window_free += next->bytes;
			if (entry->bytes > max_extent)
				max_extent = entry->bytes;
		}
	}

	cluster->window_start = entry->offset;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 *
	 * The cluster includes an rbtree, but only uses the offset index
	 * of each free space cache entry.
	 */
	while (1) {
		node = rb_next(&entry->offset_index);
		if (entry->bitmap && node) {
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		} else if (entry->bitmap && !node) {
			break;
		}

		rb_erase(&entry->offset_index, &block_group->free_space_offset);
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
		BUG_ON(ret);

		if (!node || entry == last)
			break;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	cluster->max_size = max_extent;
got_it:
	ret = 0;
	atomic_inc(&block_group->count);
	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
	cluster->block_group = block_group;
out:
	spin_unlock(&cluster->lock);
	spin_unlock(&block_group->tree_lock);

	return ret;
}

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	cluster->points_to_bitmap = false;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}