1 /* 2 * Copyright (C) 2008 Red Hat. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/pagemap.h> 20 #include <linux/sched.h> 21 #include <linux/slab.h> 22 #include <linux/math64.h> 23 #include <linux/ratelimit.h> 24 #include "ctree.h" 25 #include "free-space-cache.h" 26 #include "transaction.h" 27 #include "disk-io.h" 28 #include "extent_io.h" 29 #include "inode-map.h" 30 #include "volumes.h" 31 32 #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 33 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 34 35 struct btrfs_trim_range { 36 u64 start; 37 u64 bytes; 38 struct list_head list; 39 }; 40 41 static int link_free_space(struct btrfs_free_space_ctl *ctl, 42 struct btrfs_free_space *info); 43 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 44 struct btrfs_free_space *info); 45 46 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 47 struct btrfs_path *path, 48 u64 offset) 49 { 50 struct btrfs_key key; 51 struct btrfs_key location; 52 struct btrfs_disk_key disk_key; 53 struct btrfs_free_space_header *header; 54 struct extent_buffer *leaf; 55 struct inode *inode = NULL; 56 int ret; 57 58 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 59 key.offset = offset; 60 key.type = 0; 61 62 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 63 if (ret < 0) 64 return ERR_PTR(ret); 65 if (ret > 0) { 66 btrfs_release_path(path); 67 return ERR_PTR(-ENOENT); 68 } 69 70 leaf = path->nodes[0]; 71 header = btrfs_item_ptr(leaf, path->slots[0], 72 struct btrfs_free_space_header); 73 btrfs_free_space_key(leaf, header, &disk_key); 74 btrfs_disk_key_to_cpu(&location, &disk_key); 75 btrfs_release_path(path); 76 77 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); 78 if (!inode) 79 return ERR_PTR(-ENOENT); 80 if (IS_ERR(inode)) 81 return inode; 82 if (is_bad_inode(inode)) { 83 iput(inode); 84 return ERR_PTR(-ENOENT); 85 } 86 87 mapping_set_gfp_mask(inode->i_mapping, 88 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 89 90 return inode; 91 } 92 93 struct inode *lookup_free_space_inode(struct btrfs_root *root, 94 struct btrfs_block_group_cache 95 *block_group, struct btrfs_path *path) 96 { 97 struct inode *inode = NULL; 98 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 99 100 spin_lock(&block_group->lock); 101 if (block_group->inode) 102 inode = igrab(block_group->inode); 103 spin_unlock(&block_group->lock); 104 if (inode) 105 return inode; 106 107 inode = __lookup_free_space_inode(root, path, 108 block_group->key.objectid); 109 if (IS_ERR(inode)) 110 return inode; 111 112 spin_lock(&block_group->lock); 113 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 114 btrfs_info(root->fs_info, 115 "Old style space inode found, converting."); 116 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 117 BTRFS_INODE_NODATACOW; 118 block_group->disk_cache_state = BTRFS_DC_CLEAR; 119 } 120 121 if (!block_group->iref) { 122 
block_group->inode = igrab(inode); 123 block_group->iref = 1; 124 } 125 spin_unlock(&block_group->lock); 126 127 return inode; 128 } 129 130 static int __create_free_space_inode(struct btrfs_root *root, 131 struct btrfs_trans_handle *trans, 132 struct btrfs_path *path, 133 u64 ino, u64 offset) 134 { 135 struct btrfs_key key; 136 struct btrfs_disk_key disk_key; 137 struct btrfs_free_space_header *header; 138 struct btrfs_inode_item *inode_item; 139 struct extent_buffer *leaf; 140 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; 141 int ret; 142 143 ret = btrfs_insert_empty_inode(trans, root, path, ino); 144 if (ret) 145 return ret; 146 147 /* We inline crc's for the free disk space cache */ 148 if (ino != BTRFS_FREE_INO_OBJECTID) 149 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 150 151 leaf = path->nodes[0]; 152 inode_item = btrfs_item_ptr(leaf, path->slots[0], 153 struct btrfs_inode_item); 154 btrfs_item_key(leaf, &disk_key, path->slots[0]); 155 memset_extent_buffer(leaf, 0, (unsigned long)inode_item, 156 sizeof(*inode_item)); 157 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 158 btrfs_set_inode_size(leaf, inode_item, 0); 159 btrfs_set_inode_nbytes(leaf, inode_item, 0); 160 btrfs_set_inode_uid(leaf, inode_item, 0); 161 btrfs_set_inode_gid(leaf, inode_item, 0); 162 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 163 btrfs_set_inode_flags(leaf, inode_item, flags); 164 btrfs_set_inode_nlink(leaf, inode_item, 1); 165 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 166 btrfs_set_inode_block_group(leaf, inode_item, offset); 167 btrfs_mark_buffer_dirty(leaf); 168 btrfs_release_path(path); 169 170 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 171 key.offset = offset; 172 key.type = 0; 173 174 ret = btrfs_insert_empty_item(trans, root, path, &key, 175 sizeof(struct btrfs_free_space_header)); 176 if (ret < 0) { 177 btrfs_release_path(path); 178 return ret; 179 } 180 leaf = path->nodes[0]; 181 header = btrfs_item_ptr(leaf, path->slots[0], 182 struct btrfs_free_space_header); 183 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); 184 btrfs_set_free_space_key(leaf, header, &disk_key); 185 btrfs_mark_buffer_dirty(leaf); 186 btrfs_release_path(path); 187 188 return 0; 189 } 190 191 int create_free_space_inode(struct btrfs_root *root, 192 struct btrfs_trans_handle *trans, 193 struct btrfs_block_group_cache *block_group, 194 struct btrfs_path *path) 195 { 196 int ret; 197 u64 ino; 198 199 ret = btrfs_find_free_objectid(root, &ino); 200 if (ret < 0) 201 return ret; 202 203 return __create_free_space_inode(root, trans, path, ino, 204 block_group->key.objectid); 205 } 206 207 int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, 208 struct btrfs_block_rsv *rsv) 209 { 210 u64 needed_bytes; 211 int ret; 212 213 /* 1 for slack space, 1 for updating the inode */ 214 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + 215 btrfs_calc_trans_metadata_size(root, 1); 216 217 spin_lock(&rsv->lock); 218 if (rsv->reserved < needed_bytes) 219 ret = -ENOSPC; 220 else 221 ret = 0; 222 spin_unlock(&rsv->lock); 223 return ret; 224 } 225 226 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 227 struct btrfs_trans_handle *trans, 228 struct inode *inode) 229 { 230 int ret = 0; 231 232 btrfs_i_size_write(inode, 0); 233 truncate_pagecache(inode, 0); 234 235 /* 236 * We don't need an orphan item because truncating the free space cache 237 * will never be split across transactions. 
238 */ 239 ret = btrfs_truncate_inode_items(trans, root, inode, 240 0, BTRFS_EXTENT_DATA_KEY); 241 if (ret) { 242 btrfs_abort_transaction(trans, root, ret); 243 return ret; 244 } 245 246 ret = btrfs_update_inode(trans, root, inode); 247 if (ret) 248 btrfs_abort_transaction(trans, root, ret); 249 250 return ret; 251 } 252 253 static int readahead_cache(struct inode *inode) 254 { 255 struct file_ra_state *ra; 256 unsigned long last_index; 257 258 ra = kzalloc(sizeof(*ra), GFP_NOFS); 259 if (!ra) 260 return -ENOMEM; 261 262 file_ra_state_init(ra, inode->i_mapping); 263 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 264 265 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 266 267 kfree(ra); 268 269 return 0; 270 } 271 272 struct io_ctl { 273 void *cur, *orig; 274 struct page *page; 275 struct page **pages; 276 struct btrfs_root *root; 277 unsigned long size; 278 int index; 279 int num_pages; 280 unsigned check_crcs:1; 281 }; 282 283 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, 284 struct btrfs_root *root, int write) 285 { 286 int num_pages; 287 int check_crcs = 0; 288 289 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); 290 291 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 292 check_crcs = 1; 293 294 /* Make sure we can fit our crcs into the first page */ 295 if (write && check_crcs && 296 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) 297 return -ENOSPC; 298 299 memset(io_ctl, 0, sizeof(struct io_ctl)); 300 301 io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 302 if (!io_ctl->pages) 303 return -ENOMEM; 304 305 io_ctl->num_pages = num_pages; 306 io_ctl->root = root; 307 io_ctl->check_crcs = check_crcs; 308 309 return 0; 310 } 311 312 static void io_ctl_free(struct io_ctl *io_ctl) 313 { 314 kfree(io_ctl->pages); 315 } 316 317 static void io_ctl_unmap_page(struct io_ctl *io_ctl) 318 { 319 if (io_ctl->cur) { 320 kunmap(io_ctl->page); 321 io_ctl->cur = NULL; 322 io_ctl->orig = NULL; 323 } 324 } 325 326 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 327 { 328 ASSERT(io_ctl->index < io_ctl->num_pages); 329 io_ctl->page = io_ctl->pages[io_ctl->index++]; 330 io_ctl->cur = kmap(io_ctl->page); 331 io_ctl->orig = io_ctl->cur; 332 io_ctl->size = PAGE_CACHE_SIZE; 333 if (clear) 334 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); 335 } 336 337 static void io_ctl_drop_pages(struct io_ctl *io_ctl) 338 { 339 int i; 340 341 io_ctl_unmap_page(io_ctl); 342 343 for (i = 0; i < io_ctl->num_pages; i++) { 344 if (io_ctl->pages[i]) { 345 ClearPageChecked(io_ctl->pages[i]); 346 unlock_page(io_ctl->pages[i]); 347 page_cache_release(io_ctl->pages[i]); 348 } 349 } 350 } 351 352 static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, 353 int uptodate) 354 { 355 struct page *page; 356 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 357 int i; 358 359 for (i = 0; i < io_ctl->num_pages; i++) { 360 page = find_or_create_page(inode->i_mapping, i, mask); 361 if (!page) { 362 io_ctl_drop_pages(io_ctl); 363 return -ENOMEM; 364 } 365 io_ctl->pages[i] = page; 366 if (uptodate && !PageUptodate(page)) { 367 btrfs_readpage(NULL, page); 368 lock_page(page); 369 if (!PageUptodate(page)) { 370 btrfs_err(BTRFS_I(inode)->root->fs_info, 371 "error reading free space cache"); 372 io_ctl_drop_pages(io_ctl); 373 return -EIO; 374 } 375 } 376 } 377 378 for (i = 0; i < io_ctl->num_pages; i++) { 379 clear_page_dirty_for_io(io_ctl->pages[i]); 380 set_page_extent_mapped(io_ctl->pages[i]); 381 } 382 383 return 0; 384 } 385 386 
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 387 { 388 __le64 *val; 389 390 io_ctl_map_page(io_ctl, 1); 391 392 /* 393 * Skip the csum areas. If we don't check crcs then we just have a 394 * 64bit chunk at the front of the first page. 395 */ 396 if (io_ctl->check_crcs) { 397 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 398 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 399 } else { 400 io_ctl->cur += sizeof(u64); 401 io_ctl->size -= sizeof(u64) * 2; 402 } 403 404 val = io_ctl->cur; 405 *val = cpu_to_le64(generation); 406 io_ctl->cur += sizeof(u64); 407 } 408 409 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) 410 { 411 __le64 *gen; 412 413 /* 414 * Skip the crc area. If we don't check crcs then we just have a 64bit 415 * chunk at the front of the first page. 416 */ 417 if (io_ctl->check_crcs) { 418 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 419 io_ctl->size -= sizeof(u64) + 420 (sizeof(u32) * io_ctl->num_pages); 421 } else { 422 io_ctl->cur += sizeof(u64); 423 io_ctl->size -= sizeof(u64) * 2; 424 } 425 426 gen = io_ctl->cur; 427 if (le64_to_cpu(*gen) != generation) { 428 printk_ratelimited(KERN_ERR "BTRFS: space cache generation " 429 "(%Lu) does not match inode (%Lu)\n", *gen, 430 generation); 431 io_ctl_unmap_page(io_ctl); 432 return -EIO; 433 } 434 io_ctl->cur += sizeof(u64); 435 return 0; 436 } 437 438 static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) 439 { 440 u32 *tmp; 441 u32 crc = ~(u32)0; 442 unsigned offset = 0; 443 444 if (!io_ctl->check_crcs) { 445 io_ctl_unmap_page(io_ctl); 446 return; 447 } 448 449 if (index == 0) 450 offset = sizeof(u32) * io_ctl->num_pages; 451 452 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 453 PAGE_CACHE_SIZE - offset); 454 btrfs_csum_final(crc, (char *)&crc); 455 io_ctl_unmap_page(io_ctl); 456 tmp = kmap(io_ctl->pages[0]); 457 tmp += index; 458 *tmp = crc; 459 kunmap(io_ctl->pages[0]); 460 } 461 462 static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) 463 { 464 u32 *tmp, val; 465 u32 crc = ~(u32)0; 466 unsigned offset = 0; 467 468 if (!io_ctl->check_crcs) { 469 io_ctl_map_page(io_ctl, 0); 470 return 0; 471 } 472 473 if (index == 0) 474 offset = sizeof(u32) * io_ctl->num_pages; 475 476 tmp = kmap(io_ctl->pages[0]); 477 tmp += index; 478 val = *tmp; 479 kunmap(io_ctl->pages[0]); 480 481 io_ctl_map_page(io_ctl, 0); 482 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 483 PAGE_CACHE_SIZE - offset); 484 btrfs_csum_final(crc, (char *)&crc); 485 if (val != crc) { 486 printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " 487 "space cache\n"); 488 io_ctl_unmap_page(io_ctl); 489 return -EIO; 490 } 491 492 return 0; 493 } 494 495 static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, 496 void *bitmap) 497 { 498 struct btrfs_free_space_entry *entry; 499 500 if (!io_ctl->cur) 501 return -ENOSPC; 502 503 entry = io_ctl->cur; 504 entry->offset = cpu_to_le64(offset); 505 entry->bytes = cpu_to_le64(bytes); 506 entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : 507 BTRFS_FREE_SPACE_EXTENT; 508 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 509 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 510 511 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 512 return 0; 513 514 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 515 516 /* No more pages to map */ 517 if (io_ctl->index >= io_ctl->num_pages) 518 return 0; 519 520 /* map the next page */ 521 io_ctl_map_page(io_ctl, 1); 522 return 0; 523 } 524 525 static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) 526 { 527 if (!io_ctl->cur) 528 return -ENOSPC; 529 530 /* 531 * If we aren't at the start of the current page, unmap this one and 532 * map the next one if there is any left. 533 */ 534 if (io_ctl->cur != io_ctl->orig) { 535 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 536 if (io_ctl->index >= io_ctl->num_pages) 537 return -ENOSPC; 538 io_ctl_map_page(io_ctl, 0); 539 } 540 541 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); 542 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 543 if (io_ctl->index < io_ctl->num_pages) 544 io_ctl_map_page(io_ctl, 0); 545 return 0; 546 } 547 548 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) 549 { 550 /* 551 * If we're not on the boundary we know we've modified the page and we 552 * need to crc the page. 553 */ 554 if (io_ctl->cur != io_ctl->orig) 555 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 556 else 557 io_ctl_unmap_page(io_ctl); 558 559 while (io_ctl->index < io_ctl->num_pages) { 560 io_ctl_map_page(io_ctl, 1); 561 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 562 } 563 } 564 565 static int io_ctl_read_entry(struct io_ctl *io_ctl, 566 struct btrfs_free_space *entry, u8 *type) 567 { 568 struct btrfs_free_space_entry *e; 569 int ret; 570 571 if (!io_ctl->cur) { 572 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 573 if (ret) 574 return ret; 575 } 576 577 e = io_ctl->cur; 578 entry->offset = le64_to_cpu(e->offset); 579 entry->bytes = le64_to_cpu(e->bytes); 580 *type = e->type; 581 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 582 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 583 584 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 585 return 0; 586 587 io_ctl_unmap_page(io_ctl); 588 589 return 0; 590 } 591 592 static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 593 struct btrfs_free_space *entry) 594 { 595 int ret; 596 597 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 598 if (ret) 599 return ret; 600 601 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); 602 io_ctl_unmap_page(io_ctl); 603 604 return 0; 605 } 606 607 /* 608 * Since we attach pinned extents after the fact we can have contiguous sections 609 * of free space that are split up in entries. This poses a problem with the 610 * tree logging stuff since it could have allocated across what appears to be 2 611 * entries since we would have merged the entries when adding the pinned extents 612 * back to the free space cache. So run through the space cache that we just 613 * loaded and merge contiguous entries. This will make the log replay stuff not 614 * blow up and it will make for nicer allocator behavior. 
615 */ 616 static void merge_space_tree(struct btrfs_free_space_ctl *ctl) 617 { 618 struct btrfs_free_space *e, *prev = NULL; 619 struct rb_node *n; 620 621 again: 622 spin_lock(&ctl->tree_lock); 623 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 624 e = rb_entry(n, struct btrfs_free_space, offset_index); 625 if (!prev) 626 goto next; 627 if (e->bitmap || prev->bitmap) 628 goto next; 629 if (prev->offset + prev->bytes == e->offset) { 630 unlink_free_space(ctl, prev); 631 unlink_free_space(ctl, e); 632 prev->bytes += e->bytes; 633 kmem_cache_free(btrfs_free_space_cachep, e); 634 link_free_space(ctl, prev); 635 prev = NULL; 636 spin_unlock(&ctl->tree_lock); 637 goto again; 638 } 639 next: 640 prev = e; 641 } 642 spin_unlock(&ctl->tree_lock); 643 } 644 645 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 646 struct btrfs_free_space_ctl *ctl, 647 struct btrfs_path *path, u64 offset) 648 { 649 struct btrfs_free_space_header *header; 650 struct extent_buffer *leaf; 651 struct io_ctl io_ctl; 652 struct btrfs_key key; 653 struct btrfs_free_space *e, *n; 654 LIST_HEAD(bitmaps); 655 u64 num_entries; 656 u64 num_bitmaps; 657 u64 generation; 658 u8 type; 659 int ret = 0; 660 661 /* Nothing in the space cache, goodbye */ 662 if (!i_size_read(inode)) 663 return 0; 664 665 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 666 key.offset = offset; 667 key.type = 0; 668 669 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 670 if (ret < 0) 671 return 0; 672 else if (ret > 0) { 673 btrfs_release_path(path); 674 return 0; 675 } 676 677 ret = -1; 678 679 leaf = path->nodes[0]; 680 header = btrfs_item_ptr(leaf, path->slots[0], 681 struct btrfs_free_space_header); 682 num_entries = btrfs_free_space_entries(leaf, header); 683 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 684 generation = btrfs_free_space_generation(leaf, header); 685 btrfs_release_path(path); 686 687 if (!BTRFS_I(inode)->generation) { 688 btrfs_info(root->fs_info, 689 "The free space cache file (%llu) is invalid. 
skip it\n", 690 offset); 691 return 0; 692 } 693 694 if (BTRFS_I(inode)->generation != generation) { 695 btrfs_err(root->fs_info, 696 "free space inode generation (%llu) " 697 "did not match free space cache generation (%llu)", 698 BTRFS_I(inode)->generation, generation); 699 return 0; 700 } 701 702 if (!num_entries) 703 return 0; 704 705 ret = io_ctl_init(&io_ctl, inode, root, 0); 706 if (ret) 707 return ret; 708 709 ret = readahead_cache(inode); 710 if (ret) 711 goto out; 712 713 ret = io_ctl_prepare_pages(&io_ctl, inode, 1); 714 if (ret) 715 goto out; 716 717 ret = io_ctl_check_crc(&io_ctl, 0); 718 if (ret) 719 goto free_cache; 720 721 ret = io_ctl_check_generation(&io_ctl, generation); 722 if (ret) 723 goto free_cache; 724 725 while (num_entries) { 726 e = kmem_cache_zalloc(btrfs_free_space_cachep, 727 GFP_NOFS); 728 if (!e) 729 goto free_cache; 730 731 ret = io_ctl_read_entry(&io_ctl, e, &type); 732 if (ret) { 733 kmem_cache_free(btrfs_free_space_cachep, e); 734 goto free_cache; 735 } 736 737 if (!e->bytes) { 738 kmem_cache_free(btrfs_free_space_cachep, e); 739 goto free_cache; 740 } 741 742 if (type == BTRFS_FREE_SPACE_EXTENT) { 743 spin_lock(&ctl->tree_lock); 744 ret = link_free_space(ctl, e); 745 spin_unlock(&ctl->tree_lock); 746 if (ret) { 747 btrfs_err(root->fs_info, 748 "Duplicate entries in free space cache, dumping"); 749 kmem_cache_free(btrfs_free_space_cachep, e); 750 goto free_cache; 751 } 752 } else { 753 ASSERT(num_bitmaps); 754 num_bitmaps--; 755 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 756 if (!e->bitmap) { 757 kmem_cache_free( 758 btrfs_free_space_cachep, e); 759 goto free_cache; 760 } 761 spin_lock(&ctl->tree_lock); 762 ret = link_free_space(ctl, e); 763 ctl->total_bitmaps++; 764 ctl->op->recalc_thresholds(ctl); 765 spin_unlock(&ctl->tree_lock); 766 if (ret) { 767 btrfs_err(root->fs_info, 768 "Duplicate entries in free space cache, dumping"); 769 kmem_cache_free(btrfs_free_space_cachep, e); 770 goto free_cache; 771 } 772 list_add_tail(&e->list, &bitmaps); 773 } 774 775 num_entries--; 776 } 777 778 io_ctl_unmap_page(&io_ctl); 779 780 /* 781 * We add the bitmaps at the end of the entries in order that 782 * the bitmap entries are added to the cache. 783 */ 784 list_for_each_entry_safe(e, n, &bitmaps, list) { 785 list_del_init(&e->list); 786 ret = io_ctl_read_bitmap(&io_ctl, e); 787 if (ret) 788 goto free_cache; 789 } 790 791 io_ctl_drop_pages(&io_ctl); 792 merge_space_tree(ctl); 793 ret = 1; 794 out: 795 io_ctl_free(&io_ctl); 796 return ret; 797 free_cache: 798 io_ctl_drop_pages(&io_ctl); 799 __btrfs_remove_free_space_cache(ctl); 800 goto out; 801 } 802 803 int load_free_space_cache(struct btrfs_fs_info *fs_info, 804 struct btrfs_block_group_cache *block_group) 805 { 806 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 807 struct btrfs_root *root = fs_info->tree_root; 808 struct inode *inode; 809 struct btrfs_path *path; 810 int ret = 0; 811 bool matched; 812 u64 used = btrfs_block_group_used(&block_group->item); 813 814 /* 815 * If this block group has been marked to be cleared for one reason or 816 * another then we can't trust the on disk cache, so just return. 
817 */ 818 spin_lock(&block_group->lock); 819 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 820 spin_unlock(&block_group->lock); 821 return 0; 822 } 823 spin_unlock(&block_group->lock); 824 825 path = btrfs_alloc_path(); 826 if (!path) 827 return 0; 828 path->search_commit_root = 1; 829 path->skip_locking = 1; 830 831 inode = lookup_free_space_inode(root, block_group, path); 832 if (IS_ERR(inode)) { 833 btrfs_free_path(path); 834 return 0; 835 } 836 837 /* We may have converted the inode and made the cache invalid. */ 838 spin_lock(&block_group->lock); 839 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 840 spin_unlock(&block_group->lock); 841 btrfs_free_path(path); 842 goto out; 843 } 844 spin_unlock(&block_group->lock); 845 846 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 847 path, block_group->key.objectid); 848 btrfs_free_path(path); 849 if (ret <= 0) 850 goto out; 851 852 spin_lock(&ctl->tree_lock); 853 matched = (ctl->free_space == (block_group->key.offset - used - 854 block_group->bytes_super)); 855 spin_unlock(&ctl->tree_lock); 856 857 if (!matched) { 858 __btrfs_remove_free_space_cache(ctl); 859 btrfs_warn(fs_info, "block group %llu has wrong amount of free space", 860 block_group->key.objectid); 861 ret = -1; 862 } 863 out: 864 if (ret < 0) { 865 /* This cache is bogus, make sure it gets cleared */ 866 spin_lock(&block_group->lock); 867 block_group->disk_cache_state = BTRFS_DC_CLEAR; 868 spin_unlock(&block_group->lock); 869 ret = 0; 870 871 btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", 872 block_group->key.objectid); 873 } 874 875 iput(inode); 876 return ret; 877 } 878 879 static noinline_for_stack 880 int write_cache_extent_entries(struct io_ctl *io_ctl, 881 struct btrfs_free_space_ctl *ctl, 882 struct btrfs_block_group_cache *block_group, 883 int *entries, int *bitmaps, 884 struct list_head *bitmap_list) 885 { 886 int ret; 887 struct btrfs_free_cluster *cluster = NULL; 888 struct rb_node *node = rb_first(&ctl->free_space_offset); 889 struct btrfs_trim_range *trim_entry; 890 891 /* Get the cluster for this block_group if it exists */ 892 if (block_group && !list_empty(&block_group->cluster_list)) { 893 cluster = list_entry(block_group->cluster_list.next, 894 struct btrfs_free_cluster, 895 block_group_list); 896 } 897 898 if (!node && cluster) { 899 node = rb_first(&cluster->root); 900 cluster = NULL; 901 } 902 903 /* Write out the extent entries */ 904 while (node) { 905 struct btrfs_free_space *e; 906 907 e = rb_entry(node, struct btrfs_free_space, offset_index); 908 *entries += 1; 909 910 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 911 e->bitmap); 912 if (ret) 913 goto fail; 914 915 if (e->bitmap) { 916 list_add_tail(&e->list, bitmap_list); 917 *bitmaps += 1; 918 } 919 node = rb_next(node); 920 if (!node && cluster) { 921 node = rb_first(&cluster->root); 922 cluster = NULL; 923 } 924 } 925 926 /* 927 * Make sure we don't miss any range that was removed from our rbtree 928 * because trimming is running. Otherwise after a umount+mount (or crash 929 * after committing the transaction) we would leak free space and get 930 * an inconsistent free space cache report from fsck. 
931 */ 932 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 933 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 934 trim_entry->bytes, NULL); 935 if (ret) 936 goto fail; 937 *entries += 1; 938 } 939 940 return 0; 941 fail: 942 return -ENOSPC; 943 } 944 945 static noinline_for_stack int 946 update_cache_item(struct btrfs_trans_handle *trans, 947 struct btrfs_root *root, 948 struct inode *inode, 949 struct btrfs_path *path, u64 offset, 950 int entries, int bitmaps) 951 { 952 struct btrfs_key key; 953 struct btrfs_free_space_header *header; 954 struct extent_buffer *leaf; 955 int ret; 956 957 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 958 key.offset = offset; 959 key.type = 0; 960 961 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 962 if (ret < 0) { 963 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 964 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 965 GFP_NOFS); 966 goto fail; 967 } 968 leaf = path->nodes[0]; 969 if (ret > 0) { 970 struct btrfs_key found_key; 971 ASSERT(path->slots[0]); 972 path->slots[0]--; 973 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 974 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 975 found_key.offset != offset) { 976 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 977 inode->i_size - 1, 978 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 979 NULL, GFP_NOFS); 980 btrfs_release_path(path); 981 goto fail; 982 } 983 } 984 985 BTRFS_I(inode)->generation = trans->transid; 986 header = btrfs_item_ptr(leaf, path->slots[0], 987 struct btrfs_free_space_header); 988 btrfs_set_free_space_entries(leaf, header, entries); 989 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 990 btrfs_set_free_space_generation(leaf, header, trans->transid); 991 btrfs_mark_buffer_dirty(leaf); 992 btrfs_release_path(path); 993 994 return 0; 995 996 fail: 997 return -1; 998 } 999 1000 static noinline_for_stack int 1001 write_pinned_extent_entries(struct btrfs_root *root, 1002 struct btrfs_block_group_cache *block_group, 1003 struct io_ctl *io_ctl, 1004 int *entries) 1005 { 1006 u64 start, extent_start, extent_end, len; 1007 struct extent_io_tree *unpin = NULL; 1008 int ret; 1009 1010 if (!block_group) 1011 return 0; 1012 1013 /* 1014 * We want to add any pinned extents to our free space cache 1015 * so we don't leak the space 1016 * 1017 * We shouldn't have switched the pinned extents yet so this is the 1018 * right one 1019 */ 1020 unpin = root->fs_info->pinned_extents; 1021 1022 start = block_group->key.objectid; 1023 1024 while (start < block_group->key.objectid + block_group->key.offset) { 1025 ret = find_first_extent_bit(unpin, start, 1026 &extent_start, &extent_end, 1027 EXTENT_DIRTY, NULL); 1028 if (ret) 1029 return 0; 1030 1031 /* This pinned extent is out of our range */ 1032 if (extent_start >= block_group->key.objectid + 1033 block_group->key.offset) 1034 return 0; 1035 1036 extent_start = max(extent_start, start); 1037 extent_end = min(block_group->key.objectid + 1038 block_group->key.offset, extent_end + 1); 1039 len = extent_end - extent_start; 1040 1041 *entries += 1; 1042 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1043 if (ret) 1044 return -ENOSPC; 1045 1046 start = extent_end; 1047 } 1048 1049 return 0; 1050 } 1051 1052 static noinline_for_stack int 1053 write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) 1054 { 1055 struct list_head *pos, *n; 1056 int ret; 1057 1058 /* Write out the bitmaps */ 1059 list_for_each_safe(pos, n, bitmap_list) { 1060 struct btrfs_free_space *entry = 1061 
list_entry(pos, struct btrfs_free_space, list); 1062 1063 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1064 if (ret) 1065 return -ENOSPC; 1066 list_del_init(&entry->list); 1067 } 1068 1069 return 0; 1070 } 1071 1072 static int flush_dirty_cache(struct inode *inode) 1073 { 1074 int ret; 1075 1076 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1077 if (ret) 1078 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1079 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 1080 GFP_NOFS); 1081 1082 return ret; 1083 } 1084 1085 static void noinline_for_stack 1086 cleanup_write_cache_enospc(struct inode *inode, 1087 struct io_ctl *io_ctl, 1088 struct extent_state **cached_state, 1089 struct list_head *bitmap_list) 1090 { 1091 struct list_head *pos, *n; 1092 1093 list_for_each_safe(pos, n, bitmap_list) { 1094 struct btrfs_free_space *entry = 1095 list_entry(pos, struct btrfs_free_space, list); 1096 list_del_init(&entry->list); 1097 } 1098 io_ctl_drop_pages(io_ctl); 1099 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1100 i_size_read(inode) - 1, cached_state, 1101 GFP_NOFS); 1102 } 1103 1104 /** 1105 * __btrfs_write_out_cache - write out cached info to an inode 1106 * @root - the root the inode belongs to 1107 * @ctl - the free space cache we are going to write out 1108 * @block_group - the block_group for this cache if it belongs to a block_group 1109 * @trans - the trans handle 1110 * @path - the path to use 1111 * @offset - the offset for the key we'll insert 1112 * 1113 * This function writes out a free space cache struct to disk for quick recovery 1114 * on mount. This will return 0 if it was successfull in writing the cache out, 1115 * and -1 if it was not. 1116 */ 1117 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1118 struct btrfs_free_space_ctl *ctl, 1119 struct btrfs_block_group_cache *block_group, 1120 struct btrfs_trans_handle *trans, 1121 struct btrfs_path *path, u64 offset) 1122 { 1123 struct extent_state *cached_state = NULL; 1124 struct io_ctl io_ctl; 1125 LIST_HEAD(bitmap_list); 1126 int entries = 0; 1127 int bitmaps = 0; 1128 int ret; 1129 1130 if (!i_size_read(inode)) 1131 return -1; 1132 1133 ret = io_ctl_init(&io_ctl, inode, root, 1); 1134 if (ret) 1135 return -1; 1136 1137 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1138 down_write(&block_group->data_rwsem); 1139 spin_lock(&block_group->lock); 1140 if (block_group->delalloc_bytes) { 1141 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1142 spin_unlock(&block_group->lock); 1143 up_write(&block_group->data_rwsem); 1144 BTRFS_I(inode)->generation = 0; 1145 ret = 0; 1146 goto out; 1147 } 1148 spin_unlock(&block_group->lock); 1149 } 1150 1151 /* Lock all pages first so we can lock the extent safely. */ 1152 io_ctl_prepare_pages(&io_ctl, inode, 0); 1153 1154 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1155 0, &cached_state); 1156 1157 io_ctl_set_generation(&io_ctl, trans->transid); 1158 1159 mutex_lock(&ctl->cache_writeout_mutex); 1160 /* Write out the extent entries in the free space cache */ 1161 ret = write_cache_extent_entries(&io_ctl, ctl, 1162 block_group, &entries, &bitmaps, 1163 &bitmap_list); 1164 if (ret) { 1165 mutex_unlock(&ctl->cache_writeout_mutex); 1166 goto out_nospc; 1167 } 1168 1169 /* 1170 * Some spaces that are freed in the current transaction are pinned, 1171 * they will be added into free space cache after the transaction is 1172 * committed, we shouldn't lose them. 
1173 */ 1174 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1175 if (ret) { 1176 mutex_unlock(&ctl->cache_writeout_mutex); 1177 goto out_nospc; 1178 } 1179 1180 /* 1181 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1182 * locked while doing it because a concurrent trim can be manipulating 1183 * or freeing the bitmap. 1184 */ 1185 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1186 mutex_unlock(&ctl->cache_writeout_mutex); 1187 if (ret) 1188 goto out_nospc; 1189 1190 /* Zero out the rest of the pages just to make sure */ 1191 io_ctl_zero_remaining_pages(&io_ctl); 1192 1193 /* Everything is written out, now we dirty the pages in the file. */ 1194 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1195 0, i_size_read(inode), &cached_state); 1196 if (ret) 1197 goto out_nospc; 1198 1199 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1200 up_write(&block_group->data_rwsem); 1201 /* 1202 * Release the pages and unlock the extent, we will flush 1203 * them out later 1204 */ 1205 io_ctl_drop_pages(&io_ctl); 1206 1207 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1208 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1209 1210 /* Flush the dirty pages in the cache file. */ 1211 ret = flush_dirty_cache(inode); 1212 if (ret) 1213 goto out; 1214 1215 /* Update the cache item to tell everyone this cache file is valid. */ 1216 ret = update_cache_item(trans, root, inode, path, offset, 1217 entries, bitmaps); 1218 out: 1219 io_ctl_free(&io_ctl); 1220 if (ret) { 1221 invalidate_inode_pages2(inode->i_mapping); 1222 BTRFS_I(inode)->generation = 0; 1223 } 1224 btrfs_update_inode(trans, root, inode); 1225 return ret; 1226 1227 out_nospc: 1228 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); 1229 1230 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1231 up_write(&block_group->data_rwsem); 1232 1233 goto out; 1234 } 1235 1236 int btrfs_write_out_cache(struct btrfs_root *root, 1237 struct btrfs_trans_handle *trans, 1238 struct btrfs_block_group_cache *block_group, 1239 struct btrfs_path *path) 1240 { 1241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1242 struct inode *inode; 1243 int ret = 0; 1244 enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; 1245 1246 root = root->fs_info->tree_root; 1247 1248 spin_lock(&block_group->lock); 1249 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1250 spin_unlock(&block_group->lock); 1251 return 0; 1252 } 1253 1254 if (block_group->delalloc_bytes) { 1255 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1256 spin_unlock(&block_group->lock); 1257 return 0; 1258 } 1259 spin_unlock(&block_group->lock); 1260 1261 inode = lookup_free_space_inode(root, block_group, path); 1262 if (IS_ERR(inode)) 1263 return 0; 1264 1265 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1266 path, block_group->key.objectid); 1267 if (ret) { 1268 dcs = BTRFS_DC_ERROR; 1269 ret = 0; 1270 #ifdef DEBUG 1271 btrfs_err(root->fs_info, 1272 "failed to write free space cache for block group %llu", 1273 block_group->key.objectid); 1274 #endif 1275 } 1276 1277 spin_lock(&block_group->lock); 1278 block_group->disk_cache_state = dcs; 1279 spin_unlock(&block_group->lock); 1280 iput(inode); 1281 return ret; 1282 } 1283 1284 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1285 u64 offset) 1286 { 1287 ASSERT(offset >= bitmap_start); 1288 offset -= bitmap_start; 1289 return (unsigned long)(div_u64(offset, unit)); 1290 } 
1291 1292 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1293 { 1294 return (unsigned long)(div_u64(bytes, unit)); 1295 } 1296 1297 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1298 u64 offset) 1299 { 1300 u64 bitmap_start; 1301 u64 bytes_per_bitmap; 1302 1303 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1304 bitmap_start = offset - ctl->start; 1305 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1306 bitmap_start *= bytes_per_bitmap; 1307 bitmap_start += ctl->start; 1308 1309 return bitmap_start; 1310 } 1311 1312 static int tree_insert_offset(struct rb_root *root, u64 offset, 1313 struct rb_node *node, int bitmap) 1314 { 1315 struct rb_node **p = &root->rb_node; 1316 struct rb_node *parent = NULL; 1317 struct btrfs_free_space *info; 1318 1319 while (*p) { 1320 parent = *p; 1321 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1322 1323 if (offset < info->offset) { 1324 p = &(*p)->rb_left; 1325 } else if (offset > info->offset) { 1326 p = &(*p)->rb_right; 1327 } else { 1328 /* 1329 * we could have a bitmap entry and an extent entry 1330 * share the same offset. If this is the case, we want 1331 * the extent entry to always be found first if we do a 1332 * linear search through the tree, since we want to have 1333 * the quickest allocation time, and allocating from an 1334 * extent is faster than allocating from a bitmap. So 1335 * if we're inserting a bitmap and we find an entry at 1336 * this offset, we want to go right, or after this entry 1337 * logically. If we are inserting an extent and we've 1338 * found a bitmap, we want to go left, or before 1339 * logically. 1340 */ 1341 if (bitmap) { 1342 if (info->bitmap) { 1343 WARN_ON_ONCE(1); 1344 return -EEXIST; 1345 } 1346 p = &(*p)->rb_right; 1347 } else { 1348 if (!info->bitmap) { 1349 WARN_ON_ONCE(1); 1350 return -EEXIST; 1351 } 1352 p = &(*p)->rb_left; 1353 } 1354 } 1355 } 1356 1357 rb_link_node(node, parent, p); 1358 rb_insert_color(node, root); 1359 1360 return 0; 1361 } 1362 1363 /* 1364 * searches the tree for the given offset. 1365 * 1366 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1367 * want a section that has at least bytes size and comes at or after the given 1368 * offset. 1369 */ 1370 static struct btrfs_free_space * 1371 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1372 u64 offset, int bitmap_only, int fuzzy) 1373 { 1374 struct rb_node *n = ctl->free_space_offset.rb_node; 1375 struct btrfs_free_space *entry, *prev = NULL; 1376 1377 /* find entry that is closest to the 'offset' */ 1378 while (1) { 1379 if (!n) { 1380 entry = NULL; 1381 break; 1382 } 1383 1384 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1385 prev = entry; 1386 1387 if (offset < entry->offset) 1388 n = n->rb_left; 1389 else if (offset > entry->offset) 1390 n = n->rb_right; 1391 else 1392 break; 1393 } 1394 1395 if (bitmap_only) { 1396 if (!entry) 1397 return NULL; 1398 if (entry->bitmap) 1399 return entry; 1400 1401 /* 1402 * bitmap entry and extent entry may share same offset, 1403 * in that case, bitmap entry comes after extent entry. 
1404 */ 1405 n = rb_next(n); 1406 if (!n) 1407 return NULL; 1408 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1409 if (entry->offset != offset) 1410 return NULL; 1411 1412 WARN_ON(!entry->bitmap); 1413 return entry; 1414 } else if (entry) { 1415 if (entry->bitmap) { 1416 /* 1417 * if previous extent entry covers the offset, 1418 * we should return it instead of the bitmap entry 1419 */ 1420 n = rb_prev(&entry->offset_index); 1421 if (n) { 1422 prev = rb_entry(n, struct btrfs_free_space, 1423 offset_index); 1424 if (!prev->bitmap && 1425 prev->offset + prev->bytes > offset) 1426 entry = prev; 1427 } 1428 } 1429 return entry; 1430 } 1431 1432 if (!prev) 1433 return NULL; 1434 1435 /* find last entry before the 'offset' */ 1436 entry = prev; 1437 if (entry->offset > offset) { 1438 n = rb_prev(&entry->offset_index); 1439 if (n) { 1440 entry = rb_entry(n, struct btrfs_free_space, 1441 offset_index); 1442 ASSERT(entry->offset <= offset); 1443 } else { 1444 if (fuzzy) 1445 return entry; 1446 else 1447 return NULL; 1448 } 1449 } 1450 1451 if (entry->bitmap) { 1452 n = rb_prev(&entry->offset_index); 1453 if (n) { 1454 prev = rb_entry(n, struct btrfs_free_space, 1455 offset_index); 1456 if (!prev->bitmap && 1457 prev->offset + prev->bytes > offset) 1458 return prev; 1459 } 1460 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1461 return entry; 1462 } else if (entry->offset + entry->bytes > offset) 1463 return entry; 1464 1465 if (!fuzzy) 1466 return NULL; 1467 1468 while (1) { 1469 if (entry->bitmap) { 1470 if (entry->offset + BITS_PER_BITMAP * 1471 ctl->unit > offset) 1472 break; 1473 } else { 1474 if (entry->offset + entry->bytes > offset) 1475 break; 1476 } 1477 1478 n = rb_next(&entry->offset_index); 1479 if (!n) 1480 return NULL; 1481 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1482 } 1483 return entry; 1484 } 1485 1486 static inline void 1487 __unlink_free_space(struct btrfs_free_space_ctl *ctl, 1488 struct btrfs_free_space *info) 1489 { 1490 rb_erase(&info->offset_index, &ctl->free_space_offset); 1491 ctl->free_extents--; 1492 } 1493 1494 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1495 struct btrfs_free_space *info) 1496 { 1497 __unlink_free_space(ctl, info); 1498 ctl->free_space -= info->bytes; 1499 } 1500 1501 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1502 struct btrfs_free_space *info) 1503 { 1504 int ret = 0; 1505 1506 ASSERT(info->bytes || info->bitmap); 1507 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1508 &info->offset_index, (info->bitmap != NULL)); 1509 if (ret) 1510 return ret; 1511 1512 ctl->free_space += info->bytes; 1513 ctl->free_extents++; 1514 return ret; 1515 } 1516 1517 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 1518 { 1519 struct btrfs_block_group_cache *block_group = ctl->private; 1520 u64 max_bytes; 1521 u64 bitmap_bytes; 1522 u64 extent_bytes; 1523 u64 size = block_group->key.offset; 1524 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1525 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1526 1527 max_bitmaps = max(max_bitmaps, 1); 1528 1529 ASSERT(ctl->total_bitmaps <= max_bitmaps); 1530 1531 /* 1532 * The goal is to keep the total amount of memory used per 1gb of space 1533 * at or below 32k, so we need to adjust how much memory we allow to be 1534 * used by extent based free space tracking 1535 */ 1536 if (size < 1024 * 1024 * 1024) 1537 max_bytes = MAX_CACHE_BYTES_PER_GIG; 1538 else 1539 max_bytes = MAX_CACHE_BYTES_PER_GIG 
* 1540 div64_u64(size, 1024 * 1024 * 1024); 1541 1542 /* 1543 * we want to account for 1 more bitmap than what we have so we can make 1544 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as 1545 * we add more bitmaps. 1546 */ 1547 bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; 1548 1549 if (bitmap_bytes >= max_bytes) { 1550 ctl->extents_thresh = 0; 1551 return; 1552 } 1553 1554 /* 1555 * we want the extent entry threshold to always be at most 1/2 the maxw 1556 * bytes we can have, or whatever is less than that. 1557 */ 1558 extent_bytes = max_bytes - bitmap_bytes; 1559 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1560 1561 ctl->extents_thresh = 1562 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1563 } 1564 1565 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1566 struct btrfs_free_space *info, 1567 u64 offset, u64 bytes) 1568 { 1569 unsigned long start, count; 1570 1571 start = offset_to_bit(info->offset, ctl->unit, offset); 1572 count = bytes_to_bits(bytes, ctl->unit); 1573 ASSERT(start + count <= BITS_PER_BITMAP); 1574 1575 bitmap_clear(info->bitmap, start, count); 1576 1577 info->bytes -= bytes; 1578 } 1579 1580 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1581 struct btrfs_free_space *info, u64 offset, 1582 u64 bytes) 1583 { 1584 __bitmap_clear_bits(ctl, info, offset, bytes); 1585 ctl->free_space -= bytes; 1586 } 1587 1588 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1589 struct btrfs_free_space *info, u64 offset, 1590 u64 bytes) 1591 { 1592 unsigned long start, count; 1593 1594 start = offset_to_bit(info->offset, ctl->unit, offset); 1595 count = bytes_to_bits(bytes, ctl->unit); 1596 ASSERT(start + count <= BITS_PER_BITMAP); 1597 1598 bitmap_set(info->bitmap, start, count); 1599 1600 info->bytes += bytes; 1601 ctl->free_space += bytes; 1602 } 1603 1604 /* 1605 * If we can not find suitable extent, we will use bytes to record 1606 * the size of the max extent. 
1607 */ 1608 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1609 struct btrfs_free_space *bitmap_info, u64 *offset, 1610 u64 *bytes) 1611 { 1612 unsigned long found_bits = 0; 1613 unsigned long max_bits = 0; 1614 unsigned long bits, i; 1615 unsigned long next_zero; 1616 unsigned long extent_bits; 1617 1618 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1619 max_t(u64, *offset, bitmap_info->offset)); 1620 bits = bytes_to_bits(*bytes, ctl->unit); 1621 1622 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1623 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1624 BITS_PER_BITMAP, i); 1625 extent_bits = next_zero - i; 1626 if (extent_bits >= bits) { 1627 found_bits = extent_bits; 1628 break; 1629 } else if (extent_bits > max_bits) { 1630 max_bits = extent_bits; 1631 } 1632 i = next_zero; 1633 } 1634 1635 if (found_bits) { 1636 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1637 *bytes = (u64)(found_bits) * ctl->unit; 1638 return 0; 1639 } 1640 1641 *bytes = (u64)(max_bits) * ctl->unit; 1642 return -1; 1643 } 1644 1645 /* Cache the size of the max extent in bytes */ 1646 static struct btrfs_free_space * 1647 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1648 unsigned long align, u64 *max_extent_size) 1649 { 1650 struct btrfs_free_space *entry; 1651 struct rb_node *node; 1652 u64 tmp; 1653 u64 align_off; 1654 int ret; 1655 1656 if (!ctl->free_space_offset.rb_node) 1657 goto out; 1658 1659 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); 1660 if (!entry) 1661 goto out; 1662 1663 for (node = &entry->offset_index; node; node = rb_next(node)) { 1664 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1665 if (entry->bytes < *bytes) { 1666 if (entry->bytes > *max_extent_size) 1667 *max_extent_size = entry->bytes; 1668 continue; 1669 } 1670 1671 /* make sure the space returned is big enough 1672 * to match our requested alignment 1673 */ 1674 if (*bytes >= align) { 1675 tmp = entry->offset - ctl->start + align - 1; 1676 do_div(tmp, align); 1677 tmp = tmp * align + ctl->start; 1678 align_off = tmp - entry->offset; 1679 } else { 1680 align_off = 0; 1681 tmp = entry->offset; 1682 } 1683 1684 if (entry->bytes < *bytes + align_off) { 1685 if (entry->bytes > *max_extent_size) 1686 *max_extent_size = entry->bytes; 1687 continue; 1688 } 1689 1690 if (entry->bitmap) { 1691 u64 size = *bytes; 1692 1693 ret = search_bitmap(ctl, entry, &tmp, &size); 1694 if (!ret) { 1695 *offset = tmp; 1696 *bytes = size; 1697 return entry; 1698 } else if (size > *max_extent_size) { 1699 *max_extent_size = size; 1700 } 1701 continue; 1702 } 1703 1704 *offset = tmp; 1705 *bytes = entry->bytes - align_off; 1706 return entry; 1707 } 1708 out: 1709 return NULL; 1710 } 1711 1712 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 1713 struct btrfs_free_space *info, u64 offset) 1714 { 1715 info->offset = offset_to_bitmap(ctl, offset); 1716 info->bytes = 0; 1717 INIT_LIST_HEAD(&info->list); 1718 link_free_space(ctl, info); 1719 ctl->total_bitmaps++; 1720 1721 ctl->op->recalc_thresholds(ctl); 1722 } 1723 1724 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 1725 struct btrfs_free_space *bitmap_info) 1726 { 1727 unlink_free_space(ctl, bitmap_info); 1728 kfree(bitmap_info->bitmap); 1729 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 1730 ctl->total_bitmaps--; 1731 ctl->op->recalc_thresholds(ctl); 1732 } 1733 1734 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 1735 struct btrfs_free_space 
*bitmap_info, 1736 u64 *offset, u64 *bytes) 1737 { 1738 u64 end; 1739 u64 search_start, search_bytes; 1740 int ret; 1741 1742 again: 1743 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 1744 1745 /* 1746 * We need to search for bits in this bitmap. We could only cover some 1747 * of the extent in this bitmap thanks to how we add space, so we need 1748 * to search for as much as it as we can and clear that amount, and then 1749 * go searching for the next bit. 1750 */ 1751 search_start = *offset; 1752 search_bytes = ctl->unit; 1753 search_bytes = min(search_bytes, end - search_start + 1); 1754 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); 1755 if (ret < 0 || search_start != *offset) 1756 return -EINVAL; 1757 1758 /* We may have found more bits than what we need */ 1759 search_bytes = min(search_bytes, *bytes); 1760 1761 /* Cannot clear past the end of the bitmap */ 1762 search_bytes = min(search_bytes, end - search_start + 1); 1763 1764 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes); 1765 *offset += search_bytes; 1766 *bytes -= search_bytes; 1767 1768 if (*bytes) { 1769 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1770 if (!bitmap_info->bytes) 1771 free_bitmap(ctl, bitmap_info); 1772 1773 /* 1774 * no entry after this bitmap, but we still have bytes to 1775 * remove, so something has gone wrong. 1776 */ 1777 if (!next) 1778 return -EINVAL; 1779 1780 bitmap_info = rb_entry(next, struct btrfs_free_space, 1781 offset_index); 1782 1783 /* 1784 * if the next entry isn't a bitmap we need to return to let the 1785 * extent stuff do its work. 1786 */ 1787 if (!bitmap_info->bitmap) 1788 return -EAGAIN; 1789 1790 /* 1791 * Ok the next item is a bitmap, but it may not actually hold 1792 * the information for the rest of this free space stuff, so 1793 * look for it, and if we don't find it return so we can try 1794 * everything over again. 1795 */ 1796 search_start = *offset; 1797 search_bytes = ctl->unit; 1798 ret = search_bitmap(ctl, bitmap_info, &search_start, 1799 &search_bytes); 1800 if (ret < 0 || search_start != *offset) 1801 return -EAGAIN; 1802 1803 goto again; 1804 } else if (!bitmap_info->bytes) 1805 free_bitmap(ctl, bitmap_info); 1806 1807 return 0; 1808 } 1809 1810 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 1811 struct btrfs_free_space *info, u64 offset, 1812 u64 bytes) 1813 { 1814 u64 bytes_to_set = 0; 1815 u64 end; 1816 1817 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 1818 1819 bytes_to_set = min(end - offset, bytes); 1820 1821 bitmap_set_bits(ctl, info, offset, bytes_to_set); 1822 1823 return bytes_to_set; 1824 1825 } 1826 1827 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 1828 struct btrfs_free_space *info) 1829 { 1830 struct btrfs_block_group_cache *block_group = ctl->private; 1831 1832 /* 1833 * If we are below the extents threshold then we can add this as an 1834 * extent, and don't have to deal with the bitmap 1835 */ 1836 if (ctl->free_extents < ctl->extents_thresh) { 1837 /* 1838 * If this block group has some small extents we don't want to 1839 * use up all of our free slots in the cache with them, we want 1840 * to reserve them to larger extents, however if we have plent 1841 * of cache left then go ahead an dadd them, no sense in adding 1842 * the overhead of a bitmap if we don't have to. 
1843 */ 1844 if (info->bytes <= block_group->sectorsize * 4) { 1845 if (ctl->free_extents * 2 <= ctl->extents_thresh) 1846 return false; 1847 } else { 1848 return false; 1849 } 1850 } 1851 1852 /* 1853 * The original block groups from mkfs can be really small, like 8 1854 * megabytes, so don't bother with a bitmap for those entries. However 1855 * some block groups can be smaller than what a bitmap would cover but 1856 * are still large enough that they could overflow the 32k memory limit, 1857 * so allow those block groups to still be allowed to have a bitmap 1858 * entry. 1859 */ 1860 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) 1861 return false; 1862 1863 return true; 1864 } 1865 1866 static struct btrfs_free_space_op free_space_op = { 1867 .recalc_thresholds = recalculate_thresholds, 1868 .use_bitmap = use_bitmap, 1869 }; 1870 1871 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 1872 struct btrfs_free_space *info) 1873 { 1874 struct btrfs_free_space *bitmap_info; 1875 struct btrfs_block_group_cache *block_group = NULL; 1876 int added = 0; 1877 u64 bytes, offset, bytes_added; 1878 int ret; 1879 1880 bytes = info->bytes; 1881 offset = info->offset; 1882 1883 if (!ctl->op->use_bitmap(ctl, info)) 1884 return 0; 1885 1886 if (ctl->op == &free_space_op) 1887 block_group = ctl->private; 1888 again: 1889 /* 1890 * Since we link bitmaps right into the cluster we need to see if we 1891 * have a cluster here, and if so and it has our bitmap we need to add 1892 * the free space to that bitmap. 1893 */ 1894 if (block_group && !list_empty(&block_group->cluster_list)) { 1895 struct btrfs_free_cluster *cluster; 1896 struct rb_node *node; 1897 struct btrfs_free_space *entry; 1898 1899 cluster = list_entry(block_group->cluster_list.next, 1900 struct btrfs_free_cluster, 1901 block_group_list); 1902 spin_lock(&cluster->lock); 1903 node = rb_first(&cluster->root); 1904 if (!node) { 1905 spin_unlock(&cluster->lock); 1906 goto no_cluster_bitmap; 1907 } 1908 1909 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1910 if (!entry->bitmap) { 1911 spin_unlock(&cluster->lock); 1912 goto no_cluster_bitmap; 1913 } 1914 1915 if (entry->offset == offset_to_bitmap(ctl, offset)) { 1916 bytes_added = add_bytes_to_bitmap(ctl, entry, 1917 offset, bytes); 1918 bytes -= bytes_added; 1919 offset += bytes_added; 1920 } 1921 spin_unlock(&cluster->lock); 1922 if (!bytes) { 1923 ret = 1; 1924 goto out; 1925 } 1926 } 1927 1928 no_cluster_bitmap: 1929 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1930 1, 0); 1931 if (!bitmap_info) { 1932 ASSERT(added == 0); 1933 goto new_bitmap; 1934 } 1935 1936 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 1937 bytes -= bytes_added; 1938 offset += bytes_added; 1939 added = 0; 1940 1941 if (!bytes) { 1942 ret = 1; 1943 goto out; 1944 } else 1945 goto again; 1946 1947 new_bitmap: 1948 if (info && info->bitmap) { 1949 add_new_bitmap(ctl, info, offset); 1950 added = 1; 1951 info = NULL; 1952 goto again; 1953 } else { 1954 spin_unlock(&ctl->tree_lock); 1955 1956 /* no pre-allocated info, allocate a new one */ 1957 if (!info) { 1958 info = kmem_cache_zalloc(btrfs_free_space_cachep, 1959 GFP_NOFS); 1960 if (!info) { 1961 spin_lock(&ctl->tree_lock); 1962 ret = -ENOMEM; 1963 goto out; 1964 } 1965 } 1966 1967 /* allocate the bitmap */ 1968 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1969 spin_lock(&ctl->tree_lock); 1970 if (!info->bitmap) { 1971 ret = -ENOMEM; 1972 goto out; 1973 } 1974 goto again; 1975 
} 1976 1977 out: 1978 if (info) { 1979 if (info->bitmap) 1980 kfree(info->bitmap); 1981 kmem_cache_free(btrfs_free_space_cachep, info); 1982 } 1983 1984 return ret; 1985 } 1986 1987 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 1988 struct btrfs_free_space *info, bool update_stat) 1989 { 1990 struct btrfs_free_space *left_info; 1991 struct btrfs_free_space *right_info; 1992 bool merged = false; 1993 u64 offset = info->offset; 1994 u64 bytes = info->bytes; 1995 1996 /* 1997 * first we want to see if there is free space adjacent to the range we 1998 * are adding, if there is remove that struct and add a new one to 1999 * cover the entire range 2000 */ 2001 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2002 if (right_info && rb_prev(&right_info->offset_index)) 2003 left_info = rb_entry(rb_prev(&right_info->offset_index), 2004 struct btrfs_free_space, offset_index); 2005 else 2006 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2007 2008 if (right_info && !right_info->bitmap) { 2009 if (update_stat) 2010 unlink_free_space(ctl, right_info); 2011 else 2012 __unlink_free_space(ctl, right_info); 2013 info->bytes += right_info->bytes; 2014 kmem_cache_free(btrfs_free_space_cachep, right_info); 2015 merged = true; 2016 } 2017 2018 if (left_info && !left_info->bitmap && 2019 left_info->offset + left_info->bytes == offset) { 2020 if (update_stat) 2021 unlink_free_space(ctl, left_info); 2022 else 2023 __unlink_free_space(ctl, left_info); 2024 info->offset = left_info->offset; 2025 info->bytes += left_info->bytes; 2026 kmem_cache_free(btrfs_free_space_cachep, left_info); 2027 merged = true; 2028 } 2029 2030 return merged; 2031 } 2032 2033 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2034 struct btrfs_free_space *info, 2035 bool update_stat) 2036 { 2037 struct btrfs_free_space *bitmap; 2038 unsigned long i; 2039 unsigned long j; 2040 const u64 end = info->offset + info->bytes; 2041 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2042 u64 bytes; 2043 2044 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2045 if (!bitmap) 2046 return false; 2047 2048 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2049 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2050 if (j == i) 2051 return false; 2052 bytes = (j - i) * ctl->unit; 2053 info->bytes += bytes; 2054 2055 if (update_stat) 2056 bitmap_clear_bits(ctl, bitmap, end, bytes); 2057 else 2058 __bitmap_clear_bits(ctl, bitmap, end, bytes); 2059 2060 if (!bitmap->bytes) 2061 free_bitmap(ctl, bitmap); 2062 2063 return true; 2064 } 2065 2066 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2067 struct btrfs_free_space *info, 2068 bool update_stat) 2069 { 2070 struct btrfs_free_space *bitmap; 2071 u64 bitmap_offset; 2072 unsigned long i; 2073 unsigned long j; 2074 unsigned long prev_j; 2075 u64 bytes; 2076 2077 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2078 /* If we're on a boundary, try the previous logical bitmap. 
*/ 2079 if (bitmap_offset == info->offset) { 2080 if (info->offset == 0) 2081 return false; 2082 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2083 } 2084 2085 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2086 if (!bitmap) 2087 return false; 2088 2089 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2090 j = 0; 2091 prev_j = (unsigned long)-1; 2092 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2093 if (j > i) 2094 break; 2095 prev_j = j; 2096 } 2097 if (prev_j == i) 2098 return false; 2099 2100 if (prev_j == (unsigned long)-1) 2101 bytes = (i + 1) * ctl->unit; 2102 else 2103 bytes = (i - prev_j) * ctl->unit; 2104 2105 info->offset -= bytes; 2106 info->bytes += bytes; 2107 2108 if (update_stat) 2109 bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2110 else 2111 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2112 2113 if (!bitmap->bytes) 2114 free_bitmap(ctl, bitmap); 2115 2116 return true; 2117 } 2118 2119 /* 2120 * We prefer always to allocate from extent entries, both for clustered and 2121 * non-clustered allocation requests. So when attempting to add a new extent 2122 * entry, try to see if there's adjacent free space in bitmap entries, and if 2123 * there is, migrate that space from the bitmaps to the extent. 2124 * Like this we get better chances of satisfying space allocation requests 2125 * because we attempt to satisfy them based on a single cache entry, and never 2126 * on 2 or more entries - even if the entries represent a contiguous free space 2127 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2128 * ends). 2129 */ 2130 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2131 struct btrfs_free_space *info, 2132 bool update_stat) 2133 { 2134 /* 2135 * Only work with disconnected entries, as we can change their offset, 2136 * and must be extent entries. 2137 */ 2138 ASSERT(!info->bitmap); 2139 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2140 2141 if (ctl->total_bitmaps > 0) { 2142 bool stole_end; 2143 bool stole_front = false; 2144 2145 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2146 if (ctl->total_bitmaps > 0) 2147 stole_front = steal_from_bitmap_to_front(ctl, info, 2148 update_stat); 2149 2150 if (stole_end || stole_front) 2151 try_merge_free_space(ctl, info, update_stat); 2152 } 2153 } 2154 2155 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2156 u64 offset, u64 bytes) 2157 { 2158 struct btrfs_free_space *info; 2159 int ret = 0; 2160 2161 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2162 if (!info) 2163 return -ENOMEM; 2164 2165 info->offset = offset; 2166 info->bytes = bytes; 2167 RB_CLEAR_NODE(&info->offset_index); 2168 2169 spin_lock(&ctl->tree_lock); 2170 2171 if (try_merge_free_space(ctl, info, true)) 2172 goto link; 2173 2174 /* 2175 * There was no extent directly to the left or right of this new 2176 * extent then we know we're going to have to allocate a new extent, so 2177 * before we do that see if we need to drop this into a bitmap 2178 */ 2179 ret = insert_into_bitmap(ctl, info); 2180 if (ret < 0) { 2181 goto out; 2182 } else if (ret) { 2183 ret = 0; 2184 goto out; 2185 } 2186 link: 2187 /* 2188 * Only steal free space from adjacent bitmaps if we're sure we're not 2189 * going to add the new free space to existing bitmap entries - because 2190 * that would mean unnecessary work that would be reverted. Therefore 2191 * attempt to steal space from bitmaps if we're adding an extent entry. 
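	 *
	 * Illustrative example with made-up numbers: suppose a 64K extent
	 * entry is being linked at offset X and the bitmap to its left has
	 * 192K of contiguous free bits ending exactly at X.  Stealing clears
	 * those bits and grows this entry to cover [X - 192K, X + 64K), so a
	 * later 256K allocation can be served from a single extent entry
	 * instead of having to stitch together a bitmap and an extent entry.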
2192 */ 2193 steal_from_bitmap(ctl, info, true); 2194 2195 ret = link_free_space(ctl, info); 2196 if (ret) 2197 kmem_cache_free(btrfs_free_space_cachep, info); 2198 out: 2199 spin_unlock(&ctl->tree_lock); 2200 2201 if (ret) { 2202 printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); 2203 ASSERT(ret != -EEXIST); 2204 } 2205 2206 return ret; 2207 } 2208 2209 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 2210 u64 offset, u64 bytes) 2211 { 2212 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2213 struct btrfs_free_space *info; 2214 int ret; 2215 bool re_search = false; 2216 2217 spin_lock(&ctl->tree_lock); 2218 2219 again: 2220 ret = 0; 2221 if (!bytes) 2222 goto out_lock; 2223 2224 info = tree_search_offset(ctl, offset, 0, 0); 2225 if (!info) { 2226 /* 2227 * oops didn't find an extent that matched the space we wanted 2228 * to remove, look for a bitmap instead 2229 */ 2230 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2231 1, 0); 2232 if (!info) { 2233 /* 2234 * If we found a partial bit of our free space in a 2235 * bitmap but then couldn't find the other part this may 2236 * be a problem, so WARN about it. 2237 */ 2238 WARN_ON(re_search); 2239 goto out_lock; 2240 } 2241 } 2242 2243 re_search = false; 2244 if (!info->bitmap) { 2245 unlink_free_space(ctl, info); 2246 if (offset == info->offset) { 2247 u64 to_free = min(bytes, info->bytes); 2248 2249 info->bytes -= to_free; 2250 info->offset += to_free; 2251 if (info->bytes) { 2252 ret = link_free_space(ctl, info); 2253 WARN_ON(ret); 2254 } else { 2255 kmem_cache_free(btrfs_free_space_cachep, info); 2256 } 2257 2258 offset += to_free; 2259 bytes -= to_free; 2260 goto again; 2261 } else { 2262 u64 old_end = info->bytes + info->offset; 2263 2264 info->bytes = offset - info->offset; 2265 ret = link_free_space(ctl, info); 2266 WARN_ON(ret); 2267 if (ret) 2268 goto out_lock; 2269 2270 /* Not enough bytes in this entry to satisfy us */ 2271 if (old_end < offset + bytes) { 2272 bytes -= old_end - offset; 2273 offset = old_end; 2274 goto again; 2275 } else if (old_end == offset + bytes) { 2276 /* all done */ 2277 goto out_lock; 2278 } 2279 spin_unlock(&ctl->tree_lock); 2280 2281 ret = btrfs_add_free_space(block_group, offset + bytes, 2282 old_end - (offset + bytes)); 2283 WARN_ON(ret); 2284 goto out; 2285 } 2286 } 2287 2288 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2289 if (ret == -EAGAIN) { 2290 re_search = true; 2291 goto again; 2292 } 2293 out_lock: 2294 spin_unlock(&ctl->tree_lock); 2295 out: 2296 return ret; 2297 } 2298 2299 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 2300 u64 bytes) 2301 { 2302 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2303 struct btrfs_free_space *info; 2304 struct rb_node *n; 2305 int count = 0; 2306 2307 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2308 info = rb_entry(n, struct btrfs_free_space, offset_index); 2309 if (info->bytes >= bytes && !block_group->ro) 2310 count++; 2311 btrfs_crit(block_group->fs_info, 2312 "entry offset %llu, bytes %llu, bitmap %s", 2313 info->offset, info->bytes, 2314 (info->bitmap) ? "yes" : "no"); 2315 } 2316 btrfs_info(block_group->fs_info, "block group has cluster?: %s", 2317 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2318 btrfs_info(block_group->fs_info, 2319 "%d blocks of free space at or bigger than bytes is", count); 2320 } 2321 2322 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) 2323 { 2324 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2325 2326 spin_lock_init(&ctl->tree_lock); 2327 ctl->unit = block_group->sectorsize; 2328 ctl->start = block_group->key.objectid; 2329 ctl->private = block_group; 2330 ctl->op = &free_space_op; 2331 INIT_LIST_HEAD(&ctl->trimming_ranges); 2332 mutex_init(&ctl->cache_writeout_mutex); 2333 2334 /* 2335 * we only want to have 32k of ram per block group for keeping 2336 * track of free space, and if we pass 1/2 of that we want to 2337 * start converting things over to using bitmaps 2338 */ 2339 ctl->extents_thresh = ((1024 * 32) / 2) / 2340 sizeof(struct btrfs_free_space); 2341 } 2342 2343 /* 2344 * for a given cluster, put all of its extents back into the free 2345 * space cache. If the block group passed doesn't match the block group 2346 * pointed to by the cluster, someone else raced in and freed the 2347 * cluster already. In that case, we just return without changing anything 2348 */ 2349 static int 2350 __btrfs_return_cluster_to_free_space( 2351 struct btrfs_block_group_cache *block_group, 2352 struct btrfs_free_cluster *cluster) 2353 { 2354 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2355 struct btrfs_free_space *entry; 2356 struct rb_node *node; 2357 2358 spin_lock(&cluster->lock); 2359 if (cluster->block_group != block_group) 2360 goto out; 2361 2362 cluster->block_group = NULL; 2363 cluster->window_start = 0; 2364 list_del_init(&cluster->block_group_list); 2365 2366 node = rb_first(&cluster->root); 2367 while (node) { 2368 bool bitmap; 2369 2370 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2371 node = rb_next(&entry->offset_index); 2372 rb_erase(&entry->offset_index, &cluster->root); 2373 RB_CLEAR_NODE(&entry->offset_index); 2374 2375 bitmap = (entry->bitmap != NULL); 2376 if (!bitmap) { 2377 try_merge_free_space(ctl, entry, false); 2378 steal_from_bitmap(ctl, entry, false); 2379 } 2380 tree_insert_offset(&ctl->free_space_offset, 2381 entry->offset, &entry->offset_index, bitmap); 2382 } 2383 cluster->root = RB_ROOT; 2384 2385 out: 2386 spin_unlock(&cluster->lock); 2387 btrfs_put_block_group(block_group); 2388 return 0; 2389 } 2390 2391 static void __btrfs_remove_free_space_cache_locked( 2392 struct btrfs_free_space_ctl *ctl) 2393 { 2394 struct btrfs_free_space *info; 2395 struct rb_node *node; 2396 2397 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 2398 info = rb_entry(node, struct btrfs_free_space, offset_index); 2399 if (!info->bitmap) { 2400 unlink_free_space(ctl, info); 2401 kmem_cache_free(btrfs_free_space_cachep, info); 2402 } else { 2403 free_bitmap(ctl, info); 2404 } 2405 if (need_resched()) { 2406 spin_unlock(&ctl->tree_lock); 2407 cond_resched(); 2408 spin_lock(&ctl->tree_lock); 2409 } 2410 } 2411 } 2412 2413 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 2414 { 2415 spin_lock(&ctl->tree_lock); 2416 __btrfs_remove_free_space_cache_locked(ctl); 2417 spin_unlock(&ctl->tree_lock); 2418 } 2419 2420 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 2421 { 2422 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2423 struct btrfs_free_cluster *cluster; 2424 struct list_head *head; 2425 2426 spin_lock(&ctl->tree_lock); 2427 while ((head = block_group->cluster_list.next) != 2428 
&block_group->cluster_list) { 2429 cluster = list_entry(head, struct btrfs_free_cluster, 2430 block_group_list); 2431 2432 WARN_ON(cluster->block_group != block_group); 2433 __btrfs_return_cluster_to_free_space(block_group, cluster); 2434 if (need_resched()) { 2435 spin_unlock(&ctl->tree_lock); 2436 cond_resched(); 2437 spin_lock(&ctl->tree_lock); 2438 } 2439 } 2440 __btrfs_remove_free_space_cache_locked(ctl); 2441 spin_unlock(&ctl->tree_lock); 2442 2443 } 2444 2445 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 2446 u64 offset, u64 bytes, u64 empty_size, 2447 u64 *max_extent_size) 2448 { 2449 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2450 struct btrfs_free_space *entry = NULL; 2451 u64 bytes_search = bytes + empty_size; 2452 u64 ret = 0; 2453 u64 align_gap = 0; 2454 u64 align_gap_len = 0; 2455 2456 spin_lock(&ctl->tree_lock); 2457 entry = find_free_space(ctl, &offset, &bytes_search, 2458 block_group->full_stripe_len, max_extent_size); 2459 if (!entry) 2460 goto out; 2461 2462 ret = offset; 2463 if (entry->bitmap) { 2464 bitmap_clear_bits(ctl, entry, offset, bytes); 2465 if (!entry->bytes) 2466 free_bitmap(ctl, entry); 2467 } else { 2468 unlink_free_space(ctl, entry); 2469 align_gap_len = offset - entry->offset; 2470 align_gap = entry->offset; 2471 2472 entry->offset = offset + bytes; 2473 WARN_ON(entry->bytes < bytes + align_gap_len); 2474 2475 entry->bytes -= bytes + align_gap_len; 2476 if (!entry->bytes) 2477 kmem_cache_free(btrfs_free_space_cachep, entry); 2478 else 2479 link_free_space(ctl, entry); 2480 } 2481 out: 2482 spin_unlock(&ctl->tree_lock); 2483 2484 if (align_gap_len) 2485 __btrfs_add_free_space(ctl, align_gap, align_gap_len); 2486 return ret; 2487 } 2488 2489 /* 2490 * given a cluster, put all of its extents back into the free space 2491 * cache. If a block group is passed, this function will only free 2492 * a cluster that belongs to the passed block group. 2493 * 2494 * Otherwise, it'll get a reference on the block group pointed to by the 2495 * cluster and remove the cluster from it. 
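 *
 * A minimal usage sketch (illustrative, not a call site in this file):
 *
 *	// drop whatever block group the cluster currently points at
 *	btrfs_return_cluster_to_free_space(NULL, cluster);
 *
 *	// or: only drop it if the cluster still belongs to this block group
 *	btrfs_return_cluster_to_free_space(block_group, cluster);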
2496 */ 2497 int btrfs_return_cluster_to_free_space( 2498 struct btrfs_block_group_cache *block_group, 2499 struct btrfs_free_cluster *cluster) 2500 { 2501 struct btrfs_free_space_ctl *ctl; 2502 int ret; 2503 2504 /* first, get a safe pointer to the block group */ 2505 spin_lock(&cluster->lock); 2506 if (!block_group) { 2507 block_group = cluster->block_group; 2508 if (!block_group) { 2509 spin_unlock(&cluster->lock); 2510 return 0; 2511 } 2512 } else if (cluster->block_group != block_group) { 2513 /* someone else has already freed it don't redo their work */ 2514 spin_unlock(&cluster->lock); 2515 return 0; 2516 } 2517 atomic_inc(&block_group->count); 2518 spin_unlock(&cluster->lock); 2519 2520 ctl = block_group->free_space_ctl; 2521 2522 /* now return any extents the cluster had on it */ 2523 spin_lock(&ctl->tree_lock); 2524 ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 2525 spin_unlock(&ctl->tree_lock); 2526 2527 /* finally drop our ref */ 2528 btrfs_put_block_group(block_group); 2529 return ret; 2530 } 2531 2532 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 2533 struct btrfs_free_cluster *cluster, 2534 struct btrfs_free_space *entry, 2535 u64 bytes, u64 min_start, 2536 u64 *max_extent_size) 2537 { 2538 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2539 int err; 2540 u64 search_start = cluster->window_start; 2541 u64 search_bytes = bytes; 2542 u64 ret = 0; 2543 2544 search_start = min_start; 2545 search_bytes = bytes; 2546 2547 err = search_bitmap(ctl, entry, &search_start, &search_bytes); 2548 if (err) { 2549 if (search_bytes > *max_extent_size) 2550 *max_extent_size = search_bytes; 2551 return 0; 2552 } 2553 2554 ret = search_start; 2555 __bitmap_clear_bits(ctl, entry, ret, bytes); 2556 2557 return ret; 2558 } 2559 2560 /* 2561 * given a cluster, try to allocate 'bytes' from it, returns 0 2562 * if it couldn't find anything suitably large, or a logical disk offset 2563 * if things worked out 2564 */ 2565 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, 2566 struct btrfs_free_cluster *cluster, u64 bytes, 2567 u64 min_start, u64 *max_extent_size) 2568 { 2569 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2570 struct btrfs_free_space *entry = NULL; 2571 struct rb_node *node; 2572 u64 ret = 0; 2573 2574 spin_lock(&cluster->lock); 2575 if (bytes > cluster->max_size) 2576 goto out; 2577 2578 if (cluster->block_group != block_group) 2579 goto out; 2580 2581 node = rb_first(&cluster->root); 2582 if (!node) 2583 goto out; 2584 2585 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2586 while (1) { 2587 if (entry->bytes < bytes && entry->bytes > *max_extent_size) 2588 *max_extent_size = entry->bytes; 2589 2590 if (entry->bytes < bytes || 2591 (!entry->bitmap && entry->offset < min_start)) { 2592 node = rb_next(&entry->offset_index); 2593 if (!node) 2594 break; 2595 entry = rb_entry(node, struct btrfs_free_space, 2596 offset_index); 2597 continue; 2598 } 2599 2600 if (entry->bitmap) { 2601 ret = btrfs_alloc_from_bitmap(block_group, 2602 cluster, entry, bytes, 2603 cluster->window_start, 2604 max_extent_size); 2605 if (ret == 0) { 2606 node = rb_next(&entry->offset_index); 2607 if (!node) 2608 break; 2609 entry = rb_entry(node, struct btrfs_free_space, 2610 offset_index); 2611 continue; 2612 } 2613 cluster->window_start += bytes; 2614 } else { 2615 ret = entry->offset; 2616 2617 entry->offset += bytes; 2618 entry->bytes -= bytes; 2619 } 2620 2621 if (entry->bytes == 0) 
2622 rb_erase(&entry->offset_index, &cluster->root); 2623 break; 2624 } 2625 out: 2626 spin_unlock(&cluster->lock); 2627 2628 if (!ret) 2629 return 0; 2630 2631 spin_lock(&ctl->tree_lock); 2632 2633 ctl->free_space -= bytes; 2634 if (entry->bytes == 0) { 2635 ctl->free_extents--; 2636 if (entry->bitmap) { 2637 kfree(entry->bitmap); 2638 ctl->total_bitmaps--; 2639 ctl->op->recalc_thresholds(ctl); 2640 } 2641 kmem_cache_free(btrfs_free_space_cachep, entry); 2642 } 2643 2644 spin_unlock(&ctl->tree_lock); 2645 2646 return ret; 2647 } 2648 2649 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2650 struct btrfs_free_space *entry, 2651 struct btrfs_free_cluster *cluster, 2652 u64 offset, u64 bytes, 2653 u64 cont1_bytes, u64 min_bytes) 2654 { 2655 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2656 unsigned long next_zero; 2657 unsigned long i; 2658 unsigned long want_bits; 2659 unsigned long min_bits; 2660 unsigned long found_bits; 2661 unsigned long start = 0; 2662 unsigned long total_found = 0; 2663 int ret; 2664 2665 i = offset_to_bit(entry->offset, ctl->unit, 2666 max_t(u64, offset, entry->offset)); 2667 want_bits = bytes_to_bits(bytes, ctl->unit); 2668 min_bits = bytes_to_bits(min_bytes, ctl->unit); 2669 2670 again: 2671 found_bits = 0; 2672 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 2673 next_zero = find_next_zero_bit(entry->bitmap, 2674 BITS_PER_BITMAP, i); 2675 if (next_zero - i >= min_bits) { 2676 found_bits = next_zero - i; 2677 break; 2678 } 2679 i = next_zero; 2680 } 2681 2682 if (!found_bits) 2683 return -ENOSPC; 2684 2685 if (!total_found) { 2686 start = i; 2687 cluster->max_size = 0; 2688 } 2689 2690 total_found += found_bits; 2691 2692 if (cluster->max_size < found_bits * ctl->unit) 2693 cluster->max_size = found_bits * ctl->unit; 2694 2695 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2696 i = next_zero + 1; 2697 goto again; 2698 } 2699 2700 cluster->window_start = start * ctl->unit + entry->offset; 2701 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2702 ret = tree_insert_offset(&cluster->root, entry->offset, 2703 &entry->offset_index, 1); 2704 ASSERT(!ret); /* -EEXIST; Logic error */ 2705 2706 trace_btrfs_setup_cluster(block_group, cluster, 2707 total_found * ctl->unit, 1); 2708 return 0; 2709 } 2710 2711 /* 2712 * This searches the block group for just extents to fill the cluster with. 2713 * Try to find a cluster with at least bytes total bytes, at least one 2714 * extent of cont1_bytes, and other clusters of at least min_bytes. 2715 */ 2716 static noinline int 2717 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2718 struct btrfs_free_cluster *cluster, 2719 struct list_head *bitmaps, u64 offset, u64 bytes, 2720 u64 cont1_bytes, u64 min_bytes) 2721 { 2722 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2723 struct btrfs_free_space *first = NULL; 2724 struct btrfs_free_space *entry = NULL; 2725 struct btrfs_free_space *last; 2726 struct rb_node *node; 2727 u64 window_free; 2728 u64 max_extent; 2729 u64 total_size = 0; 2730 2731 entry = tree_search_offset(ctl, offset, 0, 1); 2732 if (!entry) 2733 return -ENOSPC; 2734 2735 /* 2736 * We don't want bitmaps, so just move along until we find a normal 2737 * extent entry. 
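	 *
	 * Illustrative walk with made-up entries: if the tree holds, in
	 * offset order, a bitmap entry A, a 4K extent B and a 1M extent C,
	 * and min_bytes is 64K, then A is queued on the caller's 'bitmaps'
	 * list for setup_cluster_bitmap() to try later, B is skipped as too
	 * small, and the cluster window starts at C.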
2738 */ 2739 while (entry->bitmap || entry->bytes < min_bytes) { 2740 if (entry->bitmap && list_empty(&entry->list)) 2741 list_add_tail(&entry->list, bitmaps); 2742 node = rb_next(&entry->offset_index); 2743 if (!node) 2744 return -ENOSPC; 2745 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2746 } 2747 2748 window_free = entry->bytes; 2749 max_extent = entry->bytes; 2750 first = entry; 2751 last = entry; 2752 2753 for (node = rb_next(&entry->offset_index); node; 2754 node = rb_next(&entry->offset_index)) { 2755 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2756 2757 if (entry->bitmap) { 2758 if (list_empty(&entry->list)) 2759 list_add_tail(&entry->list, bitmaps); 2760 continue; 2761 } 2762 2763 if (entry->bytes < min_bytes) 2764 continue; 2765 2766 last = entry; 2767 window_free += entry->bytes; 2768 if (entry->bytes > max_extent) 2769 max_extent = entry->bytes; 2770 } 2771 2772 if (window_free < bytes || max_extent < cont1_bytes) 2773 return -ENOSPC; 2774 2775 cluster->window_start = first->offset; 2776 2777 node = &first->offset_index; 2778 2779 /* 2780 * now we've found our entries, pull them out of the free space 2781 * cache and put them into the cluster rbtree 2782 */ 2783 do { 2784 int ret; 2785 2786 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2787 node = rb_next(&entry->offset_index); 2788 if (entry->bitmap || entry->bytes < min_bytes) 2789 continue; 2790 2791 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2792 ret = tree_insert_offset(&cluster->root, entry->offset, 2793 &entry->offset_index, 0); 2794 total_size += entry->bytes; 2795 ASSERT(!ret); /* -EEXIST; Logic error */ 2796 } while (node && entry != last); 2797 2798 cluster->max_size = max_extent; 2799 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 2800 return 0; 2801 } 2802 2803 /* 2804 * This specifically looks for bitmaps that may work in the cluster, we assume 2805 * that we have already failed to find extents that will work. 2806 */ 2807 static noinline int 2808 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2809 struct btrfs_free_cluster *cluster, 2810 struct list_head *bitmaps, u64 offset, u64 bytes, 2811 u64 cont1_bytes, u64 min_bytes) 2812 { 2813 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2814 struct btrfs_free_space *entry; 2815 int ret = -ENOSPC; 2816 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 2817 2818 if (ctl->total_bitmaps == 0) 2819 return -ENOSPC; 2820 2821 /* 2822 * The bitmap that covers offset won't be in the list unless offset 2823 * is just its start offset. 2824 */ 2825 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 2826 if (entry->offset != bitmap_offset) { 2827 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 2828 if (entry && list_empty(&entry->list)) 2829 list_add(&entry->list, bitmaps); 2830 } 2831 2832 list_for_each_entry(entry, bitmaps, list) { 2833 if (entry->bytes < bytes) 2834 continue; 2835 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2836 bytes, cont1_bytes, min_bytes); 2837 if (!ret) 2838 return 0; 2839 } 2840 2841 /* 2842 * The bitmaps list has all the bitmaps that record free space 2843 * starting after offset, so no more search is required. 2844 */ 2845 return -ENOSPC; 2846 } 2847 2848 /* 2849 * here we try to find a cluster of blocks in a block group. The goal 2850 * is to find at least bytes+empty_size. 2851 * We might not find them all in one contiguous area. 
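 *
 * Illustrative numbers: for a data block group without ssd_spread, a
 * request of bytes = 1M with empty_size = 3M looks for a window with at
 * least 4M of free space that contains at least one extent of
 * cont1_bytes = max(1M, 4M >> 2) = 1M, built from pieces no smaller than
 * min_bytes = sectorsize.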
2852 * 2853 * returns zero and sets up cluster if things worked out, otherwise 2854 * it returns -enospc 2855 */ 2856 int btrfs_find_space_cluster(struct btrfs_root *root, 2857 struct btrfs_block_group_cache *block_group, 2858 struct btrfs_free_cluster *cluster, 2859 u64 offset, u64 bytes, u64 empty_size) 2860 { 2861 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2862 struct btrfs_free_space *entry, *tmp; 2863 LIST_HEAD(bitmaps); 2864 u64 min_bytes; 2865 u64 cont1_bytes; 2866 int ret; 2867 2868 /* 2869 * Choose the minimum extent size we'll require for this 2870 * cluster. For SSD_SPREAD, don't allow any fragmentation. 2871 * For metadata, allow allocates with smaller extents. For 2872 * data, keep it dense. 2873 */ 2874 if (btrfs_test_opt(root, SSD_SPREAD)) { 2875 cont1_bytes = min_bytes = bytes + empty_size; 2876 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2877 cont1_bytes = bytes; 2878 min_bytes = block_group->sectorsize; 2879 } else { 2880 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 2881 min_bytes = block_group->sectorsize; 2882 } 2883 2884 spin_lock(&ctl->tree_lock); 2885 2886 /* 2887 * If we know we don't have enough space to make a cluster don't even 2888 * bother doing all the work to try and find one. 2889 */ 2890 if (ctl->free_space < bytes) { 2891 spin_unlock(&ctl->tree_lock); 2892 return -ENOSPC; 2893 } 2894 2895 spin_lock(&cluster->lock); 2896 2897 /* someone already found a cluster, hooray */ 2898 if (cluster->block_group) { 2899 ret = 0; 2900 goto out; 2901 } 2902 2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 2904 min_bytes); 2905 2906 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2907 bytes + empty_size, 2908 cont1_bytes, min_bytes); 2909 if (ret) 2910 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2911 offset, bytes + empty_size, 2912 cont1_bytes, min_bytes); 2913 2914 /* Clear our temporary list */ 2915 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2916 list_del_init(&entry->list); 2917 2918 if (!ret) { 2919 atomic_inc(&block_group->count); 2920 list_add_tail(&cluster->block_group_list, 2921 &block_group->cluster_list); 2922 cluster->block_group = block_group; 2923 } else { 2924 trace_btrfs_failed_cluster_setup(block_group); 2925 } 2926 out: 2927 spin_unlock(&cluster->lock); 2928 spin_unlock(&ctl->tree_lock); 2929 2930 return ret; 2931 } 2932 2933 /* 2934 * simple code to zero out a cluster 2935 */ 2936 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 2937 { 2938 spin_lock_init(&cluster->lock); 2939 spin_lock_init(&cluster->refill_lock); 2940 cluster->root = RB_ROOT; 2941 cluster->max_size = 0; 2942 INIT_LIST_HEAD(&cluster->block_group_list); 2943 cluster->block_group = NULL; 2944 } 2945 2946 static int do_trimming(struct btrfs_block_group_cache *block_group, 2947 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 reserved_start, u64 reserved_bytes, 2949 struct btrfs_trim_range *trim_entry) 2950 { 2951 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2954 int ret; 2955 int update = 0; 2956 u64 trimmed = 0; 2957 2958 spin_lock(&space_info->lock); 2959 spin_lock(&block_group->lock); 2960 if (!block_group->ro) { 2961 block_group->reserved += reserved_bytes; 2962 space_info->bytes_reserved += reserved_bytes; 2963 update = 1; 2964 } 2965 spin_unlock(&block_group->lock); 2966 spin_unlock(&space_info->lock); 2967 
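	/*
	 * Illustrative example with made-up numbers, as seen from the
	 * trim_no_bitmap() caller below: if a 192K free extent starts at
	 * offset 0 and the caller was asked to trim from 64K onwards, the
	 * whole 192K is unlinked and passed here as reserved_start and
	 * reserved_bytes, while only [64K, 192K) is discarded as start and
	 * bytes; the full 192K is handed back to the free space cache once
	 * the discard has finished.
	 */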
2968 ret = btrfs_discard_extent(fs_info->extent_root, 2969 start, bytes, &trimmed); 2970 if (!ret) 2971 *total_trimmed += trimmed; 2972 2973 mutex_lock(&ctl->cache_writeout_mutex); 2974 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 list_del(&trim_entry->list); 2976 mutex_unlock(&ctl->cache_writeout_mutex); 2977 2978 if (update) { 2979 spin_lock(&space_info->lock); 2980 spin_lock(&block_group->lock); 2981 if (block_group->ro) 2982 space_info->bytes_readonly += reserved_bytes; 2983 block_group->reserved -= reserved_bytes; 2984 space_info->bytes_reserved -= reserved_bytes; 2985 spin_unlock(&space_info->lock); 2986 spin_unlock(&block_group->lock); 2987 } 2988 2989 return ret; 2990 } 2991 2992 static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, 2993 u64 *total_trimmed, u64 start, u64 end, u64 minlen) 2994 { 2995 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2996 struct btrfs_free_space *entry; 2997 struct rb_node *node; 2998 int ret = 0; 2999 u64 extent_start; 3000 u64 extent_bytes; 3001 u64 bytes; 3002 3003 while (start < end) { 3004 struct btrfs_trim_range trim_entry; 3005 3006 mutex_lock(&ctl->cache_writeout_mutex); 3007 spin_lock(&ctl->tree_lock); 3008 3009 if (ctl->free_space < minlen) { 3010 spin_unlock(&ctl->tree_lock); 3011 mutex_unlock(&ctl->cache_writeout_mutex); 3012 break; 3013 } 3014 3015 entry = tree_search_offset(ctl, start, 0, 1); 3016 if (!entry) { 3017 spin_unlock(&ctl->tree_lock); 3018 mutex_unlock(&ctl->cache_writeout_mutex); 3019 break; 3020 } 3021 3022 /* skip bitmaps */ 3023 while (entry->bitmap) { 3024 node = rb_next(&entry->offset_index); 3025 if (!node) { 3026 spin_unlock(&ctl->tree_lock); 3027 mutex_unlock(&ctl->cache_writeout_mutex); 3028 goto out; 3029 } 3030 entry = rb_entry(node, struct btrfs_free_space, 3031 offset_index); 3032 } 3033 3034 if (entry->offset >= end) { 3035 spin_unlock(&ctl->tree_lock); 3036 mutex_unlock(&ctl->cache_writeout_mutex); 3037 break; 3038 } 3039 3040 extent_start = entry->offset; 3041 extent_bytes = entry->bytes; 3042 start = max(start, extent_start); 3043 bytes = min(extent_start + extent_bytes, end) - start; 3044 if (bytes < minlen) { 3045 spin_unlock(&ctl->tree_lock); 3046 mutex_unlock(&ctl->cache_writeout_mutex); 3047 goto next; 3048 } 3049 3050 unlink_free_space(ctl, entry); 3051 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 3053 spin_unlock(&ctl->tree_lock); 3054 trim_entry.start = extent_start; 3055 trim_entry.bytes = extent_bytes; 3056 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3057 mutex_unlock(&ctl->cache_writeout_mutex); 3058 3059 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 extent_start, extent_bytes, &trim_entry); 3061 if (ret) 3062 break; 3063 next: 3064 start += bytes; 3065 3066 if (fatal_signal_pending(current)) { 3067 ret = -ERESTARTSYS; 3068 break; 3069 } 3070 3071 cond_resched(); 3072 } 3073 out: 3074 return ret; 3075 } 3076 3077 static int trim_bitmaps(struct btrfs_block_group_cache *block_group, 3078 u64 *total_trimmed, u64 start, u64 end, u64 minlen) 3079 { 3080 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3081 struct btrfs_free_space *entry; 3082 int ret = 0; 3083 int ret2; 3084 u64 bytes; 3085 u64 offset = offset_to_bitmap(ctl, start); 3086 3087 while (offset < end) { 3088 bool next_bitmap = false; 3089 struct btrfs_trim_range trim_entry; 3090 3091 mutex_lock(&ctl->cache_writeout_mutex); 3092 spin_lock(&ctl->tree_lock); 3093 3094 if (ctl->free_space < minlen) { 3095 
spin_unlock(&ctl->tree_lock); 3096 mutex_unlock(&ctl->cache_writeout_mutex); 3097 break; 3098 } 3099 3100 entry = tree_search_offset(ctl, offset, 1, 0); 3101 if (!entry) { 3102 spin_unlock(&ctl->tree_lock); 3103 mutex_unlock(&ctl->cache_writeout_mutex); 3104 next_bitmap = true; 3105 goto next; 3106 } 3107 3108 bytes = minlen; 3109 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 if (ret2 || start >= end) { 3111 spin_unlock(&ctl->tree_lock); 3112 mutex_unlock(&ctl->cache_writeout_mutex); 3113 next_bitmap = true; 3114 goto next; 3115 } 3116 3117 bytes = min(bytes, end - start); 3118 if (bytes < minlen) { 3119 spin_unlock(&ctl->tree_lock); 3120 mutex_unlock(&ctl->cache_writeout_mutex); 3121 goto next; 3122 } 3123 3124 bitmap_clear_bits(ctl, entry, start, bytes); 3125 if (entry->bytes == 0) 3126 free_bitmap(ctl, entry); 3127 3128 spin_unlock(&ctl->tree_lock); 3129 trim_entry.start = start; 3130 trim_entry.bytes = bytes; 3131 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3132 mutex_unlock(&ctl->cache_writeout_mutex); 3133 3134 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 start, bytes, &trim_entry); 3136 if (ret) 3137 break; 3138 next: 3139 if (next_bitmap) { 3140 offset += BITS_PER_BITMAP * ctl->unit; 3141 } else { 3142 start += bytes; 3143 if (start >= offset + BITS_PER_BITMAP * ctl->unit) 3144 offset += BITS_PER_BITMAP * ctl->unit; 3145 } 3146 3147 if (fatal_signal_pending(current)) { 3148 ret = -ERESTARTSYS; 3149 break; 3150 } 3151 3152 cond_resched(); 3153 } 3154 3155 return ret; 3156 } 3157 3158 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 3159 u64 *trimmed, u64 start, u64 end, u64 minlen) 3160 { 3161 int ret; 3162 3163 *trimmed = 0; 3164 3165 spin_lock(&block_group->lock); 3166 if (block_group->removed) { 3167 spin_unlock(&block_group->lock); 3168 return 0; 3169 } 3170 atomic_inc(&block_group->trimming); 3171 spin_unlock(&block_group->lock); 3172 3173 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3174 if (ret) 3175 goto out; 3176 3177 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3178 out: 3179 spin_lock(&block_group->lock); 3180 if (atomic_dec_and_test(&block_group->trimming) && 3181 block_group->removed) { 3182 struct extent_map_tree *em_tree; 3183 struct extent_map *em; 3184 3185 spin_unlock(&block_group->lock); 3186 3187 lock_chunks(block_group->fs_info->chunk_root); 3188 em_tree = &block_group->fs_info->mapping_tree.map_tree; 3189 write_lock(&em_tree->lock); 3190 em = lookup_extent_mapping(em_tree, block_group->key.objectid, 3191 1); 3192 BUG_ON(!em); /* logic error, can't happen */ 3193 /* 3194 * remove_extent_mapping() will delete us from the pinned_chunks 3195 * list, which is protected by the chunk mutex. 3196 */ 3197 remove_extent_mapping(em_tree, em); 3198 write_unlock(&em_tree->lock); 3199 unlock_chunks(block_group->fs_info->chunk_root); 3200 3201 /* once for us and once for the tree */ 3202 free_extent_map(em); 3203 free_extent_map(em); 3204 3205 /* 3206 * We've left one free space entry and other tasks trimming 3207 * this block group have left 1 entry each one. Free them. 3208 */ 3209 __btrfs_remove_free_space_cache(block_group->free_space_ctl); 3210 } else { 3211 spin_unlock(&block_group->lock); 3212 } 3213 3214 return ret; 3215 } 3216 3217 /* 3218 * Find the left-most item in the cache tree, and then return the 3219 * smallest inode number in the item. 
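 *
 * Illustrative example with made-up numbers: if the left-most entry is
 * the extent {offset = 256, bytes = 3}, inode 256 is returned and the
 * entry shrinks to {offset = 257, bytes = 2}; if the left-most entry is
 * a bitmap, the first set bit is returned instead.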
3220 * 3221 * Note: the returned inode number may not be the smallest one in 3222 * the tree, if the left-most item is a bitmap. 3223 */ 3224 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) 3225 { 3226 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; 3227 struct btrfs_free_space *entry = NULL; 3228 u64 ino = 0; 3229 3230 spin_lock(&ctl->tree_lock); 3231 3232 if (RB_EMPTY_ROOT(&ctl->free_space_offset)) 3233 goto out; 3234 3235 entry = rb_entry(rb_first(&ctl->free_space_offset), 3236 struct btrfs_free_space, offset_index); 3237 3238 if (!entry->bitmap) { 3239 ino = entry->offset; 3240 3241 unlink_free_space(ctl, entry); 3242 entry->offset++; 3243 entry->bytes--; 3244 if (!entry->bytes) 3245 kmem_cache_free(btrfs_free_space_cachep, entry); 3246 else 3247 link_free_space(ctl, entry); 3248 } else { 3249 u64 offset = 0; 3250 u64 count = 1; 3251 int ret; 3252 3253 ret = search_bitmap(ctl, entry, &offset, &count); 3254 /* Logic error; Should be empty if it can't find anything */ 3255 ASSERT(!ret); 3256 3257 ino = offset; 3258 bitmap_clear_bits(ctl, entry, offset, 1); 3259 if (entry->bytes == 0) 3260 free_bitmap(ctl, entry); 3261 } 3262 out: 3263 spin_unlock(&ctl->tree_lock); 3264 3265 return ino; 3266 } 3267 3268 struct inode *lookup_free_ino_inode(struct btrfs_root *root, 3269 struct btrfs_path *path) 3270 { 3271 struct inode *inode = NULL; 3272 3273 spin_lock(&root->ino_cache_lock); 3274 if (root->ino_cache_inode) 3275 inode = igrab(root->ino_cache_inode); 3276 spin_unlock(&root->ino_cache_lock); 3277 if (inode) 3278 return inode; 3279 3280 inode = __lookup_free_space_inode(root, path, 0); 3281 if (IS_ERR(inode)) 3282 return inode; 3283 3284 spin_lock(&root->ino_cache_lock); 3285 if (!btrfs_fs_closing(root->fs_info)) 3286 root->ino_cache_inode = igrab(inode); 3287 spin_unlock(&root->ino_cache_lock); 3288 3289 return inode; 3290 } 3291 3292 int create_free_ino_inode(struct btrfs_root *root, 3293 struct btrfs_trans_handle *trans, 3294 struct btrfs_path *path) 3295 { 3296 return __create_free_space_inode(root, trans, path, 3297 BTRFS_FREE_INO_OBJECTID, 0); 3298 } 3299 3300 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3301 { 3302 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3303 struct btrfs_path *path; 3304 struct inode *inode; 3305 int ret = 0; 3306 u64 root_gen = btrfs_root_generation(&root->root_item); 3307 3308 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3309 return 0; 3310 3311 /* 3312 * If we're unmounting then just return, since this does a search on the 3313 * normal root and not the commit root and we could deadlock. 
3314 */ 3315 if (btrfs_fs_closing(fs_info)) 3316 return 0; 3317 3318 path = btrfs_alloc_path(); 3319 if (!path) 3320 return 0; 3321 3322 inode = lookup_free_ino_inode(root, path); 3323 if (IS_ERR(inode)) 3324 goto out; 3325 3326 if (root_gen != BTRFS_I(inode)->generation) 3327 goto out_put; 3328 3329 ret = __load_free_space_cache(root, inode, ctl, path, 0); 3330 3331 if (ret < 0) 3332 btrfs_err(fs_info, 3333 "failed to load free ino cache for root %llu", 3334 root->root_key.objectid); 3335 out_put: 3336 iput(inode); 3337 out: 3338 btrfs_free_path(path); 3339 return ret; 3340 } 3341 3342 int btrfs_write_out_ino_cache(struct btrfs_root *root, 3343 struct btrfs_trans_handle *trans, 3344 struct btrfs_path *path, 3345 struct inode *inode) 3346 { 3347 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3348 int ret; 3349 3350 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3351 return 0; 3352 3353 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 3354 if (ret) { 3355 btrfs_delalloc_release_metadata(inode, inode->i_size); 3356 #ifdef DEBUG 3357 btrfs_err(root->fs_info, 3358 "failed to write free ino cache for root %llu", 3359 root->root_key.objectid); 3360 #endif 3361 } 3362 3363 return ret; 3364 } 3365 3366 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3367 /* 3368 * Use this if you need to make a bitmap or extent entry specifically, it 3369 * doesn't do any of the merging that add_free_space does, this acts a lot like 3370 * how the free space cache loading stuff works, so you can get really weird 3371 * configurations. 3372 */ 3373 int test_add_free_space_entry(struct btrfs_block_group_cache *cache, 3374 u64 offset, u64 bytes, bool bitmap) 3375 { 3376 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3377 struct btrfs_free_space *info = NULL, *bitmap_info; 3378 void *map = NULL; 3379 u64 bytes_added; 3380 int ret; 3381 3382 again: 3383 if (!info) { 3384 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 3385 if (!info) 3386 return -ENOMEM; 3387 } 3388 3389 if (!bitmap) { 3390 spin_lock(&ctl->tree_lock); 3391 info->offset = offset; 3392 info->bytes = bytes; 3393 ret = link_free_space(ctl, info); 3394 spin_unlock(&ctl->tree_lock); 3395 if (ret) 3396 kmem_cache_free(btrfs_free_space_cachep, info); 3397 return ret; 3398 } 3399 3400 if (!map) { 3401 map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 3402 if (!map) { 3403 kmem_cache_free(btrfs_free_space_cachep, info); 3404 return -ENOMEM; 3405 } 3406 } 3407 3408 spin_lock(&ctl->tree_lock); 3409 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3410 1, 0); 3411 if (!bitmap_info) { 3412 info->bitmap = map; 3413 map = NULL; 3414 add_new_bitmap(ctl, info, offset); 3415 bitmap_info = info; 3416 info = NULL; 3417 } 3418 3419 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3420 bytes -= bytes_added; 3421 offset += bytes_added; 3422 spin_unlock(&ctl->tree_lock); 3423 3424 if (bytes) 3425 goto again; 3426 3427 if (info) 3428 kmem_cache_free(btrfs_free_space_cachep, info); 3429 if (map) 3430 kfree(map); 3431 return 0; 3432 } 3433 3434 /* 3435 * Checks to see if the given range is in the free space cache. This is really 3436 * just used to check the absence of space, so if there is free space in the 3437 * range at all we will return 1. 
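 *
 * A minimal usage sketch from a hypothetical sanity test (cache setup and
 * error handling omitted):
 *
 *	// one 4M extent entry at 0, plus a bitmap entry with 1M free at 4M
 *	test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, false);
 *	test_add_free_space_entry(cache, 4 * 1024 * 1024,
 *				  1 * 1024 * 1024, true);
 *	if (!test_check_exists(cache, 0, 4 * 1024 * 1024))
 *		return -EINVAL;	// expected the range to be reported free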
3438 */ 3439 int test_check_exists(struct btrfs_block_group_cache *cache, 3440 u64 offset, u64 bytes) 3441 { 3442 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3443 struct btrfs_free_space *info; 3444 int ret = 0; 3445 3446 spin_lock(&ctl->tree_lock); 3447 info = tree_search_offset(ctl, offset, 0, 0); 3448 if (!info) { 3449 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3450 1, 0); 3451 if (!info) 3452 goto out; 3453 } 3454 3455 have_info: 3456 if (info->bitmap) { 3457 u64 bit_off, bit_bytes; 3458 struct rb_node *n; 3459 struct btrfs_free_space *tmp; 3460 3461 bit_off = offset; 3462 bit_bytes = ctl->unit; 3463 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); 3464 if (!ret) { 3465 if (bit_off == offset) { 3466 ret = 1; 3467 goto out; 3468 } else if (bit_off > offset && 3469 offset + bytes > bit_off) { 3470 ret = 1; 3471 goto out; 3472 } 3473 } 3474 3475 n = rb_prev(&info->offset_index); 3476 while (n) { 3477 tmp = rb_entry(n, struct btrfs_free_space, 3478 offset_index); 3479 if (tmp->offset + tmp->bytes < offset) 3480 break; 3481 if (offset + bytes < tmp->offset) { 3482 n = rb_prev(&tmp->offset_index); 3483 continue; 3484 } 3485 info = tmp; 3486 goto have_info; 3487 } 3488 3489 n = rb_next(&info->offset_index); 3490 while (n) { 3491 tmp = rb_entry(n, struct btrfs_free_space, 3492 offset_index); 3493 if (offset + bytes < tmp->offset) 3494 break; 3495 if (tmp->offset + tmp->bytes < offset) { 3496 n = rb_next(&tmp->offset_index); 3497 continue; 3498 } 3499 info = tmp; 3500 goto have_info; 3501 } 3502 3503 ret = 0; 3504 goto out; 3505 } 3506 3507 if (info->offset == offset) { 3508 ret = 1; 3509 goto out; 3510 } 3511 3512 if (offset > info->offset && offset < info->offset + info->bytes) 3513 ret = 1; 3514 out: 3515 spin_unlock(&ctl->tree_lock); 3516 return ret; 3517 } 3518 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3519