// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes);

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
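	 *
	 * (Descriptive note: memalloc_nofs_save() below makes every allocation
	 * in this scope behave as if GFP_NOFS had been passed, and
	 * memalloc_nofs_restore() ends that scope, so btrfs_iget_path() cannot
	 * recurse back into the filesystem to reclaim memory.)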
84 */ 85 nofs_flag = memalloc_nofs_save(); 86 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 87 btrfs_release_path(path); 88 memalloc_nofs_restore(nofs_flag); 89 if (IS_ERR(inode)) 90 return inode; 91 92 mapping_set_gfp_mask(inode->i_mapping, 93 mapping_gfp_constraint(inode->i_mapping, 94 ~(__GFP_FS | __GFP_HIGHMEM))); 95 96 return inode; 97 } 98 99 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 100 struct btrfs_path *path) 101 { 102 struct btrfs_fs_info *fs_info = block_group->fs_info; 103 struct inode *inode = NULL; 104 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 105 106 spin_lock(&block_group->lock); 107 if (block_group->inode) 108 inode = igrab(block_group->inode); 109 spin_unlock(&block_group->lock); 110 if (inode) 111 return inode; 112 113 inode = __lookup_free_space_inode(fs_info->tree_root, path, 114 block_group->start); 115 if (IS_ERR(inode)) 116 return inode; 117 118 spin_lock(&block_group->lock); 119 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 120 btrfs_info(fs_info, "Old style space inode found, converting."); 121 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 122 BTRFS_INODE_NODATACOW; 123 block_group->disk_cache_state = BTRFS_DC_CLEAR; 124 } 125 126 if (!block_group->iref) { 127 block_group->inode = igrab(inode); 128 block_group->iref = 1; 129 } 130 spin_unlock(&block_group->lock); 131 132 return inode; 133 } 134 135 static int __create_free_space_inode(struct btrfs_root *root, 136 struct btrfs_trans_handle *trans, 137 struct btrfs_path *path, 138 u64 ino, u64 offset) 139 { 140 struct btrfs_key key; 141 struct btrfs_disk_key disk_key; 142 struct btrfs_free_space_header *header; 143 struct btrfs_inode_item *inode_item; 144 struct extent_buffer *leaf; 145 /* We inline CRCs for the free disk space cache */ 146 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 147 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 148 int ret; 149 150 ret = btrfs_insert_empty_inode(trans, root, path, ino); 151 if (ret) 152 return ret; 153 154 leaf = path->nodes[0]; 155 inode_item = btrfs_item_ptr(leaf, path->slots[0], 156 struct btrfs_inode_item); 157 btrfs_item_key(leaf, &disk_key, path->slots[0]); 158 memzero_extent_buffer(leaf, (unsigned long)inode_item, 159 sizeof(*inode_item)); 160 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 161 btrfs_set_inode_size(leaf, inode_item, 0); 162 btrfs_set_inode_nbytes(leaf, inode_item, 0); 163 btrfs_set_inode_uid(leaf, inode_item, 0); 164 btrfs_set_inode_gid(leaf, inode_item, 0); 165 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 166 btrfs_set_inode_flags(leaf, inode_item, flags); 167 btrfs_set_inode_nlink(leaf, inode_item, 1); 168 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 169 btrfs_set_inode_block_group(leaf, inode_item, offset); 170 btrfs_mark_buffer_dirty(leaf); 171 btrfs_release_path(path); 172 173 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 174 key.offset = offset; 175 key.type = 0; 176 ret = btrfs_insert_empty_item(trans, root, path, &key, 177 sizeof(struct btrfs_free_space_header)); 178 if (ret < 0) { 179 btrfs_release_path(path); 180 return ret; 181 } 182 183 leaf = path->nodes[0]; 184 header = btrfs_item_ptr(leaf, path->slots[0], 185 struct btrfs_free_space_header); 186 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 187 btrfs_set_free_space_key(leaf, header, &disk_key); 188 btrfs_mark_buffer_dirty(leaf); 189 btrfs_release_path(path); 190 191 return 0; 192 } 193 194 int create_free_space_inode(struct 
btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(inode);
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (block_group->iref) {
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(inode);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(BTRFS_I(inode), 0);
	truncate_pagecache(inode, 0);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
327 */ 328 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 329 0, BTRFS_EXTENT_DATA_KEY); 330 if (ret) 331 goto fail; 332 333 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 334 335 fail: 336 if (locked) 337 mutex_unlock(&trans->transaction->cache_write_mutex); 338 if (ret) 339 btrfs_abort_transaction(trans, ret); 340 341 return ret; 342 } 343 344 static void readahead_cache(struct inode *inode) 345 { 346 struct file_ra_state *ra; 347 unsigned long last_index; 348 349 ra = kzalloc(sizeof(*ra), GFP_NOFS); 350 if (!ra) 351 return; 352 353 file_ra_state_init(ra, inode->i_mapping); 354 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 355 356 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 357 358 kfree(ra); 359 } 360 361 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 362 int write) 363 { 364 int num_pages; 365 366 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 367 368 /* Make sure we can fit our crcs and generation into the first page */ 369 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 370 return -ENOSPC; 371 372 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 373 374 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 375 if (!io_ctl->pages) 376 return -ENOMEM; 377 378 io_ctl->num_pages = num_pages; 379 io_ctl->fs_info = btrfs_sb(inode->i_sb); 380 io_ctl->inode = inode; 381 382 return 0; 383 } 384 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 385 386 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 387 { 388 kfree(io_ctl->pages); 389 io_ctl->pages = NULL; 390 } 391 392 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 393 { 394 if (io_ctl->cur) { 395 io_ctl->cur = NULL; 396 io_ctl->orig = NULL; 397 } 398 } 399 400 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 401 { 402 ASSERT(io_ctl->index < io_ctl->num_pages); 403 io_ctl->page = io_ctl->pages[io_ctl->index++]; 404 io_ctl->cur = page_address(io_ctl->page); 405 io_ctl->orig = io_ctl->cur; 406 io_ctl->size = PAGE_SIZE; 407 if (clear) 408 clear_page(io_ctl->cur); 409 } 410 411 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 412 { 413 int i; 414 415 io_ctl_unmap_page(io_ctl); 416 417 for (i = 0; i < io_ctl->num_pages; i++) { 418 if (io_ctl->pages[i]) { 419 ClearPageChecked(io_ctl->pages[i]); 420 unlock_page(io_ctl->pages[i]); 421 put_page(io_ctl->pages[i]); 422 } 423 } 424 } 425 426 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 427 { 428 struct page *page; 429 struct inode *inode = io_ctl->inode; 430 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 431 int i; 432 433 for (i = 0; i < io_ctl->num_pages; i++) { 434 page = find_or_create_page(inode->i_mapping, i, mask); 435 if (!page) { 436 io_ctl_drop_pages(io_ctl); 437 return -ENOMEM; 438 } 439 io_ctl->pages[i] = page; 440 if (uptodate && !PageUptodate(page)) { 441 btrfs_readpage(NULL, page); 442 lock_page(page); 443 if (page->mapping != inode->i_mapping) { 444 btrfs_err(BTRFS_I(inode)->root->fs_info, 445 "free space cache page truncated"); 446 io_ctl_drop_pages(io_ctl); 447 return -EIO; 448 } 449 if (!PageUptodate(page)) { 450 btrfs_err(BTRFS_I(inode)->root->fs_info, 451 "error reading free space cache"); 452 io_ctl_drop_pages(io_ctl); 453 return -EIO; 454 } 455 } 456 } 457 458 for (i = 0; i < io_ctl->num_pages; i++) { 459 clear_page_dirty_for_io(io_ctl->pages[i]); 460 set_page_extent_mapped(io_ctl->pages[i]); 461 } 462 463 return 0; 464 } 465 466 static void io_ctl_set_generation(struct btrfs_io_ctl 
*io_ctl, u64 generation) 467 { 468 io_ctl_map_page(io_ctl, 1); 469 470 /* 471 * Skip the csum areas. If we don't check crcs then we just have a 472 * 64bit chunk at the front of the first page. 473 */ 474 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 475 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 476 477 put_unaligned_le64(generation, io_ctl->cur); 478 io_ctl->cur += sizeof(u64); 479 } 480 481 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 482 { 483 u64 cache_gen; 484 485 /* 486 * Skip the crc area. If we don't check crcs then we just have a 64bit 487 * chunk at the front of the first page. 488 */ 489 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 490 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 491 492 cache_gen = get_unaligned_le64(io_ctl->cur); 493 if (cache_gen != generation) { 494 btrfs_err_rl(io_ctl->fs_info, 495 "space cache generation (%llu) does not match inode (%llu)", 496 cache_gen, generation); 497 io_ctl_unmap_page(io_ctl); 498 return -EIO; 499 } 500 io_ctl->cur += sizeof(u64); 501 return 0; 502 } 503 504 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 505 { 506 u32 *tmp; 507 u32 crc = ~(u32)0; 508 unsigned offset = 0; 509 510 if (index == 0) 511 offset = sizeof(u32) * io_ctl->num_pages; 512 513 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 514 btrfs_crc32c_final(crc, (u8 *)&crc); 515 io_ctl_unmap_page(io_ctl); 516 tmp = page_address(io_ctl->pages[0]); 517 tmp += index; 518 *tmp = crc; 519 } 520 521 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 522 { 523 u32 *tmp, val; 524 u32 crc = ~(u32)0; 525 unsigned offset = 0; 526 527 if (index == 0) 528 offset = sizeof(u32) * io_ctl->num_pages; 529 530 tmp = page_address(io_ctl->pages[0]); 531 tmp += index; 532 val = *tmp; 533 534 io_ctl_map_page(io_ctl, 0); 535 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 536 btrfs_crc32c_final(crc, (u8 *)&crc); 537 if (val != crc) { 538 btrfs_err_rl(io_ctl->fs_info, 539 "csum mismatch on free space cache"); 540 io_ctl_unmap_page(io_ctl); 541 return -EIO; 542 } 543 544 return 0; 545 } 546 547 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 548 void *bitmap) 549 { 550 struct btrfs_free_space_entry *entry; 551 552 if (!io_ctl->cur) 553 return -ENOSPC; 554 555 entry = io_ctl->cur; 556 put_unaligned_le64(offset, &entry->offset); 557 put_unaligned_le64(bytes, &entry->bytes); 558 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 559 BTRFS_FREE_SPACE_EXTENT; 560 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 561 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 562 563 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 564 return 0; 565 566 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 567 568 /* No more pages to map */ 569 if (io_ctl->index >= io_ctl->num_pages) 570 return 0; 571 572 /* map the next page */ 573 io_ctl_map_page(io_ctl, 1); 574 return 0; 575 } 576 577 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 578 { 579 if (!io_ctl->cur) 580 return -ENOSPC; 581 582 /* 583 * If we aren't at the start of the current page, unmap this one and 584 * map the next one if there is any left. 
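	 *
	 * (Note: copy_page() below copies a full page, so each bitmap entry
	 * occupies exactly one page of the cache file and always starts on a
	 * page boundary; with 4KiB pages and a 4KiB bitmap unit, one bitmap
	 * describes 128MiB of block group space.)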
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * We want the extent entry threshold to be at most half of max_bytes,
	 * or whatever is left over after the bitmaps, whichever is smaller.
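	 *
	 * (Worked example, assuming a 4KiB ctl->unit: for a 1GiB block group
	 * with two bitmaps already allocated, max_bytes = 64KiB and
	 * bitmap_bytes = 8KiB, so extent_bytes = min(64KiB - 8KiB, 32KiB) =
	 * 32KiB and extents_thresh becomes 32KiB divided by
	 * sizeof(struct btrfs_free_space).)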
689 */ 690 extent_bytes = max_bytes - bitmap_bytes; 691 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 692 693 ctl->extents_thresh = 694 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 695 } 696 697 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 698 struct btrfs_free_space_ctl *ctl, 699 struct btrfs_path *path, u64 offset) 700 { 701 struct btrfs_fs_info *fs_info = root->fs_info; 702 struct btrfs_free_space_header *header; 703 struct extent_buffer *leaf; 704 struct btrfs_io_ctl io_ctl; 705 struct btrfs_key key; 706 struct btrfs_free_space *e, *n; 707 LIST_HEAD(bitmaps); 708 u64 num_entries; 709 u64 num_bitmaps; 710 u64 generation; 711 u8 type; 712 int ret = 0; 713 714 /* Nothing in the space cache, goodbye */ 715 if (!i_size_read(inode)) 716 return 0; 717 718 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 719 key.offset = offset; 720 key.type = 0; 721 722 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 723 if (ret < 0) 724 return 0; 725 else if (ret > 0) { 726 btrfs_release_path(path); 727 return 0; 728 } 729 730 ret = -1; 731 732 leaf = path->nodes[0]; 733 header = btrfs_item_ptr(leaf, path->slots[0], 734 struct btrfs_free_space_header); 735 num_entries = btrfs_free_space_entries(leaf, header); 736 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 737 generation = btrfs_free_space_generation(leaf, header); 738 btrfs_release_path(path); 739 740 if (!BTRFS_I(inode)->generation) { 741 btrfs_info(fs_info, 742 "the free space cache file (%llu) is invalid, skip it", 743 offset); 744 return 0; 745 } 746 747 if (BTRFS_I(inode)->generation != generation) { 748 btrfs_err(fs_info, 749 "free space inode generation (%llu) did not match free space cache generation (%llu)", 750 BTRFS_I(inode)->generation, generation); 751 return 0; 752 } 753 754 if (!num_entries) 755 return 0; 756 757 ret = io_ctl_init(&io_ctl, inode, 0); 758 if (ret) 759 return ret; 760 761 readahead_cache(inode); 762 763 ret = io_ctl_prepare_pages(&io_ctl, true); 764 if (ret) 765 goto out; 766 767 ret = io_ctl_check_crc(&io_ctl, 0); 768 if (ret) 769 goto free_cache; 770 771 ret = io_ctl_check_generation(&io_ctl, generation); 772 if (ret) 773 goto free_cache; 774 775 while (num_entries) { 776 e = kmem_cache_zalloc(btrfs_free_space_cachep, 777 GFP_NOFS); 778 if (!e) 779 goto free_cache; 780 781 ret = io_ctl_read_entry(&io_ctl, e, &type); 782 if (ret) { 783 kmem_cache_free(btrfs_free_space_cachep, e); 784 goto free_cache; 785 } 786 787 if (!e->bytes) { 788 kmem_cache_free(btrfs_free_space_cachep, e); 789 goto free_cache; 790 } 791 792 if (type == BTRFS_FREE_SPACE_EXTENT) { 793 spin_lock(&ctl->tree_lock); 794 ret = link_free_space(ctl, e); 795 spin_unlock(&ctl->tree_lock); 796 if (ret) { 797 btrfs_err(fs_info, 798 "Duplicate entries in free space cache, dumping"); 799 kmem_cache_free(btrfs_free_space_cachep, e); 800 goto free_cache; 801 } 802 } else { 803 ASSERT(num_bitmaps); 804 num_bitmaps--; 805 e->bitmap = kmem_cache_zalloc( 806 btrfs_free_space_bitmap_cachep, GFP_NOFS); 807 if (!e->bitmap) { 808 kmem_cache_free( 809 btrfs_free_space_cachep, e); 810 goto free_cache; 811 } 812 spin_lock(&ctl->tree_lock); 813 ret = link_free_space(ctl, e); 814 ctl->total_bitmaps++; 815 recalculate_thresholds(ctl); 816 spin_unlock(&ctl->tree_lock); 817 if (ret) { 818 btrfs_err(fs_info, 819 "Duplicate entries in free space cache, dumping"); 820 kmem_cache_free(btrfs_free_space_cachep, e); 821 goto free_cache; 822 } 823 list_add_tail(&e->list, &bitmaps); 824 } 825 826 num_entries--; 827 } 828 829 
io_ctl_unmap_page(&io_ctl); 830 831 /* 832 * We add the bitmaps at the end of the entries in order that 833 * the bitmap entries are added to the cache. 834 */ 835 list_for_each_entry_safe(e, n, &bitmaps, list) { 836 list_del_init(&e->list); 837 ret = io_ctl_read_bitmap(&io_ctl, e); 838 if (ret) 839 goto free_cache; 840 } 841 842 io_ctl_drop_pages(&io_ctl); 843 ret = 1; 844 out: 845 io_ctl_free(&io_ctl); 846 return ret; 847 free_cache: 848 io_ctl_drop_pages(&io_ctl); 849 __btrfs_remove_free_space_cache(ctl); 850 goto out; 851 } 852 853 static int copy_free_space_cache(struct btrfs_block_group *block_group, 854 struct btrfs_free_space_ctl *ctl) 855 { 856 struct btrfs_free_space *info; 857 struct rb_node *n; 858 int ret = 0; 859 860 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 861 info = rb_entry(n, struct btrfs_free_space, offset_index); 862 if (!info->bitmap) { 863 unlink_free_space(ctl, info); 864 ret = btrfs_add_free_space(block_group, info->offset, 865 info->bytes); 866 kmem_cache_free(btrfs_free_space_cachep, info); 867 } else { 868 u64 offset = info->offset; 869 u64 bytes = ctl->unit; 870 871 while (search_bitmap(ctl, info, &offset, &bytes, 872 false) == 0) { 873 ret = btrfs_add_free_space(block_group, offset, 874 bytes); 875 if (ret) 876 break; 877 bitmap_clear_bits(ctl, info, offset, bytes); 878 offset = info->offset; 879 bytes = ctl->unit; 880 } 881 free_bitmap(ctl, info); 882 } 883 cond_resched(); 884 } 885 return ret; 886 } 887 888 int load_free_space_cache(struct btrfs_block_group *block_group) 889 { 890 struct btrfs_fs_info *fs_info = block_group->fs_info; 891 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 892 struct btrfs_free_space_ctl tmp_ctl = {}; 893 struct inode *inode; 894 struct btrfs_path *path; 895 int ret = 0; 896 bool matched; 897 u64 used = block_group->used; 898 899 /* 900 * Because we could potentially discard our loaded free space, we want 901 * to load everything into a temporary structure first, and then if it's 902 * valid copy it all into the actual free space ctl. 903 */ 904 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 905 906 /* 907 * If this block group has been marked to be cleared for one reason or 908 * another then we can't trust the on disk cache, so just return. 909 */ 910 spin_lock(&block_group->lock); 911 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 912 spin_unlock(&block_group->lock); 913 return 0; 914 } 915 spin_unlock(&block_group->lock); 916 917 path = btrfs_alloc_path(); 918 if (!path) 919 return 0; 920 path->search_commit_root = 1; 921 path->skip_locking = 1; 922 923 /* 924 * We must pass a path with search_commit_root set to btrfs_iget in 925 * order to avoid a deadlock when allocating extents for the tree root. 926 * 927 * When we are COWing an extent buffer from the tree root, when looking 928 * for a free extent, at extent-tree.c:find_free_extent(), we can find 929 * block group without its free space cache loaded. When we find one 930 * we must load its space cache which requires reading its free space 931 * cache's inode item from the root tree. If this inode item is located 932 * in the same leaf that we started COWing before, then we end up in 933 * deadlock on the extent buffer (trying to read lock it when we 934 * previously write locked it). 935 * 936 * It's safe to read the inode item using the commit root because 937 * block groups, once loaded, stay in memory forever (until they are 938 * removed) as well as their space caches once loaded. 
New block groups 939 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 940 * we will never try to read their inode item while the fs is mounted. 941 */ 942 inode = lookup_free_space_inode(block_group, path); 943 if (IS_ERR(inode)) { 944 btrfs_free_path(path); 945 return 0; 946 } 947 948 /* We may have converted the inode and made the cache invalid. */ 949 spin_lock(&block_group->lock); 950 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 951 spin_unlock(&block_group->lock); 952 btrfs_free_path(path); 953 goto out; 954 } 955 spin_unlock(&block_group->lock); 956 957 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 958 path, block_group->start); 959 btrfs_free_path(path); 960 if (ret <= 0) 961 goto out; 962 963 matched = (tmp_ctl.free_space == (block_group->length - used - 964 block_group->bytes_super)); 965 966 if (matched) { 967 ret = copy_free_space_cache(block_group, &tmp_ctl); 968 /* 969 * ret == 1 means we successfully loaded the free space cache, 970 * so we need to re-set it here. 971 */ 972 if (ret == 0) 973 ret = 1; 974 } else { 975 __btrfs_remove_free_space_cache(&tmp_ctl); 976 btrfs_warn(fs_info, 977 "block group %llu has wrong amount of free space", 978 block_group->start); 979 ret = -1; 980 } 981 out: 982 if (ret < 0) { 983 /* This cache is bogus, make sure it gets cleared */ 984 spin_lock(&block_group->lock); 985 block_group->disk_cache_state = BTRFS_DC_CLEAR; 986 spin_unlock(&block_group->lock); 987 ret = 0; 988 989 btrfs_warn(fs_info, 990 "failed to load free space cache for block group %llu, rebuilding it now", 991 block_group->start); 992 } 993 994 spin_lock(&ctl->tree_lock); 995 btrfs_discard_update_discardable(block_group); 996 spin_unlock(&ctl->tree_lock); 997 iput(inode); 998 return ret; 999 } 1000 1001 static noinline_for_stack 1002 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1003 struct btrfs_free_space_ctl *ctl, 1004 struct btrfs_block_group *block_group, 1005 int *entries, int *bitmaps, 1006 struct list_head *bitmap_list) 1007 { 1008 int ret; 1009 struct btrfs_free_cluster *cluster = NULL; 1010 struct btrfs_free_cluster *cluster_locked = NULL; 1011 struct rb_node *node = rb_first(&ctl->free_space_offset); 1012 struct btrfs_trim_range *trim_entry; 1013 1014 /* Get the cluster for this block_group if it exists */ 1015 if (block_group && !list_empty(&block_group->cluster_list)) { 1016 cluster = list_entry(block_group->cluster_list.next, 1017 struct btrfs_free_cluster, 1018 block_group_list); 1019 } 1020 1021 if (!node && cluster) { 1022 cluster_locked = cluster; 1023 spin_lock(&cluster_locked->lock); 1024 node = rb_first(&cluster->root); 1025 cluster = NULL; 1026 } 1027 1028 /* Write out the extent entries */ 1029 while (node) { 1030 struct btrfs_free_space *e; 1031 1032 e = rb_entry(node, struct btrfs_free_space, offset_index); 1033 *entries += 1; 1034 1035 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1036 e->bitmap); 1037 if (ret) 1038 goto fail; 1039 1040 if (e->bitmap) { 1041 list_add_tail(&e->list, bitmap_list); 1042 *bitmaps += 1; 1043 } 1044 node = rb_next(node); 1045 if (!node && cluster) { 1046 node = rb_first(&cluster->root); 1047 cluster_locked = cluster; 1048 spin_lock(&cluster_locked->lock); 1049 cluster = NULL; 1050 } 1051 } 1052 if (cluster_locked) { 1053 spin_unlock(&cluster_locked->lock); 1054 cluster_locked = NULL; 1055 } 1056 1057 /* 1058 * Make sure we don't miss any range that was removed from our rbtree 1059 * because trimming is running. 
Otherwise after a umount+mount (or crash 1060 * after committing the transaction) we would leak free space and get 1061 * an inconsistent free space cache report from fsck. 1062 */ 1063 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1064 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1065 trim_entry->bytes, NULL); 1066 if (ret) 1067 goto fail; 1068 *entries += 1; 1069 } 1070 1071 return 0; 1072 fail: 1073 if (cluster_locked) 1074 spin_unlock(&cluster_locked->lock); 1075 return -ENOSPC; 1076 } 1077 1078 static noinline_for_stack int 1079 update_cache_item(struct btrfs_trans_handle *trans, 1080 struct btrfs_root *root, 1081 struct inode *inode, 1082 struct btrfs_path *path, u64 offset, 1083 int entries, int bitmaps) 1084 { 1085 struct btrfs_key key; 1086 struct btrfs_free_space_header *header; 1087 struct extent_buffer *leaf; 1088 int ret; 1089 1090 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1091 key.offset = offset; 1092 key.type = 0; 1093 1094 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1095 if (ret < 0) { 1096 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1097 EXTENT_DELALLOC, 0, 0, NULL); 1098 goto fail; 1099 } 1100 leaf = path->nodes[0]; 1101 if (ret > 0) { 1102 struct btrfs_key found_key; 1103 ASSERT(path->slots[0]); 1104 path->slots[0]--; 1105 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1106 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1107 found_key.offset != offset) { 1108 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1109 inode->i_size - 1, EXTENT_DELALLOC, 0, 1110 0, NULL); 1111 btrfs_release_path(path); 1112 goto fail; 1113 } 1114 } 1115 1116 BTRFS_I(inode)->generation = trans->transid; 1117 header = btrfs_item_ptr(leaf, path->slots[0], 1118 struct btrfs_free_space_header); 1119 btrfs_set_free_space_entries(leaf, header, entries); 1120 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1121 btrfs_set_free_space_generation(leaf, header, trans->transid); 1122 btrfs_mark_buffer_dirty(leaf); 1123 btrfs_release_path(path); 1124 1125 return 0; 1126 1127 fail: 1128 return -1; 1129 } 1130 1131 static noinline_for_stack int write_pinned_extent_entries( 1132 struct btrfs_trans_handle *trans, 1133 struct btrfs_block_group *block_group, 1134 struct btrfs_io_ctl *io_ctl, 1135 int *entries) 1136 { 1137 u64 start, extent_start, extent_end, len; 1138 struct extent_io_tree *unpin = NULL; 1139 int ret; 1140 1141 if (!block_group) 1142 return 0; 1143 1144 /* 1145 * We want to add any pinned extents to our free space cache 1146 * so we don't leak the space 1147 * 1148 * We shouldn't have switched the pinned extents yet so this is the 1149 * right one 1150 */ 1151 unpin = &trans->transaction->pinned_extents; 1152 1153 start = block_group->start; 1154 1155 while (start < block_group->start + block_group->length) { 1156 ret = find_first_extent_bit(unpin, start, 1157 &extent_start, &extent_end, 1158 EXTENT_DIRTY, NULL); 1159 if (ret) 1160 return 0; 1161 1162 /* This pinned extent is out of our range */ 1163 if (extent_start >= block_group->start + block_group->length) 1164 return 0; 1165 1166 extent_start = max(extent_start, start); 1167 extent_end = min(block_group->start + block_group->length, 1168 extent_end + 1); 1169 len = extent_end - extent_start; 1170 1171 *entries += 1; 1172 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1173 if (ret) 1174 return -ENOSPC; 1175 1176 start = extent_end; 1177 } 1178 1179 return 0; 1180 } 1181 1182 static noinline_for_stack int 1183 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, 
struct list_head *bitmap_list) 1184 { 1185 struct btrfs_free_space *entry, *next; 1186 int ret; 1187 1188 /* Write out the bitmaps */ 1189 list_for_each_entry_safe(entry, next, bitmap_list, list) { 1190 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1191 if (ret) 1192 return -ENOSPC; 1193 list_del_init(&entry->list); 1194 } 1195 1196 return 0; 1197 } 1198 1199 static int flush_dirty_cache(struct inode *inode) 1200 { 1201 int ret; 1202 1203 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1204 if (ret) 1205 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1206 EXTENT_DELALLOC, 0, 0, NULL); 1207 1208 return ret; 1209 } 1210 1211 static void noinline_for_stack 1212 cleanup_bitmap_list(struct list_head *bitmap_list) 1213 { 1214 struct btrfs_free_space *entry, *next; 1215 1216 list_for_each_entry_safe(entry, next, bitmap_list, list) 1217 list_del_init(&entry->list); 1218 } 1219 1220 static void noinline_for_stack 1221 cleanup_write_cache_enospc(struct inode *inode, 1222 struct btrfs_io_ctl *io_ctl, 1223 struct extent_state **cached_state) 1224 { 1225 io_ctl_drop_pages(io_ctl); 1226 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1227 i_size_read(inode) - 1, cached_state); 1228 } 1229 1230 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1231 struct btrfs_trans_handle *trans, 1232 struct btrfs_block_group *block_group, 1233 struct btrfs_io_ctl *io_ctl, 1234 struct btrfs_path *path, u64 offset) 1235 { 1236 int ret; 1237 struct inode *inode = io_ctl->inode; 1238 1239 if (!inode) 1240 return 0; 1241 1242 /* Flush the dirty pages in the cache file. */ 1243 ret = flush_dirty_cache(inode); 1244 if (ret) 1245 goto out; 1246 1247 /* Update the cache item to tell everyone this cache file is valid. */ 1248 ret = update_cache_item(trans, root, inode, path, offset, 1249 io_ctl->entries, io_ctl->bitmaps); 1250 out: 1251 if (ret) { 1252 invalidate_inode_pages2(inode->i_mapping); 1253 BTRFS_I(inode)->generation = 0; 1254 if (block_group) 1255 btrfs_debug(root->fs_info, 1256 "failed to write free space cache for block group %llu error %d", 1257 block_group->start, ret); 1258 } 1259 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1260 1261 if (block_group) { 1262 /* the dirty list is protected by the dirty_bgs_lock */ 1263 spin_lock(&trans->transaction->dirty_bgs_lock); 1264 1265 /* the disk_cache_state is protected by the block group lock */ 1266 spin_lock(&block_group->lock); 1267 1268 /* 1269 * only mark this as written if we didn't get put back on 1270 * the dirty list while waiting for IO. 
Otherwise our 1271 * cache state won't be right, and we won't get written again 1272 */ 1273 if (!ret && list_empty(&block_group->dirty_list)) 1274 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1275 else if (ret) 1276 block_group->disk_cache_state = BTRFS_DC_ERROR; 1277 1278 spin_unlock(&block_group->lock); 1279 spin_unlock(&trans->transaction->dirty_bgs_lock); 1280 io_ctl->inode = NULL; 1281 iput(inode); 1282 } 1283 1284 return ret; 1285 1286 } 1287 1288 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1289 struct btrfs_block_group *block_group, 1290 struct btrfs_path *path) 1291 { 1292 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1293 block_group, &block_group->io_ctl, 1294 path, block_group->start); 1295 } 1296 1297 /** 1298 * __btrfs_write_out_cache - write out cached info to an inode 1299 * @root - the root the inode belongs to 1300 * @ctl - the free space cache we are going to write out 1301 * @block_group - the block_group for this cache if it belongs to a block_group 1302 * @trans - the trans handle 1303 * 1304 * This function writes out a free space cache struct to disk for quick recovery 1305 * on mount. This will return 0 if it was successful in writing the cache out, 1306 * or an errno if it was not. 1307 */ 1308 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1309 struct btrfs_free_space_ctl *ctl, 1310 struct btrfs_block_group *block_group, 1311 struct btrfs_io_ctl *io_ctl, 1312 struct btrfs_trans_handle *trans) 1313 { 1314 struct extent_state *cached_state = NULL; 1315 LIST_HEAD(bitmap_list); 1316 int entries = 0; 1317 int bitmaps = 0; 1318 int ret; 1319 int must_iput = 0; 1320 1321 if (!i_size_read(inode)) 1322 return -EIO; 1323 1324 WARN_ON(io_ctl->pages); 1325 ret = io_ctl_init(io_ctl, inode, 1); 1326 if (ret) 1327 return ret; 1328 1329 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1330 down_write(&block_group->data_rwsem); 1331 spin_lock(&block_group->lock); 1332 if (block_group->delalloc_bytes) { 1333 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1334 spin_unlock(&block_group->lock); 1335 up_write(&block_group->data_rwsem); 1336 BTRFS_I(inode)->generation = 0; 1337 ret = 0; 1338 must_iput = 1; 1339 goto out; 1340 } 1341 spin_unlock(&block_group->lock); 1342 } 1343 1344 /* Lock all pages first so we can lock the extent safely. */ 1345 ret = io_ctl_prepare_pages(io_ctl, false); 1346 if (ret) 1347 goto out_unlock; 1348 1349 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1350 &cached_state); 1351 1352 io_ctl_set_generation(io_ctl, trans->transid); 1353 1354 mutex_lock(&ctl->cache_writeout_mutex); 1355 /* Write out the extent entries in the free space cache */ 1356 spin_lock(&ctl->tree_lock); 1357 ret = write_cache_extent_entries(io_ctl, ctl, 1358 block_group, &entries, &bitmaps, 1359 &bitmap_list); 1360 if (ret) 1361 goto out_nospc_locked; 1362 1363 /* 1364 * Some spaces that are freed in the current transaction are pinned, 1365 * they will be added into free space cache after the transaction is 1366 * committed, we shouldn't lose them. 1367 * 1368 * If this changes while we are working we'll get added back to 1369 * the dirty list and redo it. 
No locking needed 1370 */ 1371 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1372 if (ret) 1373 goto out_nospc_locked; 1374 1375 /* 1376 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1377 * locked while doing it because a concurrent trim can be manipulating 1378 * or freeing the bitmap. 1379 */ 1380 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1381 spin_unlock(&ctl->tree_lock); 1382 mutex_unlock(&ctl->cache_writeout_mutex); 1383 if (ret) 1384 goto out_nospc; 1385 1386 /* Zero out the rest of the pages just to make sure */ 1387 io_ctl_zero_remaining_pages(io_ctl); 1388 1389 /* Everything is written out, now we dirty the pages in the file. */ 1390 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1391 io_ctl->num_pages, 0, i_size_read(inode), 1392 &cached_state, false); 1393 if (ret) 1394 goto out_nospc; 1395 1396 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1397 up_write(&block_group->data_rwsem); 1398 /* 1399 * Release the pages and unlock the extent, we will flush 1400 * them out later 1401 */ 1402 io_ctl_drop_pages(io_ctl); 1403 io_ctl_free(io_ctl); 1404 1405 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1406 i_size_read(inode) - 1, &cached_state); 1407 1408 /* 1409 * at this point the pages are under IO and we're happy, 1410 * The caller is responsible for waiting on them and updating 1411 * the cache and the inode 1412 */ 1413 io_ctl->entries = entries; 1414 io_ctl->bitmaps = bitmaps; 1415 1416 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1417 if (ret) 1418 goto out; 1419 1420 return 0; 1421 1422 out_nospc_locked: 1423 cleanup_bitmap_list(&bitmap_list); 1424 spin_unlock(&ctl->tree_lock); 1425 mutex_unlock(&ctl->cache_writeout_mutex); 1426 1427 out_nospc: 1428 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1429 1430 out_unlock: 1431 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1432 up_write(&block_group->data_rwsem); 1433 1434 out: 1435 io_ctl->inode = NULL; 1436 io_ctl_free(io_ctl); 1437 if (ret) { 1438 invalidate_inode_pages2(inode->i_mapping); 1439 BTRFS_I(inode)->generation = 0; 1440 } 1441 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1442 if (must_iput) 1443 iput(inode); 1444 return ret; 1445 } 1446 1447 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1448 struct btrfs_block_group *block_group, 1449 struct btrfs_path *path) 1450 { 1451 struct btrfs_fs_info *fs_info = trans->fs_info; 1452 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1453 struct inode *inode; 1454 int ret = 0; 1455 1456 spin_lock(&block_group->lock); 1457 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1458 spin_unlock(&block_group->lock); 1459 return 0; 1460 } 1461 spin_unlock(&block_group->lock); 1462 1463 inode = lookup_free_space_inode(block_group, path); 1464 if (IS_ERR(inode)) 1465 return 0; 1466 1467 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1468 block_group, &block_group->io_ctl, trans); 1469 if (ret) { 1470 btrfs_debug(fs_info, 1471 "failed to write free space cache for block group %llu error %d", 1472 block_group->start, ret); 1473 spin_lock(&block_group->lock); 1474 block_group->disk_cache_state = BTRFS_DC_ERROR; 1475 spin_unlock(&block_group->lock); 1476 1477 block_group->io_ctl.inode = NULL; 1478 iput(inode); 1479 } 1480 1481 /* 1482 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1483 * to wait for IO and put the inode 1484 */ 1485 1486 return ret; 1487 } 1488 1489 static inline unsigned long 
offset_to_bit(u64 bitmap_start, u32 unit, 1490 u64 offset) 1491 { 1492 ASSERT(offset >= bitmap_start); 1493 offset -= bitmap_start; 1494 return (unsigned long)(div_u64(offset, unit)); 1495 } 1496 1497 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1498 { 1499 return (unsigned long)(div_u64(bytes, unit)); 1500 } 1501 1502 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1503 u64 offset) 1504 { 1505 u64 bitmap_start; 1506 u64 bytes_per_bitmap; 1507 1508 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1509 bitmap_start = offset - ctl->start; 1510 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1511 bitmap_start *= bytes_per_bitmap; 1512 bitmap_start += ctl->start; 1513 1514 return bitmap_start; 1515 } 1516 1517 static int tree_insert_offset(struct rb_root *root, u64 offset, 1518 struct rb_node *node, int bitmap) 1519 { 1520 struct rb_node **p = &root->rb_node; 1521 struct rb_node *parent = NULL; 1522 struct btrfs_free_space *info; 1523 1524 while (*p) { 1525 parent = *p; 1526 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1527 1528 if (offset < info->offset) { 1529 p = &(*p)->rb_left; 1530 } else if (offset > info->offset) { 1531 p = &(*p)->rb_right; 1532 } else { 1533 /* 1534 * we could have a bitmap entry and an extent entry 1535 * share the same offset. If this is the case, we want 1536 * the extent entry to always be found first if we do a 1537 * linear search through the tree, since we want to have 1538 * the quickest allocation time, and allocating from an 1539 * extent is faster than allocating from a bitmap. So 1540 * if we're inserting a bitmap and we find an entry at 1541 * this offset, we want to go right, or after this entry 1542 * logically. If we are inserting an extent and we've 1543 * found a bitmap, we want to go left, or before 1544 * logically. 1545 */ 1546 if (bitmap) { 1547 if (info->bitmap) { 1548 WARN_ON_ONCE(1); 1549 return -EEXIST; 1550 } 1551 p = &(*p)->rb_right; 1552 } else { 1553 if (!info->bitmap) { 1554 WARN_ON_ONCE(1); 1555 return -EEXIST; 1556 } 1557 p = &(*p)->rb_left; 1558 } 1559 } 1560 } 1561 1562 rb_link_node(node, parent, p); 1563 rb_insert_color(node, root); 1564 1565 return 0; 1566 } 1567 1568 /* 1569 * searches the tree for the given offset. 1570 * 1571 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1572 * want a section that has at least bytes size and comes at or after the given 1573 * offset. 1574 */ 1575 static struct btrfs_free_space * 1576 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1577 u64 offset, int bitmap_only, int fuzzy) 1578 { 1579 struct rb_node *n = ctl->free_space_offset.rb_node; 1580 struct btrfs_free_space *entry, *prev = NULL; 1581 1582 /* find entry that is closest to the 'offset' */ 1583 while (1) { 1584 if (!n) { 1585 entry = NULL; 1586 break; 1587 } 1588 1589 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1590 prev = entry; 1591 1592 if (offset < entry->offset) 1593 n = n->rb_left; 1594 else if (offset > entry->offset) 1595 n = n->rb_right; 1596 else 1597 break; 1598 } 1599 1600 if (bitmap_only) { 1601 if (!entry) 1602 return NULL; 1603 if (entry->bitmap) 1604 return entry; 1605 1606 /* 1607 * bitmap entry and extent entry may share same offset, 1608 * in that case, bitmap entry comes after extent entry. 
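		 *
		 * (For example, if an extent entry and a bitmap both start at
		 * the same offset, tree_insert_offset() placed the extent
		 * entry first, so the rb_next() below is what finds the
		 * bitmap when bitmap_only is set.)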
1609 */ 1610 n = rb_next(n); 1611 if (!n) 1612 return NULL; 1613 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1614 if (entry->offset != offset) 1615 return NULL; 1616 1617 WARN_ON(!entry->bitmap); 1618 return entry; 1619 } else if (entry) { 1620 if (entry->bitmap) { 1621 /* 1622 * if previous extent entry covers the offset, 1623 * we should return it instead of the bitmap entry 1624 */ 1625 n = rb_prev(&entry->offset_index); 1626 if (n) { 1627 prev = rb_entry(n, struct btrfs_free_space, 1628 offset_index); 1629 if (!prev->bitmap && 1630 prev->offset + prev->bytes > offset) 1631 entry = prev; 1632 } 1633 } 1634 return entry; 1635 } 1636 1637 if (!prev) 1638 return NULL; 1639 1640 /* find last entry before the 'offset' */ 1641 entry = prev; 1642 if (entry->offset > offset) { 1643 n = rb_prev(&entry->offset_index); 1644 if (n) { 1645 entry = rb_entry(n, struct btrfs_free_space, 1646 offset_index); 1647 ASSERT(entry->offset <= offset); 1648 } else { 1649 if (fuzzy) 1650 return entry; 1651 else 1652 return NULL; 1653 } 1654 } 1655 1656 if (entry->bitmap) { 1657 n = rb_prev(&entry->offset_index); 1658 if (n) { 1659 prev = rb_entry(n, struct btrfs_free_space, 1660 offset_index); 1661 if (!prev->bitmap && 1662 prev->offset + prev->bytes > offset) 1663 return prev; 1664 } 1665 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1666 return entry; 1667 } else if (entry->offset + entry->bytes > offset) 1668 return entry; 1669 1670 if (!fuzzy) 1671 return NULL; 1672 1673 while (1) { 1674 if (entry->bitmap) { 1675 if (entry->offset + BITS_PER_BITMAP * 1676 ctl->unit > offset) 1677 break; 1678 } else { 1679 if (entry->offset + entry->bytes > offset) 1680 break; 1681 } 1682 1683 n = rb_next(&entry->offset_index); 1684 if (!n) 1685 return NULL; 1686 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1687 } 1688 return entry; 1689 } 1690 1691 static inline void 1692 __unlink_free_space(struct btrfs_free_space_ctl *ctl, 1693 struct btrfs_free_space *info) 1694 { 1695 rb_erase(&info->offset_index, &ctl->free_space_offset); 1696 ctl->free_extents--; 1697 1698 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1699 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1700 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1701 } 1702 } 1703 1704 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1705 struct btrfs_free_space *info) 1706 { 1707 __unlink_free_space(ctl, info); 1708 ctl->free_space -= info->bytes; 1709 } 1710 1711 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1712 struct btrfs_free_space *info) 1713 { 1714 int ret = 0; 1715 1716 ASSERT(info->bytes || info->bitmap); 1717 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1718 &info->offset_index, (info->bitmap != NULL)); 1719 if (ret) 1720 return ret; 1721 1722 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1723 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1724 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1725 } 1726 1727 ctl->free_space += info->bytes; 1728 ctl->free_extents++; 1729 return ret; 1730 } 1731 1732 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1733 struct btrfs_free_space *info, 1734 u64 offset, u64 bytes) 1735 { 1736 unsigned long start, count, end; 1737 int extent_delta = -1; 1738 1739 start = offset_to_bit(info->offset, ctl->unit, offset); 1740 count = bytes_to_bits(bytes, ctl->unit); 1741 end = start + count; 1742 ASSERT(end <= BITS_PER_BITMAP); 1743 1744 bitmap_clear(info->bitmap, start, count); 1745 1746 
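	/*
	 * (Note on the extent_delta accounting below: clearing a run of bits
	 * removes one free region (the delta starts at -1), but if the bit
	 * just before the range and/or the bit just after it is still set, a
	 * free region survives on that side, so the delta is bumped back up.
	 * E.g. clearing the middle of a larger free run leaves a region on
	 * each side: -1 + 1 + 1 = +1 bitmap extent overall.)
	 */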
	info->bytes -= bytes;
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
	ctl->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = 1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	info->bytes += bytes;
	ctl->free_space += bytes;

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
}

/*
 * If we cannot find a suitable extent, use *bytes to record the size of the
 * largest free extent we did find.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
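	 *
	 * (For example, if a previous search recorded max_extent_size as
	 * 64KiB and the caller now asks for 256KiB, the check below returns
	 * -1 immediately and hands back 64KiB in *bytes instead of rescanning
	 * the whole bitmap.)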
1818 */ 1819 if (for_alloc && 1820 bitmap_info->max_extent_size && 1821 bitmap_info->max_extent_size < *bytes) { 1822 *bytes = bitmap_info->max_extent_size; 1823 return -1; 1824 } 1825 1826 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1827 max_t(u64, *offset, bitmap_info->offset)); 1828 bits = bytes_to_bits(*bytes, ctl->unit); 1829 1830 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1831 if (for_alloc && bits == 1) { 1832 found_bits = 1; 1833 break; 1834 } 1835 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1836 BITS_PER_BITMAP, i); 1837 extent_bits = next_zero - i; 1838 if (extent_bits >= bits) { 1839 found_bits = extent_bits; 1840 break; 1841 } else if (extent_bits > max_bits) { 1842 max_bits = extent_bits; 1843 } 1844 i = next_zero; 1845 } 1846 1847 if (found_bits) { 1848 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1849 *bytes = (u64)(found_bits) * ctl->unit; 1850 return 0; 1851 } 1852 1853 *bytes = (u64)(max_bits) * ctl->unit; 1854 bitmap_info->max_extent_size = *bytes; 1855 return -1; 1856 } 1857 1858 static inline u64 get_max_extent_size(struct btrfs_free_space *entry) 1859 { 1860 if (entry->bitmap) 1861 return entry->max_extent_size; 1862 return entry->bytes; 1863 } 1864 1865 /* Cache the size of the max extent in bytes */ 1866 static struct btrfs_free_space * 1867 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1868 unsigned long align, u64 *max_extent_size) 1869 { 1870 struct btrfs_free_space *entry; 1871 struct rb_node *node; 1872 u64 tmp; 1873 u64 align_off; 1874 int ret; 1875 1876 if (!ctl->free_space_offset.rb_node) 1877 goto out; 1878 1879 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); 1880 if (!entry) 1881 goto out; 1882 1883 for (node = &entry->offset_index; node; node = rb_next(node)) { 1884 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1885 if (entry->bytes < *bytes) { 1886 *max_extent_size = max(get_max_extent_size(entry), 1887 *max_extent_size); 1888 continue; 1889 } 1890 1891 /* make sure the space returned is big enough 1892 * to match our requested alignment 1893 */ 1894 if (*bytes >= align) { 1895 tmp = entry->offset - ctl->start + align - 1; 1896 tmp = div64_u64(tmp, align); 1897 tmp = tmp * align + ctl->start; 1898 align_off = tmp - entry->offset; 1899 } else { 1900 align_off = 0; 1901 tmp = entry->offset; 1902 } 1903 1904 if (entry->bytes < *bytes + align_off) { 1905 *max_extent_size = max(get_max_extent_size(entry), 1906 *max_extent_size); 1907 continue; 1908 } 1909 1910 if (entry->bitmap) { 1911 u64 size = *bytes; 1912 1913 ret = search_bitmap(ctl, entry, &tmp, &size, true); 1914 if (!ret) { 1915 *offset = tmp; 1916 *bytes = size; 1917 return entry; 1918 } else { 1919 *max_extent_size = 1920 max(get_max_extent_size(entry), 1921 *max_extent_size); 1922 } 1923 continue; 1924 } 1925 1926 *offset = tmp; 1927 *bytes = entry->bytes - align_off; 1928 return entry; 1929 } 1930 out: 1931 return NULL; 1932 } 1933 1934 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 1935 struct btrfs_free_space *info, u64 offset) 1936 { 1937 info->offset = offset_to_bitmap(ctl, offset); 1938 info->bytes = 0; 1939 info->bitmap_extents = 0; 1940 INIT_LIST_HEAD(&info->list); 1941 link_free_space(ctl, info); 1942 ctl->total_bitmaps++; 1943 recalculate_thresholds(ctl); 1944 } 1945 1946 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 1947 struct btrfs_free_space *bitmap_info) 1948 { 1949 /* 1950 * Normally when this is called, the bitmap is completely empty. 
	 * However, if we are blowing up the free space cache for one reason or
	 * another via __btrfs_remove_free_space_cache(), then it may not be
	 * freed and we may leave stats on the table.
	 */
	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			bitmap_info->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
	}
	unlink_free_space(ctl, bitmap_info);
	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	recalculate_thresholds(ctl);
}

static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *bitmap_info,
				       u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and
	 * then go searching for the next bit.
	 */
	search_start = *offset;
	search_bytes = ctl->unit;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
			    false);
	if (ret < 0 || search_start != *offset)
		return -EINVAL;

	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * No entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * If the next entry isn't a bitmap we need to return to let
		 * the extent handling code do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok, the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space, so look
		 * for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = ctl->unit;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes, false);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes, enum btrfs_trim_state trim_state)
{
	u64 bytes_to_set = 0;
	u64 end;

	/*
	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
	 * whole bitmap untrimmed if at any point we add untrimmed regions.
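	 *
	 * (For example, if a single untrimmed 4KiB region is added to an
	 * otherwise fully trimmed bitmap, the whole bitmap is flipped to
	 * BTRFS_TRIM_STATE_UNTRIMMED and all of its free space is counted as
	 * discardable again.)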
2055 */ 2056 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2057 if (btrfs_free_space_trimmed(info)) { 2058 ctl->discardable_extents[BTRFS_STAT_CURR] += 2059 info->bitmap_extents; 2060 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2061 } 2062 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2063 } 2064 2065 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2066 2067 bytes_to_set = min(end - offset, bytes); 2068 2069 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2070 2071 /* 2072 * We set some bytes, we have no idea what the max extent size is 2073 * anymore. 2074 */ 2075 info->max_extent_size = 0; 2076 2077 return bytes_to_set; 2078 2079 } 2080 2081 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2082 struct btrfs_free_space *info) 2083 { 2084 struct btrfs_block_group *block_group = ctl->private; 2085 struct btrfs_fs_info *fs_info = block_group->fs_info; 2086 bool forced = false; 2087 2088 #ifdef CONFIG_BTRFS_DEBUG 2089 if (btrfs_should_fragment_free_space(block_group)) 2090 forced = true; 2091 #endif 2092 2093 /* This is a way to reclaim large regions from the bitmaps. */ 2094 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2095 return false; 2096 2097 /* 2098 * If we are below the extents threshold then we can add this as an 2099 * extent, and don't have to deal with the bitmap 2100 */ 2101 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2102 /* 2103 * If this block group has some small extents we don't want to 2104 * use up all of our free slots in the cache with them, we want 2105 * to reserve them for larger extents, however if we have plenty 2106 * of cache left then go ahead and add them, no sense in adding 2107 * the overhead of a bitmap if we don't have to. 2108 */ 2109 if (info->bytes <= fs_info->sectorsize * 8) { 2110 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2111 return false; 2112 } else { 2113 return false; 2114 } 2115 } 2116 2117 /* 2118 * The original block groups from mkfs can be really small, like 8 2119 * megabytes, so don't bother with a bitmap for those entries. However 2120 * some block groups can be smaller than what a bitmap would cover but 2121 * are still large enough that they could overflow the 32k memory limit, 2122 * so still allow those block groups to have a bitmap 2123 * entry. 2124 */ 2125 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2126 return false; 2127 2128 return true; 2129 } 2130 2131 static const struct btrfs_free_space_op free_space_op = { 2132 .use_bitmap = use_bitmap, 2133 }; 2134 2135 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2136 struct btrfs_free_space *info) 2137 { 2138 struct btrfs_free_space *bitmap_info; 2139 struct btrfs_block_group *block_group = NULL; 2140 int added = 0; 2141 u64 bytes, offset, bytes_added; 2142 enum btrfs_trim_state trim_state; 2143 int ret; 2144 2145 bytes = info->bytes; 2146 offset = info->offset; 2147 trim_state = info->trim_state; 2148 2149 if (!ctl->op->use_bitmap(ctl, info)) 2150 return 0; 2151 2152 if (ctl->op == &free_space_op) 2153 block_group = ctl->private; 2154 again: 2155 /* 2156 * Since we link bitmaps right into the cluster we need to see if we 2157 * have a cluster here, and if so and it has our bitmap we need to add 2158 * the free space to that bitmap.
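* Only the first node of the cluster's rbtree is checked below; if that entry is not a bitmap we fall through to the regular lookup via tree_search_offset().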
2159 */ 2160 if (block_group && !list_empty(&block_group->cluster_list)) { 2161 struct btrfs_free_cluster *cluster; 2162 struct rb_node *node; 2163 struct btrfs_free_space *entry; 2164 2165 cluster = list_entry(block_group->cluster_list.next, 2166 struct btrfs_free_cluster, 2167 block_group_list); 2168 spin_lock(&cluster->lock); 2169 node = rb_first(&cluster->root); 2170 if (!node) { 2171 spin_unlock(&cluster->lock); 2172 goto no_cluster_bitmap; 2173 } 2174 2175 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2176 if (!entry->bitmap) { 2177 spin_unlock(&cluster->lock); 2178 goto no_cluster_bitmap; 2179 } 2180 2181 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2182 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2183 bytes, trim_state); 2184 bytes -= bytes_added; 2185 offset += bytes_added; 2186 } 2187 spin_unlock(&cluster->lock); 2188 if (!bytes) { 2189 ret = 1; 2190 goto out; 2191 } 2192 } 2193 2194 no_cluster_bitmap: 2195 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2196 1, 0); 2197 if (!bitmap_info) { 2198 ASSERT(added == 0); 2199 goto new_bitmap; 2200 } 2201 2202 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2203 trim_state); 2204 bytes -= bytes_added; 2205 offset += bytes_added; 2206 added = 0; 2207 2208 if (!bytes) { 2209 ret = 1; 2210 goto out; 2211 } else 2212 goto again; 2213 2214 new_bitmap: 2215 if (info && info->bitmap) { 2216 add_new_bitmap(ctl, info, offset); 2217 added = 1; 2218 info = NULL; 2219 goto again; 2220 } else { 2221 spin_unlock(&ctl->tree_lock); 2222 2223 /* no pre-allocated info, allocate a new one */ 2224 if (!info) { 2225 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2226 GFP_NOFS); 2227 if (!info) { 2228 spin_lock(&ctl->tree_lock); 2229 ret = -ENOMEM; 2230 goto out; 2231 } 2232 } 2233 2234 /* allocate the bitmap */ 2235 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2236 GFP_NOFS); 2237 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2238 spin_lock(&ctl->tree_lock); 2239 if (!info->bitmap) { 2240 ret = -ENOMEM; 2241 goto out; 2242 } 2243 goto again; 2244 } 2245 2246 out: 2247 if (info) { 2248 if (info->bitmap) 2249 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2250 info->bitmap); 2251 kmem_cache_free(btrfs_free_space_cachep, info); 2252 } 2253 2254 return ret; 2255 } 2256 2257 /* 2258 * Free space merging rules: 2259 * 1) Merge trimmed areas together 2260 * 2) Let untrimmed areas coalesce with trimmed areas 2261 * 3) Always pull neighboring regions from bitmaps 2262 * 2263 * The above rules are for when we merge free space based on btrfs_trim_state. 2264 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2265 * same reason: to promote larger extent regions which makes life easier for 2266 * find_free_extent(). Rule 2 enables coalescing based on the common path 2267 * being returning free space from btrfs_finish_extent_commit(). So when free 2268 * space is trimmed, it will prevent aggregating trimmed new region and 2269 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2270 * and provide find_free_extent() with the largest extents possible hoping for 2271 * the reuse path. 
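* As a worked example of rule 2: freeing an untrimmed 64KiB extent directly after a trimmed 256KiB extent produces a single 320KiB extent entry marked untrimmed, trading discard granularity for a larger contiguous allocation target.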
2272 */ 2273 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2274 struct btrfs_free_space *info, bool update_stat) 2275 { 2276 struct btrfs_free_space *left_info = NULL; 2277 struct btrfs_free_space *right_info; 2278 bool merged = false; 2279 u64 offset = info->offset; 2280 u64 bytes = info->bytes; 2281 const bool is_trimmed = btrfs_free_space_trimmed(info); 2282 2283 /* 2284 * first we want to see if there is free space adjacent to the range we 2285 * are adding, if there is remove that struct and add a new one to 2286 * cover the entire range 2287 */ 2288 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2289 if (right_info && rb_prev(&right_info->offset_index)) 2290 left_info = rb_entry(rb_prev(&right_info->offset_index), 2291 struct btrfs_free_space, offset_index); 2292 else if (!right_info) 2293 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2294 2295 /* See try_merge_free_space() comment. */ 2296 if (right_info && !right_info->bitmap && 2297 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2298 if (update_stat) 2299 unlink_free_space(ctl, right_info); 2300 else 2301 __unlink_free_space(ctl, right_info); 2302 info->bytes += right_info->bytes; 2303 kmem_cache_free(btrfs_free_space_cachep, right_info); 2304 merged = true; 2305 } 2306 2307 /* See try_merge_free_space() comment. */ 2308 if (left_info && !left_info->bitmap && 2309 left_info->offset + left_info->bytes == offset && 2310 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2311 if (update_stat) 2312 unlink_free_space(ctl, left_info); 2313 else 2314 __unlink_free_space(ctl, left_info); 2315 info->offset = left_info->offset; 2316 info->bytes += left_info->bytes; 2317 kmem_cache_free(btrfs_free_space_cachep, left_info); 2318 merged = true; 2319 } 2320 2321 return merged; 2322 } 2323 2324 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2325 struct btrfs_free_space *info, 2326 bool update_stat) 2327 { 2328 struct btrfs_free_space *bitmap; 2329 unsigned long i; 2330 unsigned long j; 2331 const u64 end = info->offset + info->bytes; 2332 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2333 u64 bytes; 2334 2335 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2336 if (!bitmap) 2337 return false; 2338 2339 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2340 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2341 if (j == i) 2342 return false; 2343 bytes = (j - i) * ctl->unit; 2344 info->bytes += bytes; 2345 2346 /* See try_merge_free_space() comment. */ 2347 if (!btrfs_free_space_trimmed(bitmap)) 2348 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2349 2350 if (update_stat) 2351 bitmap_clear_bits(ctl, bitmap, end, bytes); 2352 else 2353 __bitmap_clear_bits(ctl, bitmap, end, bytes); 2354 2355 if (!bitmap->bytes) 2356 free_bitmap(ctl, bitmap); 2357 2358 return true; 2359 } 2360 2361 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2362 struct btrfs_free_space *info, 2363 bool update_stat) 2364 { 2365 struct btrfs_free_space *bitmap; 2366 u64 bitmap_offset; 2367 unsigned long i; 2368 unsigned long j; 2369 unsigned long prev_j; 2370 u64 bytes; 2371 2372 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2373 /* If we're on a boundary, try the previous logical bitmap. 
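* (An extent that begins exactly on a bitmap boundary can only be extended backwards by bits belonging to the bitmap that ends at that boundary, i.e. the previous one.)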
*/ 2374 if (bitmap_offset == info->offset) { 2375 if (info->offset == 0) 2376 return false; 2377 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2378 } 2379 2380 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2381 if (!bitmap) 2382 return false; 2383 2384 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2385 j = 0; 2386 prev_j = (unsigned long)-1; 2387 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2388 if (j > i) 2389 break; 2390 prev_j = j; 2391 } 2392 if (prev_j == i) 2393 return false; 2394 2395 if (prev_j == (unsigned long)-1) 2396 bytes = (i + 1) * ctl->unit; 2397 else 2398 bytes = (i - prev_j) * ctl->unit; 2399 2400 info->offset -= bytes; 2401 info->bytes += bytes; 2402 2403 /* See try_merge_free_space() comment. */ 2404 if (!btrfs_free_space_trimmed(bitmap)) 2405 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2406 2407 if (update_stat) 2408 bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2409 else 2410 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2411 2412 if (!bitmap->bytes) 2413 free_bitmap(ctl, bitmap); 2414 2415 return true; 2416 } 2417 2418 /* 2419 * We prefer always to allocate from extent entries, both for clustered and 2420 * non-clustered allocation requests. So when attempting to add a new extent 2421 * entry, try to see if there's adjacent free space in bitmap entries, and if 2422 * there is, migrate that space from the bitmaps to the extent. 2423 * Like this we get better chances of satisfying space allocation requests 2424 * because we attempt to satisfy them based on a single cache entry, and never 2425 * on 2 or more entries - even if the entries represent a contiguous free space 2426 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2427 * ends). 2428 */ 2429 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2430 struct btrfs_free_space *info, 2431 bool update_stat) 2432 { 2433 /* 2434 * Only work with disconnected entries, as we can change their offset, 2435 * and must be extent entries. 
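* The caller passes an entry that is either freshly allocated or already unlinked (see the RB_EMPTY_NODE assertion below), so adjusting info->offset and info->bytes here cannot break the rbtree ordering.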
2436 */ 2437 ASSERT(!info->bitmap); 2438 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2439 2440 if (ctl->total_bitmaps > 0) { 2441 bool stole_end; 2442 bool stole_front = false; 2443 2444 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2445 if (ctl->total_bitmaps > 0) 2446 stole_front = steal_from_bitmap_to_front(ctl, info, 2447 update_stat); 2448 2449 if (stole_end || stole_front) 2450 try_merge_free_space(ctl, info, update_stat); 2451 } 2452 } 2453 2454 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, 2455 struct btrfs_free_space_ctl *ctl, 2456 u64 offset, u64 bytes, 2457 enum btrfs_trim_state trim_state) 2458 { 2459 struct btrfs_block_group *block_group = ctl->private; 2460 struct btrfs_free_space *info; 2461 int ret = 0; 2462 u64 filter_bytes = bytes; 2463 2464 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2465 if (!info) 2466 return -ENOMEM; 2467 2468 info->offset = offset; 2469 info->bytes = bytes; 2470 info->trim_state = trim_state; 2471 RB_CLEAR_NODE(&info->offset_index); 2472 2473 spin_lock(&ctl->tree_lock); 2474 2475 if (try_merge_free_space(ctl, info, true)) 2476 goto link; 2477 2478 /* 2479 * There was no extent directly to the left or right of this new 2480 * extent then we know we're going to have to allocate a new extent, so 2481 * before we do that see if we need to drop this into a bitmap 2482 */ 2483 ret = insert_into_bitmap(ctl, info); 2484 if (ret < 0) { 2485 goto out; 2486 } else if (ret) { 2487 ret = 0; 2488 goto out; 2489 } 2490 link: 2491 /* 2492 * Only steal free space from adjacent bitmaps if we're sure we're not 2493 * going to add the new free space to existing bitmap entries - because 2494 * that would mean unnecessary work that would be reverted. Therefore 2495 * attempt to steal space from bitmaps if we're adding an extent entry. 2496 */ 2497 steal_from_bitmap(ctl, info, true); 2498 2499 filter_bytes = max(filter_bytes, info->bytes); 2500 2501 ret = link_free_space(ctl, info); 2502 if (ret) 2503 kmem_cache_free(btrfs_free_space_cachep, info); 2504 out: 2505 btrfs_discard_update_discardable(block_group); 2506 spin_unlock(&ctl->tree_lock); 2507 2508 if (ret) { 2509 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2510 ASSERT(ret != -EEXIST); 2511 } 2512 2513 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2514 btrfs_discard_check_filter(block_group, filter_bytes); 2515 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2516 } 2517 2518 return ret; 2519 } 2520 2521 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2522 u64 bytenr, u64 size) 2523 { 2524 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2525 2526 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2527 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2528 2529 return __btrfs_add_free_space(block_group->fs_info, 2530 block_group->free_space_ctl, 2531 bytenr, size, trim_state); 2532 } 2533 2534 /* 2535 * This is a subtle distinction because when adding free space back in general, 2536 * we want it to be added as untrimmed for async. But in the case where we add 2537 * it on loading of a block group, we want to consider it trimmed. 
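* With async discard enabled, space that is already free when the block group is loaded is therefore not queued up to be discarded all over again.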
2538 */ 2539 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2540 u64 bytenr, u64 size) 2541 { 2542 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2543 2544 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2545 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2546 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2547 2548 return __btrfs_add_free_space(block_group->fs_info, 2549 block_group->free_space_ctl, 2550 bytenr, size, trim_state); 2551 } 2552 2553 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2554 u64 offset, u64 bytes) 2555 { 2556 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2557 struct btrfs_free_space *info; 2558 int ret; 2559 bool re_search = false; 2560 2561 spin_lock(&ctl->tree_lock); 2562 2563 again: 2564 ret = 0; 2565 if (!bytes) 2566 goto out_lock; 2567 2568 info = tree_search_offset(ctl, offset, 0, 0); 2569 if (!info) { 2570 /* 2571 * oops didn't find an extent that matched the space we wanted 2572 * to remove, look for a bitmap instead 2573 */ 2574 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2575 1, 0); 2576 if (!info) { 2577 /* 2578 * If we found a partial bit of our free space in a 2579 * bitmap but then couldn't find the other part this may 2580 * be a problem, so WARN about it. 2581 */ 2582 WARN_ON(re_search); 2583 goto out_lock; 2584 } 2585 } 2586 2587 re_search = false; 2588 if (!info->bitmap) { 2589 unlink_free_space(ctl, info); 2590 if (offset == info->offset) { 2591 u64 to_free = min(bytes, info->bytes); 2592 2593 info->bytes -= to_free; 2594 info->offset += to_free; 2595 if (info->bytes) { 2596 ret = link_free_space(ctl, info); 2597 WARN_ON(ret); 2598 } else { 2599 kmem_cache_free(btrfs_free_space_cachep, info); 2600 } 2601 2602 offset += to_free; 2603 bytes -= to_free; 2604 goto again; 2605 } else { 2606 u64 old_end = info->bytes + info->offset; 2607 2608 info->bytes = offset - info->offset; 2609 ret = link_free_space(ctl, info); 2610 WARN_ON(ret); 2611 if (ret) 2612 goto out_lock; 2613 2614 /* Not enough bytes in this entry to satisfy us */ 2615 if (old_end < offset + bytes) { 2616 bytes -= old_end - offset; 2617 offset = old_end; 2618 goto again; 2619 } else if (old_end == offset + bytes) { 2620 /* all done */ 2621 goto out_lock; 2622 } 2623 spin_unlock(&ctl->tree_lock); 2624 2625 ret = __btrfs_add_free_space(block_group->fs_info, ctl, 2626 offset + bytes, 2627 old_end - (offset + bytes), 2628 info->trim_state); 2629 WARN_ON(ret); 2630 goto out; 2631 } 2632 } 2633 2634 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2635 if (ret == -EAGAIN) { 2636 re_search = true; 2637 goto again; 2638 } 2639 out_lock: 2640 btrfs_discard_update_discardable(block_group); 2641 spin_unlock(&ctl->tree_lock); 2642 out: 2643 return ret; 2644 } 2645 2646 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2647 u64 bytes) 2648 { 2649 struct btrfs_fs_info *fs_info = block_group->fs_info; 2650 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2651 struct btrfs_free_space *info; 2652 struct rb_node *n; 2653 int count = 0; 2654 2655 spin_lock(&ctl->tree_lock); 2656 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2657 info = rb_entry(n, struct btrfs_free_space, offset_index); 2658 if (info->bytes >= bytes && !block_group->ro) 2659 count++; 2660 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2661 info->offset, info->bytes, 2662 (info->bitmap) ? 
"yes" : "no"); 2663 } 2664 spin_unlock(&ctl->tree_lock); 2665 btrfs_info(fs_info, "block group has cluster?: %s", 2666 list_empty(&block_group->cluster_list) ? "no" : "yes"); 2667 btrfs_info(fs_info, 2668 "%d blocks of free space at or bigger than bytes is", count); 2669 } 2670 2671 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2672 struct btrfs_free_space_ctl *ctl) 2673 { 2674 struct btrfs_fs_info *fs_info = block_group->fs_info; 2675 2676 spin_lock_init(&ctl->tree_lock); 2677 ctl->unit = fs_info->sectorsize; 2678 ctl->start = block_group->start; 2679 ctl->private = block_group; 2680 ctl->op = &free_space_op; 2681 INIT_LIST_HEAD(&ctl->trimming_ranges); 2682 mutex_init(&ctl->cache_writeout_mutex); 2683 2684 /* 2685 * we only want to have 32k of ram per block group for keeping 2686 * track of free space, and if we pass 1/2 of that we want to 2687 * start converting things over to using bitmaps 2688 */ 2689 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2690 } 2691 2692 /* 2693 * for a given cluster, put all of its extents back into the free 2694 * space cache. If the block group passed doesn't match the block group 2695 * pointed to by the cluster, someone else raced in and freed the 2696 * cluster already. In that case, we just return without changing anything 2697 */ 2698 static void __btrfs_return_cluster_to_free_space( 2699 struct btrfs_block_group *block_group, 2700 struct btrfs_free_cluster *cluster) 2701 { 2702 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2703 struct btrfs_free_space *entry; 2704 struct rb_node *node; 2705 2706 spin_lock(&cluster->lock); 2707 if (cluster->block_group != block_group) 2708 goto out; 2709 2710 cluster->block_group = NULL; 2711 cluster->window_start = 0; 2712 list_del_init(&cluster->block_group_list); 2713 2714 node = rb_first(&cluster->root); 2715 while (node) { 2716 bool bitmap; 2717 2718 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2719 node = rb_next(&entry->offset_index); 2720 rb_erase(&entry->offset_index, &cluster->root); 2721 RB_CLEAR_NODE(&entry->offset_index); 2722 2723 bitmap = (entry->bitmap != NULL); 2724 if (!bitmap) { 2725 /* Merging treats extents as if they were new */ 2726 if (!btrfs_free_space_trimmed(entry)) { 2727 ctl->discardable_extents[BTRFS_STAT_CURR]--; 2728 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 2729 entry->bytes; 2730 } 2731 2732 try_merge_free_space(ctl, entry, false); 2733 steal_from_bitmap(ctl, entry, false); 2734 2735 /* As we insert directly, update these statistics */ 2736 if (!btrfs_free_space_trimmed(entry)) { 2737 ctl->discardable_extents[BTRFS_STAT_CURR]++; 2738 ctl->discardable_bytes[BTRFS_STAT_CURR] += 2739 entry->bytes; 2740 } 2741 } 2742 tree_insert_offset(&ctl->free_space_offset, 2743 entry->offset, &entry->offset_index, bitmap); 2744 } 2745 cluster->root = RB_ROOT; 2746 2747 out: 2748 spin_unlock(&cluster->lock); 2749 btrfs_put_block_group(block_group); 2750 } 2751 2752 static void __btrfs_remove_free_space_cache_locked( 2753 struct btrfs_free_space_ctl *ctl) 2754 { 2755 struct btrfs_free_space *info; 2756 struct rb_node *node; 2757 2758 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 2759 info = rb_entry(node, struct btrfs_free_space, offset_index); 2760 if (!info->bitmap) { 2761 unlink_free_space(ctl, info); 2762 kmem_cache_free(btrfs_free_space_cachep, info); 2763 } else { 2764 free_bitmap(ctl, info); 2765 } 2766 2767 cond_resched_lock(&ctl->tree_lock); 2768 } 2769 } 2770 2771 void 
__btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 2772 { 2773 spin_lock(&ctl->tree_lock); 2774 __btrfs_remove_free_space_cache_locked(ctl); 2775 if (ctl->private) 2776 btrfs_discard_update_discardable(ctl->private); 2777 spin_unlock(&ctl->tree_lock); 2778 } 2779 2780 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 2781 { 2782 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2783 struct btrfs_free_cluster *cluster; 2784 struct list_head *head; 2785 2786 spin_lock(&ctl->tree_lock); 2787 while ((head = block_group->cluster_list.next) != 2788 &block_group->cluster_list) { 2789 cluster = list_entry(head, struct btrfs_free_cluster, 2790 block_group_list); 2791 2792 WARN_ON(cluster->block_group != block_group); 2793 __btrfs_return_cluster_to_free_space(block_group, cluster); 2794 2795 cond_resched_lock(&ctl->tree_lock); 2796 } 2797 __btrfs_remove_free_space_cache_locked(ctl); 2798 btrfs_discard_update_discardable(block_group); 2799 spin_unlock(&ctl->tree_lock); 2800 2801 } 2802 2803 /** 2804 * btrfs_is_free_space_trimmed - see if everything is trimmed 2805 * @block_group: block_group of interest 2806 * 2807 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 2808 */ 2809 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 2810 { 2811 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2812 struct btrfs_free_space *info; 2813 struct rb_node *node; 2814 bool ret = true; 2815 2816 spin_lock(&ctl->tree_lock); 2817 node = rb_first(&ctl->free_space_offset); 2818 2819 while (node) { 2820 info = rb_entry(node, struct btrfs_free_space, offset_index); 2821 2822 if (!btrfs_free_space_trimmed(info)) { 2823 ret = false; 2824 break; 2825 } 2826 2827 node = rb_next(node); 2828 } 2829 2830 spin_unlock(&ctl->tree_lock); 2831 return ret; 2832 } 2833 2834 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 2835 u64 offset, u64 bytes, u64 empty_size, 2836 u64 *max_extent_size) 2837 { 2838 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2839 struct btrfs_discard_ctl *discard_ctl = 2840 &block_group->fs_info->discard_ctl; 2841 struct btrfs_free_space *entry = NULL; 2842 u64 bytes_search = bytes + empty_size; 2843 u64 ret = 0; 2844 u64 align_gap = 0; 2845 u64 align_gap_len = 0; 2846 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2847 2848 spin_lock(&ctl->tree_lock); 2849 entry = find_free_space(ctl, &offset, &bytes_search, 2850 block_group->full_stripe_len, max_extent_size); 2851 if (!entry) 2852 goto out; 2853 2854 ret = offset; 2855 if (entry->bitmap) { 2856 bitmap_clear_bits(ctl, entry, offset, bytes); 2857 2858 if (!btrfs_free_space_trimmed(entry)) 2859 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 2860 2861 if (!entry->bytes) 2862 free_bitmap(ctl, entry); 2863 } else { 2864 unlink_free_space(ctl, entry); 2865 align_gap_len = offset - entry->offset; 2866 align_gap = entry->offset; 2867 align_gap_trim_state = entry->trim_state; 2868 2869 if (!btrfs_free_space_trimmed(entry)) 2870 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 2871 2872 entry->offset = offset + bytes; 2873 WARN_ON(entry->bytes < bytes + align_gap_len); 2874 2875 entry->bytes -= bytes + align_gap_len; 2876 if (!entry->bytes) 2877 kmem_cache_free(btrfs_free_space_cachep, entry); 2878 else 2879 link_free_space(ctl, entry); 2880 } 2881 out: 2882 btrfs_discard_update_discardable(block_group); 2883 spin_unlock(&ctl->tree_lock); 2884 2885 if (align_gap_len) 2886 
__btrfs_add_free_space(block_group->fs_info, ctl, 2887 align_gap, align_gap_len, 2888 align_gap_trim_state); 2889 return ret; 2890 } 2891 2892 /* 2893 * given a cluster, put all of its extents back into the free space 2894 * cache. If a block group is passed, this function will only free 2895 * a cluster that belongs to the passed block group. 2896 * 2897 * Otherwise, it'll get a reference on the block group pointed to by the 2898 * cluster and remove the cluster from it. 2899 */ 2900 void btrfs_return_cluster_to_free_space( 2901 struct btrfs_block_group *block_group, 2902 struct btrfs_free_cluster *cluster) 2903 { 2904 struct btrfs_free_space_ctl *ctl; 2905 2906 /* first, get a safe pointer to the block group */ 2907 spin_lock(&cluster->lock); 2908 if (!block_group) { 2909 block_group = cluster->block_group; 2910 if (!block_group) { 2911 spin_unlock(&cluster->lock); 2912 return; 2913 } 2914 } else if (cluster->block_group != block_group) { 2915 /* someone else has already freed it don't redo their work */ 2916 spin_unlock(&cluster->lock); 2917 return; 2918 } 2919 btrfs_get_block_group(block_group); 2920 spin_unlock(&cluster->lock); 2921 2922 ctl = block_group->free_space_ctl; 2923 2924 /* now return any extents the cluster had on it */ 2925 spin_lock(&ctl->tree_lock); 2926 __btrfs_return_cluster_to_free_space(block_group, cluster); 2927 spin_unlock(&ctl->tree_lock); 2928 2929 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 2930 2931 /* finally drop our ref */ 2932 btrfs_put_block_group(block_group); 2933 } 2934 2935 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 2936 struct btrfs_free_cluster *cluster, 2937 struct btrfs_free_space *entry, 2938 u64 bytes, u64 min_start, 2939 u64 *max_extent_size) 2940 { 2941 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2942 int err; 2943 u64 search_start = cluster->window_start; 2944 u64 search_bytes = bytes; 2945 u64 ret = 0; 2946 2947 search_start = min_start; 2948 search_bytes = bytes; 2949 2950 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 2951 if (err) { 2952 *max_extent_size = max(get_max_extent_size(entry), 2953 *max_extent_size); 2954 return 0; 2955 } 2956 2957 ret = search_start; 2958 __bitmap_clear_bits(ctl, entry, ret, bytes); 2959 2960 return ret; 2961 } 2962 2963 /* 2964 * given a cluster, try to allocate 'bytes' from it, returns 0 2965 * if it couldn't find anything suitably large, or a logical disk offset 2966 * if things worked out 2967 */ 2968 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 2969 struct btrfs_free_cluster *cluster, u64 bytes, 2970 u64 min_start, u64 *max_extent_size) 2971 { 2972 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2973 struct btrfs_discard_ctl *discard_ctl = 2974 &block_group->fs_info->discard_ctl; 2975 struct btrfs_free_space *entry = NULL; 2976 struct rb_node *node; 2977 u64 ret = 0; 2978 2979 spin_lock(&cluster->lock); 2980 if (bytes > cluster->max_size) 2981 goto out; 2982 2983 if (cluster->block_group != block_group) 2984 goto out; 2985 2986 node = rb_first(&cluster->root); 2987 if (!node) 2988 goto out; 2989 2990 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2991 while (1) { 2992 if (entry->bytes < bytes) 2993 *max_extent_size = max(get_max_extent_size(entry), 2994 *max_extent_size); 2995 2996 if (entry->bytes < bytes || 2997 (!entry->bitmap && entry->offset < min_start)) { 2998 node = rb_next(&entry->offset_index); 2999 if (!node) 3000 break; 3001 entry = 
rb_entry(node, struct btrfs_free_space, 3002 offset_index); 3003 continue; 3004 } 3005 3006 if (entry->bitmap) { 3007 ret = btrfs_alloc_from_bitmap(block_group, 3008 cluster, entry, bytes, 3009 cluster->window_start, 3010 max_extent_size); 3011 if (ret == 0) { 3012 node = rb_next(&entry->offset_index); 3013 if (!node) 3014 break; 3015 entry = rb_entry(node, struct btrfs_free_space, 3016 offset_index); 3017 continue; 3018 } 3019 cluster->window_start += bytes; 3020 } else { 3021 ret = entry->offset; 3022 3023 entry->offset += bytes; 3024 entry->bytes -= bytes; 3025 } 3026 3027 if (entry->bytes == 0) 3028 rb_erase(&entry->offset_index, &cluster->root); 3029 break; 3030 } 3031 out: 3032 spin_unlock(&cluster->lock); 3033 3034 if (!ret) 3035 return 0; 3036 3037 spin_lock(&ctl->tree_lock); 3038 3039 if (!btrfs_free_space_trimmed(entry)) 3040 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3041 3042 ctl->free_space -= bytes; 3043 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3044 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3045 if (entry->bytes == 0) { 3046 ctl->free_extents--; 3047 if (entry->bitmap) { 3048 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3049 entry->bitmap); 3050 ctl->total_bitmaps--; 3051 recalculate_thresholds(ctl); 3052 } else if (!btrfs_free_space_trimmed(entry)) { 3053 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3054 } 3055 kmem_cache_free(btrfs_free_space_cachep, entry); 3056 } 3057 3058 spin_unlock(&ctl->tree_lock); 3059 3060 return ret; 3061 } 3062 3063 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3064 struct btrfs_free_space *entry, 3065 struct btrfs_free_cluster *cluster, 3066 u64 offset, u64 bytes, 3067 u64 cont1_bytes, u64 min_bytes) 3068 { 3069 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3070 unsigned long next_zero; 3071 unsigned long i; 3072 unsigned long want_bits; 3073 unsigned long min_bits; 3074 unsigned long found_bits; 3075 unsigned long max_bits = 0; 3076 unsigned long start = 0; 3077 unsigned long total_found = 0; 3078 int ret; 3079 3080 i = offset_to_bit(entry->offset, ctl->unit, 3081 max_t(u64, offset, entry->offset)); 3082 want_bits = bytes_to_bits(bytes, ctl->unit); 3083 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3084 3085 /* 3086 * Don't bother looking for a cluster in this bitmap if it's heavily 3087 * fragmented. 
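* max_extent_size is a cached bound from earlier searches of this bitmap; if even the largest run seen so far is smaller than cont1_bytes, rescanning the bits cannot produce a usable window.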
3088 */ 3089 if (entry->max_extent_size && 3090 entry->max_extent_size < cont1_bytes) 3091 return -ENOSPC; 3092 again: 3093 found_bits = 0; 3094 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3095 next_zero = find_next_zero_bit(entry->bitmap, 3096 BITS_PER_BITMAP, i); 3097 if (next_zero - i >= min_bits) { 3098 found_bits = next_zero - i; 3099 if (found_bits > max_bits) 3100 max_bits = found_bits; 3101 break; 3102 } 3103 if (next_zero - i > max_bits) 3104 max_bits = next_zero - i; 3105 i = next_zero; 3106 } 3107 3108 if (!found_bits) { 3109 entry->max_extent_size = (u64)max_bits * ctl->unit; 3110 return -ENOSPC; 3111 } 3112 3113 if (!total_found) { 3114 start = i; 3115 cluster->max_size = 0; 3116 } 3117 3118 total_found += found_bits; 3119 3120 if (cluster->max_size < found_bits * ctl->unit) 3121 cluster->max_size = found_bits * ctl->unit; 3122 3123 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3124 i = next_zero + 1; 3125 goto again; 3126 } 3127 3128 cluster->window_start = start * ctl->unit + entry->offset; 3129 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3130 ret = tree_insert_offset(&cluster->root, entry->offset, 3131 &entry->offset_index, 1); 3132 ASSERT(!ret); /* -EEXIST; Logic error */ 3133 3134 trace_btrfs_setup_cluster(block_group, cluster, 3135 total_found * ctl->unit, 1); 3136 return 0; 3137 } 3138 3139 /* 3140 * This searches the block group for just extents to fill the cluster with. 3141 * Try to find a cluster with at least bytes total bytes, at least one 3142 * extent of cont1_bytes, and other clusters of at least min_bytes. 3143 */ 3144 static noinline int 3145 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3146 struct btrfs_free_cluster *cluster, 3147 struct list_head *bitmaps, u64 offset, u64 bytes, 3148 u64 cont1_bytes, u64 min_bytes) 3149 { 3150 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3151 struct btrfs_free_space *first = NULL; 3152 struct btrfs_free_space *entry = NULL; 3153 struct btrfs_free_space *last; 3154 struct rb_node *node; 3155 u64 window_free; 3156 u64 max_extent; 3157 u64 total_size = 0; 3158 3159 entry = tree_search_offset(ctl, offset, 0, 1); 3160 if (!entry) 3161 return -ENOSPC; 3162 3163 /* 3164 * We don't want bitmaps, so just move along until we find a normal 3165 * extent entry. 
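* Bitmap entries encountered on the way are stashed on the @bitmaps list so that setup_cluster_bitmap() can try them later without re-walking the tree.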
3166 */ 3167 while (entry->bitmap || entry->bytes < min_bytes) { 3168 if (entry->bitmap && list_empty(&entry->list)) 3169 list_add_tail(&entry->list, bitmaps); 3170 node = rb_next(&entry->offset_index); 3171 if (!node) 3172 return -ENOSPC; 3173 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3174 } 3175 3176 window_free = entry->bytes; 3177 max_extent = entry->bytes; 3178 first = entry; 3179 last = entry; 3180 3181 for (node = rb_next(&entry->offset_index); node; 3182 node = rb_next(&entry->offset_index)) { 3183 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3184 3185 if (entry->bitmap) { 3186 if (list_empty(&entry->list)) 3187 list_add_tail(&entry->list, bitmaps); 3188 continue; 3189 } 3190 3191 if (entry->bytes < min_bytes) 3192 continue; 3193 3194 last = entry; 3195 window_free += entry->bytes; 3196 if (entry->bytes > max_extent) 3197 max_extent = entry->bytes; 3198 } 3199 3200 if (window_free < bytes || max_extent < cont1_bytes) 3201 return -ENOSPC; 3202 3203 cluster->window_start = first->offset; 3204 3205 node = &first->offset_index; 3206 3207 /* 3208 * now we've found our entries, pull them out of the free space 3209 * cache and put them into the cluster rbtree 3210 */ 3211 do { 3212 int ret; 3213 3214 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3215 node = rb_next(&entry->offset_index); 3216 if (entry->bitmap || entry->bytes < min_bytes) 3217 continue; 3218 3219 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3220 ret = tree_insert_offset(&cluster->root, entry->offset, 3221 &entry->offset_index, 0); 3222 total_size += entry->bytes; 3223 ASSERT(!ret); /* -EEXIST; Logic error */ 3224 } while (node && entry != last); 3225 3226 cluster->max_size = max_extent; 3227 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3228 return 0; 3229 } 3230 3231 /* 3232 * This specifically looks for bitmaps that may work in the cluster, we assume 3233 * that we have already failed to find extents that will work. 3234 */ 3235 static noinline int 3236 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3237 struct btrfs_free_cluster *cluster, 3238 struct list_head *bitmaps, u64 offset, u64 bytes, 3239 u64 cont1_bytes, u64 min_bytes) 3240 { 3241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3242 struct btrfs_free_space *entry = NULL; 3243 int ret = -ENOSPC; 3244 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3245 3246 if (ctl->total_bitmaps == 0) 3247 return -ENOSPC; 3248 3249 /* 3250 * The bitmap that covers offset won't be in the list unless offset 3251 * is just its start offset. 3252 */ 3253 if (!list_empty(bitmaps)) 3254 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3255 3256 if (!entry || entry->offset != bitmap_offset) { 3257 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3258 if (entry && list_empty(&entry->list)) 3259 list_add(&entry->list, bitmaps); 3260 } 3261 3262 list_for_each_entry(entry, bitmaps, list) { 3263 if (entry->bytes < bytes) 3264 continue; 3265 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3266 bytes, cont1_bytes, min_bytes); 3267 if (!ret) 3268 return 0; 3269 } 3270 3271 /* 3272 * The bitmaps list has all the bitmaps that record free space 3273 * starting after offset, so no more search is required. 3274 */ 3275 return -ENOSPC; 3276 } 3277 3278 /* 3279 * here we try to find a cluster of blocks in a block group. The goal 3280 * is to find at least bytes+empty_size. 3281 * We might not find them all in one contiguous area. 
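* At least cont1_bytes of the request must come from one extent, while the remainder may be pieced together from extents of at least min_bytes; e.g. with mount -o ssd_spread both limits equal bytes + empty_size, forcing a single contiguous window.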
3282 * 3283 * returns zero and sets up cluster if things worked out, otherwise 3284 * it returns -enospc 3285 */ 3286 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3287 struct btrfs_free_cluster *cluster, 3288 u64 offset, u64 bytes, u64 empty_size) 3289 { 3290 struct btrfs_fs_info *fs_info = block_group->fs_info; 3291 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3292 struct btrfs_free_space *entry, *tmp; 3293 LIST_HEAD(bitmaps); 3294 u64 min_bytes; 3295 u64 cont1_bytes; 3296 int ret; 3297 3298 /* 3299 * Choose the minimum extent size we'll require for this 3300 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3301 * For metadata, allow allocates with smaller extents. For 3302 * data, keep it dense. 3303 */ 3304 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3305 cont1_bytes = min_bytes = bytes + empty_size; 3306 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3307 cont1_bytes = bytes; 3308 min_bytes = fs_info->sectorsize; 3309 } else { 3310 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3311 min_bytes = fs_info->sectorsize; 3312 } 3313 3314 spin_lock(&ctl->tree_lock); 3315 3316 /* 3317 * If we know we don't have enough space to make a cluster don't even 3318 * bother doing all the work to try and find one. 3319 */ 3320 if (ctl->free_space < bytes) { 3321 spin_unlock(&ctl->tree_lock); 3322 return -ENOSPC; 3323 } 3324 3325 spin_lock(&cluster->lock); 3326 3327 /* someone already found a cluster, hooray */ 3328 if (cluster->block_group) { 3329 ret = 0; 3330 goto out; 3331 } 3332 3333 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3334 min_bytes); 3335 3336 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3337 bytes + empty_size, 3338 cont1_bytes, min_bytes); 3339 if (ret) 3340 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3341 offset, bytes + empty_size, 3342 cont1_bytes, min_bytes); 3343 3344 /* Clear our temporary list */ 3345 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3346 list_del_init(&entry->list); 3347 3348 if (!ret) { 3349 btrfs_get_block_group(block_group); 3350 list_add_tail(&cluster->block_group_list, 3351 &block_group->cluster_list); 3352 cluster->block_group = block_group; 3353 } else { 3354 trace_btrfs_failed_cluster_setup(block_group); 3355 } 3356 out: 3357 spin_unlock(&cluster->lock); 3358 spin_unlock(&ctl->tree_lock); 3359 3360 return ret; 3361 } 3362 3363 /* 3364 * simple code to zero out a cluster 3365 */ 3366 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3367 { 3368 spin_lock_init(&cluster->lock); 3369 spin_lock_init(&cluster->refill_lock); 3370 cluster->root = RB_ROOT; 3371 cluster->max_size = 0; 3372 cluster->fragmented = false; 3373 INIT_LIST_HEAD(&cluster->block_group_list); 3374 cluster->block_group = NULL; 3375 } 3376 3377 static int do_trimming(struct btrfs_block_group *block_group, 3378 u64 *total_trimmed, u64 start, u64 bytes, 3379 u64 reserved_start, u64 reserved_bytes, 3380 enum btrfs_trim_state reserved_trim_state, 3381 struct btrfs_trim_range *trim_entry) 3382 { 3383 struct btrfs_space_info *space_info = block_group->space_info; 3384 struct btrfs_fs_info *fs_info = block_group->fs_info; 3385 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3386 int ret; 3387 int update = 0; 3388 const u64 end = start + bytes; 3389 const u64 reserved_end = reserved_start + reserved_bytes; 3390 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3391 u64 trimmed = 0; 3392 3393 spin_lock(&space_info->lock); 
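/* Nested lock order: space_info->lock outside block_group->lock, mirrored by the reverse unlock order below and by the update path at the end. */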
3394 spin_lock(&block_group->lock); 3395 if (!block_group->ro) { 3396 block_group->reserved += reserved_bytes; 3397 space_info->bytes_reserved += reserved_bytes; 3398 update = 1; 3399 } 3400 spin_unlock(&block_group->lock); 3401 spin_unlock(&space_info->lock); 3402 3403 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3404 if (!ret) { 3405 *total_trimmed += trimmed; 3406 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3407 } 3408 3409 mutex_lock(&ctl->cache_writeout_mutex); 3410 if (reserved_start < start) 3411 __btrfs_add_free_space(fs_info, ctl, reserved_start, 3412 start - reserved_start, 3413 reserved_trim_state); 3414 if (start + bytes < reserved_start + reserved_bytes) 3415 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, 3416 reserved_trim_state); 3417 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); 3418 list_del(&trim_entry->list); 3419 mutex_unlock(&ctl->cache_writeout_mutex); 3420 3421 if (update) { 3422 spin_lock(&space_info->lock); 3423 spin_lock(&block_group->lock); 3424 if (block_group->ro) 3425 space_info->bytes_readonly += reserved_bytes; 3426 block_group->reserved -= reserved_bytes; 3427 space_info->bytes_reserved -= reserved_bytes; 3428 spin_unlock(&block_group->lock); 3429 spin_unlock(&space_info->lock); 3430 } 3431 3432 return ret; 3433 } 3434 3435 /* 3436 * If @async is set, then we will trim 1 region and return. 3437 */ 3438 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3439 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3440 bool async) 3441 { 3442 struct btrfs_discard_ctl *discard_ctl = 3443 &block_group->fs_info->discard_ctl; 3444 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3445 struct btrfs_free_space *entry; 3446 struct rb_node *node; 3447 int ret = 0; 3448 u64 extent_start; 3449 u64 extent_bytes; 3450 enum btrfs_trim_state extent_trim_state; 3451 u64 bytes; 3452 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3453 3454 while (start < end) { 3455 struct btrfs_trim_range trim_entry; 3456 3457 mutex_lock(&ctl->cache_writeout_mutex); 3458 spin_lock(&ctl->tree_lock); 3459 3460 if (ctl->free_space < minlen) 3461 goto out_unlock; 3462 3463 entry = tree_search_offset(ctl, start, 0, 1); 3464 if (!entry) 3465 goto out_unlock; 3466 3467 /* Skip bitmaps and if async, already trimmed entries */ 3468 while (entry->bitmap || 3469 (async && btrfs_free_space_trimmed(entry))) { 3470 node = rb_next(&entry->offset_index); 3471 if (!node) 3472 goto out_unlock; 3473 entry = rb_entry(node, struct btrfs_free_space, 3474 offset_index); 3475 } 3476 3477 if (entry->offset >= end) 3478 goto out_unlock; 3479 3480 extent_start = entry->offset; 3481 extent_bytes = entry->bytes; 3482 extent_trim_state = entry->trim_state; 3483 if (async) { 3484 start = entry->offset; 3485 bytes = entry->bytes; 3486 if (bytes < minlen) { 3487 spin_unlock(&ctl->tree_lock); 3488 mutex_unlock(&ctl->cache_writeout_mutex); 3489 goto next; 3490 } 3491 unlink_free_space(ctl, entry); 3492 /* 3493 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3494 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3495 * X when we come back around. So trim it now. 
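* In other words: if the tail X is big enough to survive the minimum discard filter, only max_discard_size is trimmed now and the remainder is re-linked for a later pass; otherwise the whole, slightly oversized, extent is discarded in one go rather than leaving a sliver behind.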
3496 */ 3497 if (max_discard_size && 3498 bytes >= (max_discard_size + 3499 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3500 bytes = max_discard_size; 3501 extent_bytes = max_discard_size; 3502 entry->offset += max_discard_size; 3503 entry->bytes -= max_discard_size; 3504 link_free_space(ctl, entry); 3505 } else { 3506 kmem_cache_free(btrfs_free_space_cachep, entry); 3507 } 3508 } else { 3509 start = max(start, extent_start); 3510 bytes = min(extent_start + extent_bytes, end) - start; 3511 if (bytes < minlen) { 3512 spin_unlock(&ctl->tree_lock); 3513 mutex_unlock(&ctl->cache_writeout_mutex); 3514 goto next; 3515 } 3516 3517 unlink_free_space(ctl, entry); 3518 kmem_cache_free(btrfs_free_space_cachep, entry); 3519 } 3520 3521 spin_unlock(&ctl->tree_lock); 3522 trim_entry.start = extent_start; 3523 trim_entry.bytes = extent_bytes; 3524 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3525 mutex_unlock(&ctl->cache_writeout_mutex); 3526 3527 ret = do_trimming(block_group, total_trimmed, start, bytes, 3528 extent_start, extent_bytes, extent_trim_state, 3529 &trim_entry); 3530 if (ret) { 3531 block_group->discard_cursor = start + bytes; 3532 break; 3533 } 3534 next: 3535 start += bytes; 3536 block_group->discard_cursor = start; 3537 if (async && *total_trimmed) 3538 break; 3539 3540 if (fatal_signal_pending(current)) { 3541 ret = -ERESTARTSYS; 3542 break; 3543 } 3544 3545 cond_resched(); 3546 } 3547 3548 return ret; 3549 3550 out_unlock: 3551 block_group->discard_cursor = btrfs_block_group_end(block_group); 3552 spin_unlock(&ctl->tree_lock); 3553 mutex_unlock(&ctl->cache_writeout_mutex); 3554 3555 return ret; 3556 } 3557 3558 /* 3559 * If we break out of trimming a bitmap prematurely, we should reset the 3560 * trimming bit. In a rather contrived case, it's possible to race here so 3561 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3562 * 3563 * start = start of bitmap 3564 * end = near end of bitmap 3565 * 3566 * Interleaving (thread 1 vs thread 2): 3567 * Thread 1: trim_bitmaps(start) 3568 * Thread 2: trim_bitmaps(end) 3569 * Thread 2: end_trimming_bitmap() 3570 * Thread 1: reset_trimming_bitmap() 3571 */ 3572 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3573 { 3574 struct btrfs_free_space *entry; 3575 3576 spin_lock(&ctl->tree_lock); 3577 entry = tree_search_offset(ctl, offset, 1, 0); 3578 if (entry) { 3579 if (btrfs_free_space_trimmed(entry)) { 3580 ctl->discardable_extents[BTRFS_STAT_CURR] += 3581 entry->bitmap_extents; 3582 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3583 } 3584 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3585 } 3586 3587 spin_unlock(&ctl->tree_lock); 3588 } 3589 3590 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3591 struct btrfs_free_space *entry) 3592 { 3593 if (btrfs_free_space_trimming_bitmap(entry)) { 3594 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3595 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3596 entry->bitmap_extents; 3597 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3598 } 3599 } 3600 3601 /* 3602 * If @async is set, then we will trim 1 region and return.
3603 */ 3604 static int trim_bitmaps(struct btrfs_block_group *block_group, 3605 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3606 u64 maxlen, bool async) 3607 { 3608 struct btrfs_discard_ctl *discard_ctl = 3609 &block_group->fs_info->discard_ctl; 3610 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3611 struct btrfs_free_space *entry; 3612 int ret = 0; 3613 int ret2; 3614 u64 bytes; 3615 u64 offset = offset_to_bitmap(ctl, start); 3616 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3617 3618 while (offset < end) { 3619 bool next_bitmap = false; 3620 struct btrfs_trim_range trim_entry; 3621 3622 mutex_lock(&ctl->cache_writeout_mutex); 3623 spin_lock(&ctl->tree_lock); 3624 3625 if (ctl->free_space < minlen) { 3626 block_group->discard_cursor = 3627 btrfs_block_group_end(block_group); 3628 spin_unlock(&ctl->tree_lock); 3629 mutex_unlock(&ctl->cache_writeout_mutex); 3630 break; 3631 } 3632 3633 entry = tree_search_offset(ctl, offset, 1, 0); 3634 /* 3635 * Bitmaps are marked trimmed lossily now to prevent constant 3636 * discarding of the same bitmap (the reason why we are bound 3637 * by the filters). So, retrim the block group bitmaps when we 3638 * are preparing to punt to the unused_bgs list. This uses 3639 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3640 * which is the only discard index which sets minlen to 0. 3641 */ 3642 if (!entry || (async && minlen && start == offset && 3643 btrfs_free_space_trimmed(entry))) { 3644 spin_unlock(&ctl->tree_lock); 3645 mutex_unlock(&ctl->cache_writeout_mutex); 3646 next_bitmap = true; 3647 goto next; 3648 } 3649 3650 /* 3651 * Async discard bitmap trimming begins at by setting the start 3652 * to be key.objectid and the offset_to_bitmap() aligns to the 3653 * start of the bitmap. This lets us know we are fully 3654 * scanning the bitmap rather than only some portion of it. 3655 */ 3656 if (start == offset) 3657 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3658 3659 bytes = minlen; 3660 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3661 if (ret2 || start >= end) { 3662 /* 3663 * We lossily consider a bitmap trimmed if we only skip 3664 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3665 */ 3666 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3667 end_trimming_bitmap(ctl, entry); 3668 else 3669 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3670 spin_unlock(&ctl->tree_lock); 3671 mutex_unlock(&ctl->cache_writeout_mutex); 3672 next_bitmap = true; 3673 goto next; 3674 } 3675 3676 /* 3677 * We already trimmed a region, but are using the locking above 3678 * to reset the trim_state. 3679 */ 3680 if (async && *total_trimmed) { 3681 spin_unlock(&ctl->tree_lock); 3682 mutex_unlock(&ctl->cache_writeout_mutex); 3683 goto out; 3684 } 3685 3686 bytes = min(bytes, end - start); 3687 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3688 spin_unlock(&ctl->tree_lock); 3689 mutex_unlock(&ctl->cache_writeout_mutex); 3690 goto next; 3691 } 3692 3693 /* 3694 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3695 * If X < @minlen, we won't trim X when we come back around. 3696 * So trim it now. We differ here from trimming extents as we 3697 * don't keep individual state per bit. 
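* When the cap below is applied, the bits beyond max_discard_size simply remain set in the bitmap and are revisited on a later pass.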
3698 */ 3699 if (async && 3700 max_discard_size && 3701 bytes > (max_discard_size + minlen)) 3702 bytes = max_discard_size; 3703 3704 bitmap_clear_bits(ctl, entry, start, bytes); 3705 if (entry->bytes == 0) 3706 free_bitmap(ctl, entry); 3707 3708 spin_unlock(&ctl->tree_lock); 3709 trim_entry.start = start; 3710 trim_entry.bytes = bytes; 3711 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3712 mutex_unlock(&ctl->cache_writeout_mutex); 3713 3714 ret = do_trimming(block_group, total_trimmed, start, bytes, 3715 start, bytes, 0, &trim_entry); 3716 if (ret) { 3717 reset_trimming_bitmap(ctl, offset); 3718 block_group->discard_cursor = 3719 btrfs_block_group_end(block_group); 3720 break; 3721 } 3722 next: 3723 if (next_bitmap) { 3724 offset += BITS_PER_BITMAP * ctl->unit; 3725 start = offset; 3726 } else { 3727 start += bytes; 3728 } 3729 block_group->discard_cursor = start; 3730 3731 if (fatal_signal_pending(current)) { 3732 if (start != offset) 3733 reset_trimming_bitmap(ctl, offset); 3734 ret = -ERESTARTSYS; 3735 break; 3736 } 3737 3738 cond_resched(); 3739 } 3740 3741 if (offset >= end) 3742 block_group->discard_cursor = end; 3743 3744 out: 3745 return ret; 3746 } 3747 3748 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 3749 u64 *trimmed, u64 start, u64 end, u64 minlen) 3750 { 3751 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3752 int ret; 3753 u64 rem = 0; 3754 3755 *trimmed = 0; 3756 3757 spin_lock(&block_group->lock); 3758 if (block_group->removed) { 3759 spin_unlock(&block_group->lock); 3760 return 0; 3761 } 3762 btrfs_freeze_block_group(block_group); 3763 spin_unlock(&block_group->lock); 3764 3765 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 3766 if (ret) 3767 goto out; 3768 3769 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 3770 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 3771 /* If we ended in the middle of a bitmap, reset the trimming flag */ 3772 if (rem) 3773 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 3774 out: 3775 btrfs_unfreeze_block_group(block_group); 3776 return ret; 3777 } 3778 3779 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 3780 u64 *trimmed, u64 start, u64 end, u64 minlen, 3781 bool async) 3782 { 3783 int ret; 3784 3785 *trimmed = 0; 3786 3787 spin_lock(&block_group->lock); 3788 if (block_group->removed) { 3789 spin_unlock(&block_group->lock); 3790 return 0; 3791 } 3792 btrfs_freeze_block_group(block_group); 3793 spin_unlock(&block_group->lock); 3794 3795 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 3796 btrfs_unfreeze_block_group(block_group); 3797 3798 return ret; 3799 } 3800 3801 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 3802 u64 *trimmed, u64 start, u64 end, u64 minlen, 3803 u64 maxlen, bool async) 3804 { 3805 int ret; 3806 3807 *trimmed = 0; 3808 3809 spin_lock(&block_group->lock); 3810 if (block_group->removed) { 3811 spin_unlock(&block_group->lock); 3812 return 0; 3813 } 3814 btrfs_freeze_block_group(block_group); 3815 spin_unlock(&block_group->lock); 3816 3817 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 3818 async); 3819 3820 btrfs_unfreeze_block_group(block_group); 3821 3822 return ret; 3823 } 3824 3825 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 3826 { 3827 return btrfs_super_cache_generation(fs_info->super_copy); 3828 } 3829 3830 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 3831 
struct btrfs_trans_handle *trans) 3832 { 3833 struct btrfs_block_group *block_group; 3834 struct rb_node *node; 3835 int ret; 3836 3837 btrfs_info(fs_info, "cleaning free space cache v1"); 3838 3839 node = rb_first(&fs_info->block_group_cache_tree); 3840 while (node) { 3841 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 3842 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 3843 if (ret) 3844 goto out; 3845 node = rb_next(node); 3846 } 3847 out: 3848 return ret; 3849 } 3850 3851 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 3852 { 3853 struct btrfs_trans_handle *trans; 3854 int ret; 3855 3856 /* 3857 * update_super_roots will appropriately set or unset 3858 * super_copy->cache_generation based on SPACE_CACHE and 3859 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 3860 * transaction commit whether we are enabling space cache v1 and don't 3861 * have any other work to do, or are disabling it and removing free 3862 * space inodes. 3863 */ 3864 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3865 if (IS_ERR(trans)) 3866 return PTR_ERR(trans); 3867 3868 if (!active) { 3869 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 3870 ret = cleanup_free_space_cache_v1(fs_info, trans); 3871 if (ret) { 3872 btrfs_abort_transaction(trans, ret); 3873 btrfs_end_transaction(trans); 3874 goto out; 3875 } 3876 } 3877 3878 ret = btrfs_commit_transaction(trans); 3879 out: 3880 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 3881 3882 return ret; 3883 } 3884 3885 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3886 /* 3887 * Use this if you need to make a bitmap or extent entry specifically, it 3888 * doesn't do any of the merging that add_free_space does, this acts a lot like 3889 * how the free space cache loading stuff works, so you can get really weird 3890 * configurations. 
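* For instance, a test can place an extent entry and a bitmap entry over adjacent ranges that normal insertion would have merged into one.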
3891 */ 3892 int test_add_free_space_entry(struct btrfs_block_group *cache, 3893 u64 offset, u64 bytes, bool bitmap) 3894 { 3895 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3896 struct btrfs_free_space *info = NULL, *bitmap_info; 3897 void *map = NULL; 3898 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 3899 u64 bytes_added; 3900 int ret; 3901 3902 again: 3903 if (!info) { 3904 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 3905 if (!info) 3906 return -ENOMEM; 3907 } 3908 3909 if (!bitmap) { 3910 spin_lock(&ctl->tree_lock); 3911 info->offset = offset; 3912 info->bytes = bytes; 3913 info->max_extent_size = 0; 3914 ret = link_free_space(ctl, info); 3915 spin_unlock(&ctl->tree_lock); 3916 if (ret) 3917 kmem_cache_free(btrfs_free_space_cachep, info); 3918 return ret; 3919 } 3920 3921 if (!map) { 3922 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 3923 if (!map) { 3924 kmem_cache_free(btrfs_free_space_cachep, info); 3925 return -ENOMEM; 3926 } 3927 } 3928 3929 spin_lock(&ctl->tree_lock); 3930 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3931 1, 0); 3932 if (!bitmap_info) { 3933 info->bitmap = map; 3934 map = NULL; 3935 add_new_bitmap(ctl, info, offset); 3936 bitmap_info = info; 3937 info = NULL; 3938 } 3939 3940 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 3941 trim_state); 3942 3943 bytes -= bytes_added; 3944 offset += bytes_added; 3945 spin_unlock(&ctl->tree_lock); 3946 3947 if (bytes) 3948 goto again; 3949 3950 if (info) 3951 kmem_cache_free(btrfs_free_space_cachep, info); 3952 if (map) 3953 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 3954 return 0; 3955 } 3956 3957 /* 3958 * Checks to see if the given range is in the free space cache. This is really 3959 * just used to check the absence of space, so if there is free space in the 3960 * range at all we will return 1. 
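* A return value of 0 therefore means that no part of [offset, offset + bytes) is tracked as free.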
3961 */ 3962 int test_check_exists(struct btrfs_block_group *cache, 3963 u64 offset, u64 bytes) 3964 { 3965 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3966 struct btrfs_free_space *info; 3967 int ret = 0; 3968 3969 spin_lock(&ctl->tree_lock); 3970 info = tree_search_offset(ctl, offset, 0, 0); 3971 if (!info) { 3972 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3973 1, 0); 3974 if (!info) 3975 goto out; 3976 } 3977 3978 have_info: 3979 if (info->bitmap) { 3980 u64 bit_off, bit_bytes; 3981 struct rb_node *n; 3982 struct btrfs_free_space *tmp; 3983 3984 bit_off = offset; 3985 bit_bytes = ctl->unit; 3986 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 3987 if (!ret) { 3988 if (bit_off == offset) { 3989 ret = 1; 3990 goto out; 3991 } else if (bit_off > offset && 3992 offset + bytes > bit_off) { 3993 ret = 1; 3994 goto out; 3995 } 3996 } 3997 3998 n = rb_prev(&info->offset_index); 3999 while (n) { 4000 tmp = rb_entry(n, struct btrfs_free_space, 4001 offset_index); 4002 if (tmp->offset + tmp->bytes < offset) 4003 break; 4004 if (offset + bytes < tmp->offset) { 4005 n = rb_prev(&tmp->offset_index); 4006 continue; 4007 } 4008 info = tmp; 4009 goto have_info; 4010 } 4011 4012 n = rb_next(&info->offset_index); 4013 while (n) { 4014 tmp = rb_entry(n, struct btrfs_free_space, 4015 offset_index); 4016 if (offset + bytes < tmp->offset) 4017 break; 4018 if (tmp->offset + tmp->bytes < offset) { 4019 n = rb_next(&tmp->offset_index); 4020 continue; 4021 } 4022 info = tmp; 4023 goto have_info; 4024 } 4025 4026 ret = 0; 4027 goto out; 4028 } 4029 4030 if (info->offset == offset) { 4031 ret = 1; 4032 goto out; 4033 } 4034 4035 if (offset > info->offset && offset < info->offset + info->bytes) 4036 ret = 1; 4037 out: 4038 spin_unlock(&ctl->tree_lock); 4039 return ret; 4040 } 4041 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4042