1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include "ctree.h" 15 #include "fs.h" 16 #include "messages.h" 17 #include "misc.h" 18 #include "free-space-cache.h" 19 #include "transaction.h" 20 #include "disk-io.h" 21 #include "extent_io.h" 22 #include "volumes.h" 23 #include "space-info.h" 24 #include "delalloc-space.h" 25 #include "block-group.h" 26 #include "discard.h" 27 #include "subpage.h" 28 #include "inode-item.h" 29 #include "accessors.h" 30 #include "file-item.h" 31 #include "file.h" 32 #include "super.h" 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 36 #define FORCE_EXTENT_THRESHOLD SZ_1M 37 38 static struct kmem_cache *btrfs_free_space_cachep; 39 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 40 41 struct btrfs_trim_range { 42 u64 start; 43 u64 bytes; 44 struct list_head list; 45 }; 46 47 static int link_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info); 49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *info, bool update_stat); 51 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 52 struct btrfs_free_space *bitmap_info, u64 *offset, 53 u64 *bytes, bool for_alloc); 54 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *bitmap_info); 56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 57 struct btrfs_free_space *info, u64 offset, 58 u64 bytes, bool update_stats); 59 60 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 61 { 62 struct btrfs_free_space *info; 63 struct rb_node *node; 64 65 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 66 info = rb_entry(node, struct btrfs_free_space, offset_index); 67 if (!info->bitmap) { 68 unlink_free_space(ctl, info, true); 69 kmem_cache_free(btrfs_free_space_cachep, info); 70 } else { 71 free_bitmap(ctl, info); 72 } 73 74 cond_resched_lock(&ctl->tree_lock); 75 } 76 } 77 78 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 79 struct btrfs_path *path, 80 u64 offset) 81 { 82 struct btrfs_fs_info *fs_info = root->fs_info; 83 struct btrfs_key key; 84 struct btrfs_key location; 85 struct btrfs_disk_key disk_key; 86 struct btrfs_free_space_header *header; 87 struct extent_buffer *leaf; 88 struct inode *inode = NULL; 89 unsigned nofs_flag; 90 int ret; 91 92 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 93 key.offset = offset; 94 key.type = 0; 95 96 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 97 if (ret < 0) 98 return ERR_PTR(ret); 99 if (ret > 0) { 100 btrfs_release_path(path); 101 return ERR_PTR(-ENOENT); 102 } 103 104 leaf = path->nodes[0]; 105 header = btrfs_item_ptr(leaf, path->slots[0], 106 struct btrfs_free_space_header); 107 btrfs_free_space_key(leaf, header, &disk_key); 108 btrfs_disk_key_to_cpu(&location, &disk_key); 109 btrfs_release_path(path); 110 111 /* 112 * We are often under a trans handle at this point, so we need to make 113 * sure NOFS is set to keep us from deadlocking. 
114 */ 115 nofs_flag = memalloc_nofs_save(); 116 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 117 btrfs_release_path(path); 118 memalloc_nofs_restore(nofs_flag); 119 if (IS_ERR(inode)) 120 return inode; 121 122 mapping_set_gfp_mask(inode->i_mapping, 123 mapping_gfp_constraint(inode->i_mapping, 124 ~(__GFP_FS | __GFP_HIGHMEM))); 125 126 return inode; 127 } 128 129 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 130 struct btrfs_path *path) 131 { 132 struct btrfs_fs_info *fs_info = block_group->fs_info; 133 struct inode *inode = NULL; 134 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 135 136 spin_lock(&block_group->lock); 137 if (block_group->inode) 138 inode = igrab(block_group->inode); 139 spin_unlock(&block_group->lock); 140 if (inode) 141 return inode; 142 143 inode = __lookup_free_space_inode(fs_info->tree_root, path, 144 block_group->start); 145 if (IS_ERR(inode)) 146 return inode; 147 148 spin_lock(&block_group->lock); 149 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 150 btrfs_info(fs_info, "Old style space inode found, converting."); 151 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 152 BTRFS_INODE_NODATACOW; 153 block_group->disk_cache_state = BTRFS_DC_CLEAR; 154 } 155 156 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 157 block_group->inode = igrab(inode); 158 spin_unlock(&block_group->lock); 159 160 return inode; 161 } 162 163 static int __create_free_space_inode(struct btrfs_root *root, 164 struct btrfs_trans_handle *trans, 165 struct btrfs_path *path, 166 u64 ino, u64 offset) 167 { 168 struct btrfs_key key; 169 struct btrfs_disk_key disk_key; 170 struct btrfs_free_space_header *header; 171 struct btrfs_inode_item *inode_item; 172 struct extent_buffer *leaf; 173 /* We inline CRCs for the free disk space cache */ 174 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 175 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 176 int ret; 177 178 ret = btrfs_insert_empty_inode(trans, root, path, ino); 179 if (ret) 180 return ret; 181 182 leaf = path->nodes[0]; 183 inode_item = btrfs_item_ptr(leaf, path->slots[0], 184 struct btrfs_inode_item); 185 btrfs_item_key(leaf, &disk_key, path->slots[0]); 186 memzero_extent_buffer(leaf, (unsigned long)inode_item, 187 sizeof(*inode_item)); 188 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 189 btrfs_set_inode_size(leaf, inode_item, 0); 190 btrfs_set_inode_nbytes(leaf, inode_item, 0); 191 btrfs_set_inode_uid(leaf, inode_item, 0); 192 btrfs_set_inode_gid(leaf, inode_item, 0); 193 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 194 btrfs_set_inode_flags(leaf, inode_item, flags); 195 btrfs_set_inode_nlink(leaf, inode_item, 1); 196 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 197 btrfs_set_inode_block_group(leaf, inode_item, offset); 198 btrfs_mark_buffer_dirty(trans, leaf); 199 btrfs_release_path(path); 200 201 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 202 key.offset = offset; 203 key.type = 0; 204 ret = btrfs_insert_empty_item(trans, root, path, &key, 205 sizeof(struct btrfs_free_space_header)); 206 if (ret < 0) { 207 btrfs_release_path(path); 208 return ret; 209 } 210 211 leaf = path->nodes[0]; 212 header = btrfs_item_ptr(leaf, path->slots[0], 213 struct btrfs_free_space_header); 214 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 215 btrfs_set_free_space_key(leaf, header, &disk_key); 216 btrfs_mark_buffer_dirty(trans, leaf); 217 btrfs_release_path(path); 218 219 return 0; 220 } 
221 222 int create_free_space_inode(struct btrfs_trans_handle *trans, 223 struct btrfs_block_group *block_group, 224 struct btrfs_path *path) 225 { 226 int ret; 227 u64 ino; 228 229 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 230 if (ret < 0) 231 return ret; 232 233 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 234 ino, block_group->start); 235 } 236 237 /* 238 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 239 * handles lookup, otherwise it takes ownership and iputs the inode. 240 * Don't reuse an inode pointer after passing it into this function. 241 */ 242 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 243 struct inode *inode, 244 struct btrfs_block_group *block_group) 245 { 246 struct btrfs_path *path; 247 struct btrfs_key key; 248 int ret = 0; 249 250 path = btrfs_alloc_path(); 251 if (!path) 252 return -ENOMEM; 253 254 if (!inode) 255 inode = lookup_free_space_inode(block_group, path); 256 if (IS_ERR(inode)) { 257 if (PTR_ERR(inode) != -ENOENT) 258 ret = PTR_ERR(inode); 259 goto out; 260 } 261 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 262 if (ret) { 263 btrfs_add_delayed_iput(BTRFS_I(inode)); 264 goto out; 265 } 266 clear_nlink(inode); 267 /* One for the block groups ref */ 268 spin_lock(&block_group->lock); 269 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 270 block_group->inode = NULL; 271 spin_unlock(&block_group->lock); 272 iput(inode); 273 } else { 274 spin_unlock(&block_group->lock); 275 } 276 /* One for the lookup ref */ 277 btrfs_add_delayed_iput(BTRFS_I(inode)); 278 279 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 280 key.type = 0; 281 key.offset = block_group->start; 282 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 283 -1, 1); 284 if (ret) { 285 if (ret > 0) 286 ret = 0; 287 goto out; 288 } 289 ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); 290 out: 291 btrfs_free_path(path); 292 return ret; 293 } 294 295 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 296 struct btrfs_block_group *block_group, 297 struct inode *vfs_inode) 298 { 299 struct btrfs_truncate_control control = { 300 .inode = BTRFS_I(vfs_inode), 301 .new_size = 0, 302 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 303 .min_type = BTRFS_EXTENT_DATA_KEY, 304 .clear_extent_range = true, 305 }; 306 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 307 struct btrfs_root *root = inode->root; 308 struct extent_state *cached_state = NULL; 309 int ret = 0; 310 bool locked = false; 311 312 if (block_group) { 313 struct btrfs_path *path = btrfs_alloc_path(); 314 315 if (!path) { 316 ret = -ENOMEM; 317 goto fail; 318 } 319 locked = true; 320 mutex_lock(&trans->transaction->cache_write_mutex); 321 if (!list_empty(&block_group->io_list)) { 322 list_del_init(&block_group->io_list); 323 324 btrfs_wait_cache_io(trans, block_group, path); 325 btrfs_put_block_group(block_group); 326 } 327 328 /* 329 * now that we've truncated the cache away, its no longer 330 * setup or written 331 */ 332 spin_lock(&block_group->lock); 333 block_group->disk_cache_state = BTRFS_DC_CLEAR; 334 spin_unlock(&block_group->lock); 335 btrfs_free_path(path); 336 } 337 338 btrfs_i_size_write(inode, 0); 339 truncate_pagecache(vfs_inode, 0); 340 341 lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 342 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 343 344 /* 345 * We skip the throttling logic for free space cache inodes, so we don't 346 * need to check for -EAGAIN. 
347 */ 348 ret = btrfs_truncate_inode_items(trans, root, &control); 349 350 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 351 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 352 353 unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 354 if (ret) 355 goto fail; 356 357 ret = btrfs_update_inode(trans, root, inode); 358 359 fail: 360 if (locked) 361 mutex_unlock(&trans->transaction->cache_write_mutex); 362 if (ret) 363 btrfs_abort_transaction(trans, ret); 364 365 return ret; 366 } 367 368 static void readahead_cache(struct inode *inode) 369 { 370 struct file_ra_state ra; 371 unsigned long last_index; 372 373 file_ra_state_init(&ra, inode->i_mapping); 374 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 375 376 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 377 } 378 379 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 380 int write) 381 { 382 int num_pages; 383 384 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 385 386 /* Make sure we can fit our crcs and generation into the first page */ 387 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 388 return -ENOSPC; 389 390 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 391 392 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 393 if (!io_ctl->pages) 394 return -ENOMEM; 395 396 io_ctl->num_pages = num_pages; 397 io_ctl->fs_info = btrfs_sb(inode->i_sb); 398 io_ctl->inode = inode; 399 400 return 0; 401 } 402 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 403 404 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 405 { 406 kfree(io_ctl->pages); 407 io_ctl->pages = NULL; 408 } 409 410 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 411 { 412 if (io_ctl->cur) { 413 io_ctl->cur = NULL; 414 io_ctl->orig = NULL; 415 } 416 } 417 418 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 419 { 420 ASSERT(io_ctl->index < io_ctl->num_pages); 421 io_ctl->page = io_ctl->pages[io_ctl->index++]; 422 io_ctl->cur = page_address(io_ctl->page); 423 io_ctl->orig = io_ctl->cur; 424 io_ctl->size = PAGE_SIZE; 425 if (clear) 426 clear_page(io_ctl->cur); 427 } 428 429 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 430 { 431 int i; 432 433 io_ctl_unmap_page(io_ctl); 434 435 for (i = 0; i < io_ctl->num_pages; i++) { 436 if (io_ctl->pages[i]) { 437 btrfs_page_clear_checked(io_ctl->fs_info, 438 io_ctl->pages[i], 439 page_offset(io_ctl->pages[i]), 440 PAGE_SIZE); 441 unlock_page(io_ctl->pages[i]); 442 put_page(io_ctl->pages[i]); 443 } 444 } 445 } 446 447 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 448 { 449 struct page *page; 450 struct inode *inode = io_ctl->inode; 451 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 452 int i; 453 454 for (i = 0; i < io_ctl->num_pages; i++) { 455 int ret; 456 457 page = find_or_create_page(inode->i_mapping, i, mask); 458 if (!page) { 459 io_ctl_drop_pages(io_ctl); 460 return -ENOMEM; 461 } 462 463 ret = set_page_extent_mapped(page); 464 if (ret < 0) { 465 unlock_page(page); 466 put_page(page); 467 io_ctl_drop_pages(io_ctl); 468 return ret; 469 } 470 471 io_ctl->pages[i] = page; 472 if (uptodate && !PageUptodate(page)) { 473 btrfs_read_folio(NULL, page_folio(page)); 474 lock_page(page); 475 if (page->mapping != inode->i_mapping) { 476 btrfs_err(BTRFS_I(inode)->root->fs_info, 477 "free space cache page truncated"); 478 io_ctl_drop_pages(io_ctl); 479 return -EIO; 480 } 481 if (!PageUptodate(page)) { 482 btrfs_err(BTRFS_I(inode)->root->fs_info, 483 
"error reading free space cache"); 484 io_ctl_drop_pages(io_ctl); 485 return -EIO; 486 } 487 } 488 } 489 490 for (i = 0; i < io_ctl->num_pages; i++) 491 clear_page_dirty_for_io(io_ctl->pages[i]); 492 493 return 0; 494 } 495 496 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 497 { 498 io_ctl_map_page(io_ctl, 1); 499 500 /* 501 * Skip the csum areas. If we don't check crcs then we just have a 502 * 64bit chunk at the front of the first page. 503 */ 504 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 505 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 506 507 put_unaligned_le64(generation, io_ctl->cur); 508 io_ctl->cur += sizeof(u64); 509 } 510 511 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 512 { 513 u64 cache_gen; 514 515 /* 516 * Skip the crc area. If we don't check crcs then we just have a 64bit 517 * chunk at the front of the first page. 518 */ 519 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 520 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 521 522 cache_gen = get_unaligned_le64(io_ctl->cur); 523 if (cache_gen != generation) { 524 btrfs_err_rl(io_ctl->fs_info, 525 "space cache generation (%llu) does not match inode (%llu)", 526 cache_gen, generation); 527 io_ctl_unmap_page(io_ctl); 528 return -EIO; 529 } 530 io_ctl->cur += sizeof(u64); 531 return 0; 532 } 533 534 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 535 { 536 u32 *tmp; 537 u32 crc = ~(u32)0; 538 unsigned offset = 0; 539 540 if (index == 0) 541 offset = sizeof(u32) * io_ctl->num_pages; 542 543 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 544 btrfs_crc32c_final(crc, (u8 *)&crc); 545 io_ctl_unmap_page(io_ctl); 546 tmp = page_address(io_ctl->pages[0]); 547 tmp += index; 548 *tmp = crc; 549 } 550 551 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 552 { 553 u32 *tmp, val; 554 u32 crc = ~(u32)0; 555 unsigned offset = 0; 556 557 if (index == 0) 558 offset = sizeof(u32) * io_ctl->num_pages; 559 560 tmp = page_address(io_ctl->pages[0]); 561 tmp += index; 562 val = *tmp; 563 564 io_ctl_map_page(io_ctl, 0); 565 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 566 btrfs_crc32c_final(crc, (u8 *)&crc); 567 if (val != crc) { 568 btrfs_err_rl(io_ctl->fs_info, 569 "csum mismatch on free space cache"); 570 io_ctl_unmap_page(io_ctl); 571 return -EIO; 572 } 573 574 return 0; 575 } 576 577 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 578 void *bitmap) 579 { 580 struct btrfs_free_space_entry *entry; 581 582 if (!io_ctl->cur) 583 return -ENOSPC; 584 585 entry = io_ctl->cur; 586 put_unaligned_le64(offset, &entry->offset); 587 put_unaligned_le64(bytes, &entry->bytes); 588 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 589 BTRFS_FREE_SPACE_EXTENT; 590 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 591 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 592 593 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 594 return 0; 595 596 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 597 598 /* No more pages to map */ 599 if (io_ctl->index >= io_ctl->num_pages) 600 return 0; 601 602 /* map the next page */ 603 io_ctl_map_page(io_ctl, 1); 604 return 0; 605 } 606 607 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 608 { 609 if (!io_ctl->cur) 610 return -ENOSPC; 611 612 /* 613 * If we aren't at the start of the current page, unmap this one and 614 * map the next one if there is any left. 
615 */ 616 if (io_ctl->cur != io_ctl->orig) { 617 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 618 if (io_ctl->index >= io_ctl->num_pages) 619 return -ENOSPC; 620 io_ctl_map_page(io_ctl, 0); 621 } 622 623 copy_page(io_ctl->cur, bitmap); 624 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 625 if (io_ctl->index < io_ctl->num_pages) 626 io_ctl_map_page(io_ctl, 0); 627 return 0; 628 } 629 630 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 631 { 632 /* 633 * If we're not on the boundary we know we've modified the page and we 634 * need to crc the page. 635 */ 636 if (io_ctl->cur != io_ctl->orig) 637 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 638 else 639 io_ctl_unmap_page(io_ctl); 640 641 while (io_ctl->index < io_ctl->num_pages) { 642 io_ctl_map_page(io_ctl, 1); 643 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 644 } 645 } 646 647 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 648 struct btrfs_free_space *entry, u8 *type) 649 { 650 struct btrfs_free_space_entry *e; 651 int ret; 652 653 if (!io_ctl->cur) { 654 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 655 if (ret) 656 return ret; 657 } 658 659 e = io_ctl->cur; 660 entry->offset = get_unaligned_le64(&e->offset); 661 entry->bytes = get_unaligned_le64(&e->bytes); 662 *type = e->type; 663 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 664 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 665 666 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 667 return 0; 668 669 io_ctl_unmap_page(io_ctl); 670 671 return 0; 672 } 673 674 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 675 struct btrfs_free_space *entry) 676 { 677 int ret; 678 679 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 680 if (ret) 681 return ret; 682 683 copy_page(entry->bitmap, io_ctl->cur); 684 io_ctl_unmap_page(io_ctl); 685 686 return 0; 687 } 688 689 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 690 { 691 struct btrfs_block_group *block_group = ctl->block_group; 692 u64 max_bytes; 693 u64 bitmap_bytes; 694 u64 extent_bytes; 695 u64 size = block_group->length; 696 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 697 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 698 699 max_bitmaps = max_t(u64, max_bitmaps, 1); 700 701 if (ctl->total_bitmaps > max_bitmaps) 702 btrfs_err(block_group->fs_info, 703 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 704 block_group->start, block_group->length, 705 ctl->total_bitmaps, ctl->unit, max_bitmaps, 706 bytes_per_bg); 707 ASSERT(ctl->total_bitmaps <= max_bitmaps); 708 709 /* 710 * We are trying to keep the total amount of memory used per 1GiB of 711 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 712 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 713 * bitmaps, we may end up using more memory than this. 714 */ 715 if (size < SZ_1G) 716 max_bytes = MAX_CACHE_BYTES_PER_GIG; 717 else 718 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 719 720 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 721 722 /* 723 * we want the extent entry threshold to always be at most 1/2 the max 724 * bytes we can have, or whatever is less than that. 
725 */ 726 extent_bytes = max_bytes - bitmap_bytes; 727 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 728 729 ctl->extents_thresh = 730 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 731 } 732 733 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 734 struct btrfs_free_space_ctl *ctl, 735 struct btrfs_path *path, u64 offset) 736 { 737 struct btrfs_fs_info *fs_info = root->fs_info; 738 struct btrfs_free_space_header *header; 739 struct extent_buffer *leaf; 740 struct btrfs_io_ctl io_ctl; 741 struct btrfs_key key; 742 struct btrfs_free_space *e, *n; 743 LIST_HEAD(bitmaps); 744 u64 num_entries; 745 u64 num_bitmaps; 746 u64 generation; 747 u8 type; 748 int ret = 0; 749 750 /* Nothing in the space cache, goodbye */ 751 if (!i_size_read(inode)) 752 return 0; 753 754 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 755 key.offset = offset; 756 key.type = 0; 757 758 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 759 if (ret < 0) 760 return 0; 761 else if (ret > 0) { 762 btrfs_release_path(path); 763 return 0; 764 } 765 766 ret = -1; 767 768 leaf = path->nodes[0]; 769 header = btrfs_item_ptr(leaf, path->slots[0], 770 struct btrfs_free_space_header); 771 num_entries = btrfs_free_space_entries(leaf, header); 772 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 773 generation = btrfs_free_space_generation(leaf, header); 774 btrfs_release_path(path); 775 776 if (!BTRFS_I(inode)->generation) { 777 btrfs_info(fs_info, 778 "the free space cache file (%llu) is invalid, skip it", 779 offset); 780 return 0; 781 } 782 783 if (BTRFS_I(inode)->generation != generation) { 784 btrfs_err(fs_info, 785 "free space inode generation (%llu) did not match free space cache generation (%llu)", 786 BTRFS_I(inode)->generation, generation); 787 return 0; 788 } 789 790 if (!num_entries) 791 return 0; 792 793 ret = io_ctl_init(&io_ctl, inode, 0); 794 if (ret) 795 return ret; 796 797 readahead_cache(inode); 798 799 ret = io_ctl_prepare_pages(&io_ctl, true); 800 if (ret) 801 goto out; 802 803 ret = io_ctl_check_crc(&io_ctl, 0); 804 if (ret) 805 goto free_cache; 806 807 ret = io_ctl_check_generation(&io_ctl, generation); 808 if (ret) 809 goto free_cache; 810 811 while (num_entries) { 812 e = kmem_cache_zalloc(btrfs_free_space_cachep, 813 GFP_NOFS); 814 if (!e) { 815 ret = -ENOMEM; 816 goto free_cache; 817 } 818 819 ret = io_ctl_read_entry(&io_ctl, e, &type); 820 if (ret) { 821 kmem_cache_free(btrfs_free_space_cachep, e); 822 goto free_cache; 823 } 824 825 if (!e->bytes) { 826 ret = -1; 827 kmem_cache_free(btrfs_free_space_cachep, e); 828 goto free_cache; 829 } 830 831 if (type == BTRFS_FREE_SPACE_EXTENT) { 832 spin_lock(&ctl->tree_lock); 833 ret = link_free_space(ctl, e); 834 spin_unlock(&ctl->tree_lock); 835 if (ret) { 836 btrfs_err(fs_info, 837 "Duplicate entries in free space cache, dumping"); 838 kmem_cache_free(btrfs_free_space_cachep, e); 839 goto free_cache; 840 } 841 } else { 842 ASSERT(num_bitmaps); 843 num_bitmaps--; 844 e->bitmap = kmem_cache_zalloc( 845 btrfs_free_space_bitmap_cachep, GFP_NOFS); 846 if (!e->bitmap) { 847 ret = -ENOMEM; 848 kmem_cache_free( 849 btrfs_free_space_cachep, e); 850 goto free_cache; 851 } 852 spin_lock(&ctl->tree_lock); 853 ret = link_free_space(ctl, e); 854 if (ret) { 855 spin_unlock(&ctl->tree_lock); 856 btrfs_err(fs_info, 857 "Duplicate entries in free space cache, dumping"); 858 kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); 859 kmem_cache_free(btrfs_free_space_cachep, e); 860 goto free_cache; 861 } 862 
ctl->total_bitmaps++; 863 recalculate_thresholds(ctl); 864 spin_unlock(&ctl->tree_lock); 865 list_add_tail(&e->list, &bitmaps); 866 } 867 868 num_entries--; 869 } 870 871 io_ctl_unmap_page(&io_ctl); 872 873 /* 874 * We add the bitmaps at the end of the entries in order that 875 * the bitmap entries are added to the cache. 876 */ 877 list_for_each_entry_safe(e, n, &bitmaps, list) { 878 list_del_init(&e->list); 879 ret = io_ctl_read_bitmap(&io_ctl, e); 880 if (ret) 881 goto free_cache; 882 } 883 884 io_ctl_drop_pages(&io_ctl); 885 ret = 1; 886 out: 887 io_ctl_free(&io_ctl); 888 return ret; 889 free_cache: 890 io_ctl_drop_pages(&io_ctl); 891 892 spin_lock(&ctl->tree_lock); 893 __btrfs_remove_free_space_cache(ctl); 894 spin_unlock(&ctl->tree_lock); 895 goto out; 896 } 897 898 static int copy_free_space_cache(struct btrfs_block_group *block_group, 899 struct btrfs_free_space_ctl *ctl) 900 { 901 struct btrfs_free_space *info; 902 struct rb_node *n; 903 int ret = 0; 904 905 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 906 info = rb_entry(n, struct btrfs_free_space, offset_index); 907 if (!info->bitmap) { 908 const u64 offset = info->offset; 909 const u64 bytes = info->bytes; 910 911 unlink_free_space(ctl, info, true); 912 spin_unlock(&ctl->tree_lock); 913 kmem_cache_free(btrfs_free_space_cachep, info); 914 ret = btrfs_add_free_space(block_group, offset, bytes); 915 spin_lock(&ctl->tree_lock); 916 } else { 917 u64 offset = info->offset; 918 u64 bytes = ctl->unit; 919 920 ret = search_bitmap(ctl, info, &offset, &bytes, false); 921 if (ret == 0) { 922 bitmap_clear_bits(ctl, info, offset, bytes, true); 923 spin_unlock(&ctl->tree_lock); 924 ret = btrfs_add_free_space(block_group, offset, 925 bytes); 926 spin_lock(&ctl->tree_lock); 927 } else { 928 free_bitmap(ctl, info); 929 ret = 0; 930 } 931 } 932 cond_resched_lock(&ctl->tree_lock); 933 } 934 return ret; 935 } 936 937 static struct lock_class_key btrfs_free_space_inode_key; 938 939 int load_free_space_cache(struct btrfs_block_group *block_group) 940 { 941 struct btrfs_fs_info *fs_info = block_group->fs_info; 942 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 943 struct btrfs_free_space_ctl tmp_ctl = {}; 944 struct inode *inode; 945 struct btrfs_path *path; 946 int ret = 0; 947 bool matched; 948 u64 used = block_group->used; 949 950 /* 951 * Because we could potentially discard our loaded free space, we want 952 * to load everything into a temporary structure first, and then if it's 953 * valid copy it all into the actual free space ctl. 954 */ 955 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 956 957 /* 958 * If this block group has been marked to be cleared for one reason or 959 * another then we can't trust the on disk cache, so just return. 960 */ 961 spin_lock(&block_group->lock); 962 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 963 spin_unlock(&block_group->lock); 964 return 0; 965 } 966 spin_unlock(&block_group->lock); 967 968 path = btrfs_alloc_path(); 969 if (!path) 970 return 0; 971 path->search_commit_root = 1; 972 path->skip_locking = 1; 973 974 /* 975 * We must pass a path with search_commit_root set to btrfs_iget in 976 * order to avoid a deadlock when allocating extents for the tree root. 977 * 978 * When we are COWing an extent buffer from the tree root, when looking 979 * for a free extent, at extent-tree.c:find_free_extent(), we can find 980 * block group without its free space cache loaded. 
When we find one 981 * we must load its space cache which requires reading its free space 982 * cache's inode item from the root tree. If this inode item is located 983 * in the same leaf that we started COWing before, then we end up in 984 * deadlock on the extent buffer (trying to read lock it when we 985 * previously write locked it). 986 * 987 * It's safe to read the inode item using the commit root because 988 * block groups, once loaded, stay in memory forever (until they are 989 * removed) as well as their space caches once loaded. New block groups 990 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 991 * we will never try to read their inode item while the fs is mounted. 992 */ 993 inode = lookup_free_space_inode(block_group, path); 994 if (IS_ERR(inode)) { 995 btrfs_free_path(path); 996 return 0; 997 } 998 999 /* We may have converted the inode and made the cache invalid. */ 1000 spin_lock(&block_group->lock); 1001 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1002 spin_unlock(&block_group->lock); 1003 btrfs_free_path(path); 1004 goto out; 1005 } 1006 spin_unlock(&block_group->lock); 1007 1008 /* 1009 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1010 * free space inodes to prevent false positives related to locks for normal 1011 * inodes. 1012 */ 1013 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1014 &btrfs_free_space_inode_key); 1015 1016 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1017 path, block_group->start); 1018 btrfs_free_path(path); 1019 if (ret <= 0) 1020 goto out; 1021 1022 matched = (tmp_ctl.free_space == (block_group->length - used - 1023 block_group->bytes_super)); 1024 1025 if (matched) { 1026 spin_lock(&tmp_ctl.tree_lock); 1027 ret = copy_free_space_cache(block_group, &tmp_ctl); 1028 spin_unlock(&tmp_ctl.tree_lock); 1029 /* 1030 * ret == 1 means we successfully loaded the free space cache, 1031 * so we need to re-set it here. 1032 */ 1033 if (ret == 0) 1034 ret = 1; 1035 } else { 1036 /* 1037 * We need to call the _locked variant so we don't try to update 1038 * the discard counters. 
1039 */ 1040 spin_lock(&tmp_ctl.tree_lock); 1041 __btrfs_remove_free_space_cache(&tmp_ctl); 1042 spin_unlock(&tmp_ctl.tree_lock); 1043 btrfs_warn(fs_info, 1044 "block group %llu has wrong amount of free space", 1045 block_group->start); 1046 ret = -1; 1047 } 1048 out: 1049 if (ret < 0) { 1050 /* This cache is bogus, make sure it gets cleared */ 1051 spin_lock(&block_group->lock); 1052 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1053 spin_unlock(&block_group->lock); 1054 ret = 0; 1055 1056 btrfs_warn(fs_info, 1057 "failed to load free space cache for block group %llu, rebuilding it now", 1058 block_group->start); 1059 } 1060 1061 spin_lock(&ctl->tree_lock); 1062 btrfs_discard_update_discardable(block_group); 1063 spin_unlock(&ctl->tree_lock); 1064 iput(inode); 1065 return ret; 1066 } 1067 1068 static noinline_for_stack 1069 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1070 struct btrfs_free_space_ctl *ctl, 1071 struct btrfs_block_group *block_group, 1072 int *entries, int *bitmaps, 1073 struct list_head *bitmap_list) 1074 { 1075 int ret; 1076 struct btrfs_free_cluster *cluster = NULL; 1077 struct btrfs_free_cluster *cluster_locked = NULL; 1078 struct rb_node *node = rb_first(&ctl->free_space_offset); 1079 struct btrfs_trim_range *trim_entry; 1080 1081 /* Get the cluster for this block_group if it exists */ 1082 if (block_group && !list_empty(&block_group->cluster_list)) { 1083 cluster = list_entry(block_group->cluster_list.next, 1084 struct btrfs_free_cluster, 1085 block_group_list); 1086 } 1087 1088 if (!node && cluster) { 1089 cluster_locked = cluster; 1090 spin_lock(&cluster_locked->lock); 1091 node = rb_first(&cluster->root); 1092 cluster = NULL; 1093 } 1094 1095 /* Write out the extent entries */ 1096 while (node) { 1097 struct btrfs_free_space *e; 1098 1099 e = rb_entry(node, struct btrfs_free_space, offset_index); 1100 *entries += 1; 1101 1102 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1103 e->bitmap); 1104 if (ret) 1105 goto fail; 1106 1107 if (e->bitmap) { 1108 list_add_tail(&e->list, bitmap_list); 1109 *bitmaps += 1; 1110 } 1111 node = rb_next(node); 1112 if (!node && cluster) { 1113 node = rb_first(&cluster->root); 1114 cluster_locked = cluster; 1115 spin_lock(&cluster_locked->lock); 1116 cluster = NULL; 1117 } 1118 } 1119 if (cluster_locked) { 1120 spin_unlock(&cluster_locked->lock); 1121 cluster_locked = NULL; 1122 } 1123 1124 /* 1125 * Make sure we don't miss any range that was removed from our rbtree 1126 * because trimming is running. Otherwise after a umount+mount (or crash 1127 * after committing the transaction) we would leak free space and get 1128 * an inconsistent free space cache report from fsck. 
1129 */ 1130 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1131 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1132 trim_entry->bytes, NULL); 1133 if (ret) 1134 goto fail; 1135 *entries += 1; 1136 } 1137 1138 return 0; 1139 fail: 1140 if (cluster_locked) 1141 spin_unlock(&cluster_locked->lock); 1142 return -ENOSPC; 1143 } 1144 1145 static noinline_for_stack int 1146 update_cache_item(struct btrfs_trans_handle *trans, 1147 struct btrfs_root *root, 1148 struct inode *inode, 1149 struct btrfs_path *path, u64 offset, 1150 int entries, int bitmaps) 1151 { 1152 struct btrfs_key key; 1153 struct btrfs_free_space_header *header; 1154 struct extent_buffer *leaf; 1155 int ret; 1156 1157 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1158 key.offset = offset; 1159 key.type = 0; 1160 1161 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1162 if (ret < 0) { 1163 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1164 EXTENT_DELALLOC, NULL); 1165 goto fail; 1166 } 1167 leaf = path->nodes[0]; 1168 if (ret > 0) { 1169 struct btrfs_key found_key; 1170 ASSERT(path->slots[0]); 1171 path->slots[0]--; 1172 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1173 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1174 found_key.offset != offset) { 1175 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1176 inode->i_size - 1, EXTENT_DELALLOC, 1177 NULL); 1178 btrfs_release_path(path); 1179 goto fail; 1180 } 1181 } 1182 1183 BTRFS_I(inode)->generation = trans->transid; 1184 header = btrfs_item_ptr(leaf, path->slots[0], 1185 struct btrfs_free_space_header); 1186 btrfs_set_free_space_entries(leaf, header, entries); 1187 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1188 btrfs_set_free_space_generation(leaf, header, trans->transid); 1189 btrfs_mark_buffer_dirty(trans, leaf); 1190 btrfs_release_path(path); 1191 1192 return 0; 1193 1194 fail: 1195 return -1; 1196 } 1197 1198 static noinline_for_stack int write_pinned_extent_entries( 1199 struct btrfs_trans_handle *trans, 1200 struct btrfs_block_group *block_group, 1201 struct btrfs_io_ctl *io_ctl, 1202 int *entries) 1203 { 1204 u64 start, extent_start, extent_end, len; 1205 struct extent_io_tree *unpin = NULL; 1206 int ret; 1207 1208 if (!block_group) 1209 return 0; 1210 1211 /* 1212 * We want to add any pinned extents to our free space cache 1213 * so we don't leak the space 1214 * 1215 * We shouldn't have switched the pinned extents yet so this is the 1216 * right one 1217 */ 1218 unpin = &trans->transaction->pinned_extents; 1219 1220 start = block_group->start; 1221 1222 while (start < block_group->start + block_group->length) { 1223 if (!find_first_extent_bit(unpin, start, 1224 &extent_start, &extent_end, 1225 EXTENT_DIRTY, NULL)) 1226 return 0; 1227 1228 /* This pinned extent is out of our range */ 1229 if (extent_start >= block_group->start + block_group->length) 1230 return 0; 1231 1232 extent_start = max(extent_start, start); 1233 extent_end = min(block_group->start + block_group->length, 1234 extent_end + 1); 1235 len = extent_end - extent_start; 1236 1237 *entries += 1; 1238 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1239 if (ret) 1240 return -ENOSPC; 1241 1242 start = extent_end; 1243 } 1244 1245 return 0; 1246 } 1247 1248 static noinline_for_stack int 1249 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1250 { 1251 struct btrfs_free_space *entry, *next; 1252 int ret; 1253 1254 /* Write out the bitmaps */ 1255 list_for_each_entry_safe(entry, next, bitmap_list, 
list) { 1256 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1257 if (ret) 1258 return -ENOSPC; 1259 list_del_init(&entry->list); 1260 } 1261 1262 return 0; 1263 } 1264 1265 static int flush_dirty_cache(struct inode *inode) 1266 { 1267 int ret; 1268 1269 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1270 if (ret) 1271 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1272 EXTENT_DELALLOC, NULL); 1273 1274 return ret; 1275 } 1276 1277 static void noinline_for_stack 1278 cleanup_bitmap_list(struct list_head *bitmap_list) 1279 { 1280 struct btrfs_free_space *entry, *next; 1281 1282 list_for_each_entry_safe(entry, next, bitmap_list, list) 1283 list_del_init(&entry->list); 1284 } 1285 1286 static void noinline_for_stack 1287 cleanup_write_cache_enospc(struct inode *inode, 1288 struct btrfs_io_ctl *io_ctl, 1289 struct extent_state **cached_state) 1290 { 1291 io_ctl_drop_pages(io_ctl); 1292 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1293 cached_state); 1294 } 1295 1296 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1297 struct btrfs_trans_handle *trans, 1298 struct btrfs_block_group *block_group, 1299 struct btrfs_io_ctl *io_ctl, 1300 struct btrfs_path *path, u64 offset) 1301 { 1302 int ret; 1303 struct inode *inode = io_ctl->inode; 1304 1305 if (!inode) 1306 return 0; 1307 1308 /* Flush the dirty pages in the cache file. */ 1309 ret = flush_dirty_cache(inode); 1310 if (ret) 1311 goto out; 1312 1313 /* Update the cache item to tell everyone this cache file is valid. */ 1314 ret = update_cache_item(trans, root, inode, path, offset, 1315 io_ctl->entries, io_ctl->bitmaps); 1316 out: 1317 if (ret) { 1318 invalidate_inode_pages2(inode->i_mapping); 1319 BTRFS_I(inode)->generation = 0; 1320 if (block_group) 1321 btrfs_debug(root->fs_info, 1322 "failed to write free space cache for block group %llu error %d", 1323 block_group->start, ret); 1324 } 1325 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1326 1327 if (block_group) { 1328 /* the dirty list is protected by the dirty_bgs_lock */ 1329 spin_lock(&trans->transaction->dirty_bgs_lock); 1330 1331 /* the disk_cache_state is protected by the block group lock */ 1332 spin_lock(&block_group->lock); 1333 1334 /* 1335 * only mark this as written if we didn't get put back on 1336 * the dirty list while waiting for IO. Otherwise our 1337 * cache state won't be right, and we won't get written again 1338 */ 1339 if (!ret && list_empty(&block_group->dirty_list)) 1340 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1341 else if (ret) 1342 block_group->disk_cache_state = BTRFS_DC_ERROR; 1343 1344 spin_unlock(&block_group->lock); 1345 spin_unlock(&trans->transaction->dirty_bgs_lock); 1346 io_ctl->inode = NULL; 1347 iput(inode); 1348 } 1349 1350 return ret; 1351 1352 } 1353 1354 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1355 struct btrfs_block_group *block_group, 1356 struct btrfs_path *path) 1357 { 1358 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1359 block_group, &block_group->io_ctl, 1360 path, block_group->start); 1361 } 1362 1363 /* 1364 * Write out cached info to an inode. 
1365 * 1366 * @root: root the inode belongs to 1367 * @inode: freespace inode we are writing out 1368 * @ctl: free space cache we are going to write out 1369 * @block_group: block_group for this cache if it belongs to a block_group 1370 * @io_ctl: holds context for the io 1371 * @trans: the trans handle 1372 * 1373 * This function writes out a free space cache struct to disk for quick recovery 1374 * on mount. This will return 0 if it was successful in writing the cache out, 1375 * or an errno if it was not. 1376 */ 1377 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1378 struct btrfs_free_space_ctl *ctl, 1379 struct btrfs_block_group *block_group, 1380 struct btrfs_io_ctl *io_ctl, 1381 struct btrfs_trans_handle *trans) 1382 { 1383 struct extent_state *cached_state = NULL; 1384 LIST_HEAD(bitmap_list); 1385 int entries = 0; 1386 int bitmaps = 0; 1387 int ret; 1388 int must_iput = 0; 1389 1390 if (!i_size_read(inode)) 1391 return -EIO; 1392 1393 WARN_ON(io_ctl->pages); 1394 ret = io_ctl_init(io_ctl, inode, 1); 1395 if (ret) 1396 return ret; 1397 1398 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1399 down_write(&block_group->data_rwsem); 1400 spin_lock(&block_group->lock); 1401 if (block_group->delalloc_bytes) { 1402 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1403 spin_unlock(&block_group->lock); 1404 up_write(&block_group->data_rwsem); 1405 BTRFS_I(inode)->generation = 0; 1406 ret = 0; 1407 must_iput = 1; 1408 goto out; 1409 } 1410 spin_unlock(&block_group->lock); 1411 } 1412 1413 /* Lock all pages first so we can lock the extent safely. */ 1414 ret = io_ctl_prepare_pages(io_ctl, false); 1415 if (ret) 1416 goto out_unlock; 1417 1418 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1419 &cached_state); 1420 1421 io_ctl_set_generation(io_ctl, trans->transid); 1422 1423 mutex_lock(&ctl->cache_writeout_mutex); 1424 /* Write out the extent entries in the free space cache */ 1425 spin_lock(&ctl->tree_lock); 1426 ret = write_cache_extent_entries(io_ctl, ctl, 1427 block_group, &entries, &bitmaps, 1428 &bitmap_list); 1429 if (ret) 1430 goto out_nospc_locked; 1431 1432 /* 1433 * Some spaces that are freed in the current transaction are pinned, 1434 * they will be added into free space cache after the transaction is 1435 * committed, we shouldn't lose them. 1436 * 1437 * If this changes while we are working we'll get added back to 1438 * the dirty list and redo it. No locking needed 1439 */ 1440 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1441 if (ret) 1442 goto out_nospc_locked; 1443 1444 /* 1445 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1446 * locked while doing it because a concurrent trim can be manipulating 1447 * or freeing the bitmap. 1448 */ 1449 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1450 spin_unlock(&ctl->tree_lock); 1451 mutex_unlock(&ctl->cache_writeout_mutex); 1452 if (ret) 1453 goto out_nospc; 1454 1455 /* Zero out the rest of the pages just to make sure */ 1456 io_ctl_zero_remaining_pages(io_ctl); 1457 1458 /* Everything is written out, now we dirty the pages in the file. 
*/ 1459 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1460 io_ctl->num_pages, 0, i_size_read(inode), 1461 &cached_state, false); 1462 if (ret) 1463 goto out_nospc; 1464 1465 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1466 up_write(&block_group->data_rwsem); 1467 /* 1468 * Release the pages and unlock the extent, we will flush 1469 * them out later 1470 */ 1471 io_ctl_drop_pages(io_ctl); 1472 io_ctl_free(io_ctl); 1473 1474 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1475 &cached_state); 1476 1477 /* 1478 * at this point the pages are under IO and we're happy, 1479 * The caller is responsible for waiting on them and updating 1480 * the cache and the inode 1481 */ 1482 io_ctl->entries = entries; 1483 io_ctl->bitmaps = bitmaps; 1484 1485 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1486 if (ret) 1487 goto out; 1488 1489 return 0; 1490 1491 out_nospc_locked: 1492 cleanup_bitmap_list(&bitmap_list); 1493 spin_unlock(&ctl->tree_lock); 1494 mutex_unlock(&ctl->cache_writeout_mutex); 1495 1496 out_nospc: 1497 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1498 1499 out_unlock: 1500 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1501 up_write(&block_group->data_rwsem); 1502 1503 out: 1504 io_ctl->inode = NULL; 1505 io_ctl_free(io_ctl); 1506 if (ret) { 1507 invalidate_inode_pages2(inode->i_mapping); 1508 BTRFS_I(inode)->generation = 0; 1509 } 1510 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1511 if (must_iput) 1512 iput(inode); 1513 return ret; 1514 } 1515 1516 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1517 struct btrfs_block_group *block_group, 1518 struct btrfs_path *path) 1519 { 1520 struct btrfs_fs_info *fs_info = trans->fs_info; 1521 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1522 struct inode *inode; 1523 int ret = 0; 1524 1525 spin_lock(&block_group->lock); 1526 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1527 spin_unlock(&block_group->lock); 1528 return 0; 1529 } 1530 spin_unlock(&block_group->lock); 1531 1532 inode = lookup_free_space_inode(block_group, path); 1533 if (IS_ERR(inode)) 1534 return 0; 1535 1536 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1537 block_group, &block_group->io_ctl, trans); 1538 if (ret) { 1539 btrfs_debug(fs_info, 1540 "failed to write free space cache for block group %llu error %d", 1541 block_group->start, ret); 1542 spin_lock(&block_group->lock); 1543 block_group->disk_cache_state = BTRFS_DC_ERROR; 1544 spin_unlock(&block_group->lock); 1545 1546 block_group->io_ctl.inode = NULL; 1547 iput(inode); 1548 } 1549 1550 /* 1551 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1552 * to wait for IO and put the inode 1553 */ 1554 1555 return ret; 1556 } 1557 1558 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1559 u64 offset) 1560 { 1561 ASSERT(offset >= bitmap_start); 1562 offset -= bitmap_start; 1563 return (unsigned long)(div_u64(offset, unit)); 1564 } 1565 1566 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1567 { 1568 return (unsigned long)(div_u64(bytes, unit)); 1569 } 1570 1571 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1572 u64 offset) 1573 { 1574 u64 bitmap_start; 1575 u64 bytes_per_bitmap; 1576 1577 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1578 bitmap_start = offset - ctl->start; 1579 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1580 bitmap_start *= bytes_per_bitmap; 1581 bitmap_start += ctl->start; 1582 
1583 return bitmap_start; 1584 } 1585 1586 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, 1587 struct btrfs_free_cluster *cluster, 1588 struct btrfs_free_space *new_entry) 1589 { 1590 struct rb_root *root; 1591 struct rb_node **p; 1592 struct rb_node *parent = NULL; 1593 1594 lockdep_assert_held(&ctl->tree_lock); 1595 1596 if (cluster) { 1597 lockdep_assert_held(&cluster->lock); 1598 root = &cluster->root; 1599 } else { 1600 root = &ctl->free_space_offset; 1601 } 1602 1603 p = &root->rb_node; 1604 1605 while (*p) { 1606 struct btrfs_free_space *info; 1607 1608 parent = *p; 1609 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1610 1611 if (new_entry->offset < info->offset) { 1612 p = &(*p)->rb_left; 1613 } else if (new_entry->offset > info->offset) { 1614 p = &(*p)->rb_right; 1615 } else { 1616 /* 1617 * we could have a bitmap entry and an extent entry 1618 * share the same offset. If this is the case, we want 1619 * the extent entry to always be found first if we do a 1620 * linear search through the tree, since we want to have 1621 * the quickest allocation time, and allocating from an 1622 * extent is faster than allocating from a bitmap. So 1623 * if we're inserting a bitmap and we find an entry at 1624 * this offset, we want to go right, or after this entry 1625 * logically. If we are inserting an extent and we've 1626 * found a bitmap, we want to go left, or before 1627 * logically. 1628 */ 1629 if (new_entry->bitmap) { 1630 if (info->bitmap) { 1631 WARN_ON_ONCE(1); 1632 return -EEXIST; 1633 } 1634 p = &(*p)->rb_right; 1635 } else { 1636 if (!info->bitmap) { 1637 WARN_ON_ONCE(1); 1638 return -EEXIST; 1639 } 1640 p = &(*p)->rb_left; 1641 } 1642 } 1643 } 1644 1645 rb_link_node(&new_entry->offset_index, parent, p); 1646 rb_insert_color(&new_entry->offset_index, root); 1647 1648 return 0; 1649 } 1650 1651 /* 1652 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1653 * searched through the bitmap and figured out the largest ->max_extent_size, 1654 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1655 * allocator the wrong thing, we want to use the actual real max_extent_size 1656 * we've found already if it's larger, or we want to use ->bytes. 1657 * 1658 * This matters because find_free_space() will skip entries who's ->bytes is 1659 * less than the required bytes. So if we didn't search down this bitmap, we 1660 * may pick some previous entry that has a smaller ->max_extent_size than we 1661 * have. For example, assume we have two entries, one that has 1662 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1663 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1664 * call into find_free_space(), and return with max_extent_size == 4K, because 1665 * that first bitmap entry had ->max_extent_size set, but the second one did 1666 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1667 * 8K contiguous range. 1668 * 1669 * Consider the other case, we have 2 8K chunks in that second entry and still 1670 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1671 * allocator comes in it'll fully search our second bitmap, and this time it'll 1672 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1673 * right allocation the next loop through. 
1674 */ 1675 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1676 { 1677 if (entry->bitmap && entry->max_extent_size) 1678 return entry->max_extent_size; 1679 return entry->bytes; 1680 } 1681 1682 /* 1683 * We want the largest entry to be leftmost, so this is inverted from what you'd 1684 * normally expect. 1685 */ 1686 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1687 { 1688 const struct btrfs_free_space *entry, *exist; 1689 1690 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1691 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1692 return get_max_extent_size(exist) < get_max_extent_size(entry); 1693 } 1694 1695 /* 1696 * searches the tree for the given offset. 1697 * 1698 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1699 * want a section that has at least bytes size and comes at or after the given 1700 * offset. 1701 */ 1702 static struct btrfs_free_space * 1703 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1704 u64 offset, int bitmap_only, int fuzzy) 1705 { 1706 struct rb_node *n = ctl->free_space_offset.rb_node; 1707 struct btrfs_free_space *entry = NULL, *prev = NULL; 1708 1709 lockdep_assert_held(&ctl->tree_lock); 1710 1711 /* find entry that is closest to the 'offset' */ 1712 while (n) { 1713 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1714 prev = entry; 1715 1716 if (offset < entry->offset) 1717 n = n->rb_left; 1718 else if (offset > entry->offset) 1719 n = n->rb_right; 1720 else 1721 break; 1722 1723 entry = NULL; 1724 } 1725 1726 if (bitmap_only) { 1727 if (!entry) 1728 return NULL; 1729 if (entry->bitmap) 1730 return entry; 1731 1732 /* 1733 * bitmap entry and extent entry may share same offset, 1734 * in that case, bitmap entry comes after extent entry. 
1735 */ 1736 n = rb_next(n); 1737 if (!n) 1738 return NULL; 1739 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1740 if (entry->offset != offset) 1741 return NULL; 1742 1743 WARN_ON(!entry->bitmap); 1744 return entry; 1745 } else if (entry) { 1746 if (entry->bitmap) { 1747 /* 1748 * if previous extent entry covers the offset, 1749 * we should return it instead of the bitmap entry 1750 */ 1751 n = rb_prev(&entry->offset_index); 1752 if (n) { 1753 prev = rb_entry(n, struct btrfs_free_space, 1754 offset_index); 1755 if (!prev->bitmap && 1756 prev->offset + prev->bytes > offset) 1757 entry = prev; 1758 } 1759 } 1760 return entry; 1761 } 1762 1763 if (!prev) 1764 return NULL; 1765 1766 /* find last entry before the 'offset' */ 1767 entry = prev; 1768 if (entry->offset > offset) { 1769 n = rb_prev(&entry->offset_index); 1770 if (n) { 1771 entry = rb_entry(n, struct btrfs_free_space, 1772 offset_index); 1773 ASSERT(entry->offset <= offset); 1774 } else { 1775 if (fuzzy) 1776 return entry; 1777 else 1778 return NULL; 1779 } 1780 } 1781 1782 if (entry->bitmap) { 1783 n = rb_prev(&entry->offset_index); 1784 if (n) { 1785 prev = rb_entry(n, struct btrfs_free_space, 1786 offset_index); 1787 if (!prev->bitmap && 1788 prev->offset + prev->bytes > offset) 1789 return prev; 1790 } 1791 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1792 return entry; 1793 } else if (entry->offset + entry->bytes > offset) 1794 return entry; 1795 1796 if (!fuzzy) 1797 return NULL; 1798 1799 while (1) { 1800 n = rb_next(&entry->offset_index); 1801 if (!n) 1802 return NULL; 1803 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1804 if (entry->bitmap) { 1805 if (entry->offset + BITS_PER_BITMAP * 1806 ctl->unit > offset) 1807 break; 1808 } else { 1809 if (entry->offset + entry->bytes > offset) 1810 break; 1811 } 1812 } 1813 return entry; 1814 } 1815 1816 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1817 struct btrfs_free_space *info, 1818 bool update_stat) 1819 { 1820 lockdep_assert_held(&ctl->tree_lock); 1821 1822 rb_erase(&info->offset_index, &ctl->free_space_offset); 1823 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1824 ctl->free_extents--; 1825 1826 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1827 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1828 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1829 } 1830 1831 if (update_stat) 1832 ctl->free_space -= info->bytes; 1833 } 1834 1835 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1836 struct btrfs_free_space *info) 1837 { 1838 int ret = 0; 1839 1840 lockdep_assert_held(&ctl->tree_lock); 1841 1842 ASSERT(info->bytes || info->bitmap); 1843 ret = tree_insert_offset(ctl, NULL, info); 1844 if (ret) 1845 return ret; 1846 1847 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1848 1849 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1850 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1851 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1852 } 1853 1854 ctl->free_space += info->bytes; 1855 ctl->free_extents++; 1856 return ret; 1857 } 1858 1859 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1860 struct btrfs_free_space *info) 1861 { 1862 ASSERT(info->bitmap); 1863 1864 /* 1865 * If our entry is empty it's because we're on a cluster and we don't 1866 * want to re-link it into our ctl bytes index. 
1867 */ 1868 if (RB_EMPTY_NODE(&info->bytes_index)) 1869 return; 1870 1871 lockdep_assert_held(&ctl->tree_lock); 1872 1873 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1874 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1875 } 1876 1877 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1878 struct btrfs_free_space *info, 1879 u64 offset, u64 bytes, bool update_stat) 1880 { 1881 unsigned long start, count, end; 1882 int extent_delta = -1; 1883 1884 start = offset_to_bit(info->offset, ctl->unit, offset); 1885 count = bytes_to_bits(bytes, ctl->unit); 1886 end = start + count; 1887 ASSERT(end <= BITS_PER_BITMAP); 1888 1889 bitmap_clear(info->bitmap, start, count); 1890 1891 info->bytes -= bytes; 1892 if (info->max_extent_size > ctl->unit) 1893 info->max_extent_size = 0; 1894 1895 relink_bitmap_entry(ctl, info); 1896 1897 if (start && test_bit(start - 1, info->bitmap)) 1898 extent_delta++; 1899 1900 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1901 extent_delta++; 1902 1903 info->bitmap_extents += extent_delta; 1904 if (!btrfs_free_space_trimmed(info)) { 1905 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1906 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1907 } 1908 1909 if (update_stat) 1910 ctl->free_space -= bytes; 1911 } 1912 1913 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1914 struct btrfs_free_space *info, u64 offset, 1915 u64 bytes) 1916 { 1917 unsigned long start, count, end; 1918 int extent_delta = 1; 1919 1920 start = offset_to_bit(info->offset, ctl->unit, offset); 1921 count = bytes_to_bits(bytes, ctl->unit); 1922 end = start + count; 1923 ASSERT(end <= BITS_PER_BITMAP); 1924 1925 bitmap_set(info->bitmap, start, count); 1926 1927 /* 1928 * We set some bytes, we have no idea what the max extent size is 1929 * anymore. 1930 */ 1931 info->max_extent_size = 0; 1932 info->bytes += bytes; 1933 ctl->free_space += bytes; 1934 1935 relink_bitmap_entry(ctl, info); 1936 1937 if (start && test_bit(start - 1, info->bitmap)) 1938 extent_delta--; 1939 1940 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1941 extent_delta--; 1942 1943 info->bitmap_extents += extent_delta; 1944 if (!btrfs_free_space_trimmed(info)) { 1945 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1946 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1947 } 1948 } 1949 1950 /* 1951 * If we can not find suitable extent, we will use bytes to record 1952 * the size of the max extent. 1953 */ 1954 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1955 struct btrfs_free_space *bitmap_info, u64 *offset, 1956 u64 *bytes, bool for_alloc) 1957 { 1958 unsigned long found_bits = 0; 1959 unsigned long max_bits = 0; 1960 unsigned long bits, i; 1961 unsigned long next_zero; 1962 unsigned long extent_bits; 1963 1964 /* 1965 * Skip searching the bitmap if we don't have a contiguous section that 1966 * is large enough for this allocation. 
1967 */ 1968 if (for_alloc && 1969 bitmap_info->max_extent_size && 1970 bitmap_info->max_extent_size < *bytes) { 1971 *bytes = bitmap_info->max_extent_size; 1972 return -1; 1973 } 1974 1975 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1976 max_t(u64, *offset, bitmap_info->offset)); 1977 bits = bytes_to_bits(*bytes, ctl->unit); 1978 1979 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1980 if (for_alloc && bits == 1) { 1981 found_bits = 1; 1982 break; 1983 } 1984 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1985 BITS_PER_BITMAP, i); 1986 extent_bits = next_zero - i; 1987 if (extent_bits >= bits) { 1988 found_bits = extent_bits; 1989 break; 1990 } else if (extent_bits > max_bits) { 1991 max_bits = extent_bits; 1992 } 1993 i = next_zero; 1994 } 1995 1996 if (found_bits) { 1997 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1998 *bytes = (u64)(found_bits) * ctl->unit; 1999 return 0; 2000 } 2001 2002 *bytes = (u64)(max_bits) * ctl->unit; 2003 bitmap_info->max_extent_size = *bytes; 2004 relink_bitmap_entry(ctl, bitmap_info); 2005 return -1; 2006 } 2007 2008 /* Cache the size of the max extent in bytes */ 2009 static struct btrfs_free_space * 2010 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2011 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2012 { 2013 struct btrfs_free_space *entry; 2014 struct rb_node *node; 2015 u64 tmp; 2016 u64 align_off; 2017 int ret; 2018 2019 if (!ctl->free_space_offset.rb_node) 2020 goto out; 2021 again: 2022 if (use_bytes_index) { 2023 node = rb_first_cached(&ctl->free_space_bytes); 2024 } else { 2025 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2026 0, 1); 2027 if (!entry) 2028 goto out; 2029 node = &entry->offset_index; 2030 } 2031 2032 for (; node; node = rb_next(node)) { 2033 if (use_bytes_index) 2034 entry = rb_entry(node, struct btrfs_free_space, 2035 bytes_index); 2036 else 2037 entry = rb_entry(node, struct btrfs_free_space, 2038 offset_index); 2039 2040 /* 2041 * If we are using the bytes index then all subsequent entries 2042 * in this tree are going to be < bytes, so simply set the max 2043 * extent size and exit the loop. 2044 * 2045 * If we're using the offset index then we need to keep going 2046 * through the rest of the tree. 2047 */ 2048 if (entry->bytes < *bytes) { 2049 *max_extent_size = max(get_max_extent_size(entry), 2050 *max_extent_size); 2051 if (use_bytes_index) 2052 break; 2053 continue; 2054 } 2055 2056 /* make sure the space returned is big enough 2057 * to match our requested alignment 2058 */ 2059 if (*bytes >= align) { 2060 tmp = entry->offset - ctl->start + align - 1; 2061 tmp = div64_u64(tmp, align); 2062 tmp = tmp * align + ctl->start; 2063 align_off = tmp - entry->offset; 2064 } else { 2065 align_off = 0; 2066 tmp = entry->offset; 2067 } 2068 2069 /* 2070 * We don't break here if we're using the bytes index because we 2071 * may have another entry that has the correct alignment that is 2072 * the right size, so we don't want to miss that possibility. 2073 * At worst this adds another loop through the logic, but if we 2074 * broke here we could prematurely ENOSPC. 
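 * Hypothetical example: with a 64K alignment, a 192K entry that starts 4K
 * past an alignment boundary picks up a 60K align_off and cannot serve a
 * 160K request, while a later (smaller) 160K entry that happens to start on
 * a boundary can.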
2075 */ 2076 if (entry->bytes < *bytes + align_off) { 2077 *max_extent_size = max(get_max_extent_size(entry), 2078 *max_extent_size); 2079 continue; 2080 } 2081 2082 if (entry->bitmap) { 2083 struct rb_node *old_next = rb_next(node); 2084 u64 size = *bytes; 2085 2086 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2087 if (!ret) { 2088 *offset = tmp; 2089 *bytes = size; 2090 return entry; 2091 } else { 2092 *max_extent_size = 2093 max(get_max_extent_size(entry), 2094 *max_extent_size); 2095 } 2096 2097 /* 2098 * The bitmap may have gotten re-arranged in the space 2099 * index here because the max_extent_size may have been 2100 * updated. Start from the beginning again if this 2101 * happened. 2102 */ 2103 if (use_bytes_index && old_next != rb_next(node)) 2104 goto again; 2105 continue; 2106 } 2107 2108 *offset = tmp; 2109 *bytes = entry->bytes - align_off; 2110 return entry; 2111 } 2112 out: 2113 return NULL; 2114 } 2115 2116 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2117 struct btrfs_free_space *info, u64 offset) 2118 { 2119 info->offset = offset_to_bitmap(ctl, offset); 2120 info->bytes = 0; 2121 info->bitmap_extents = 0; 2122 INIT_LIST_HEAD(&info->list); 2123 link_free_space(ctl, info); 2124 ctl->total_bitmaps++; 2125 recalculate_thresholds(ctl); 2126 } 2127 2128 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2129 struct btrfs_free_space *bitmap_info) 2130 { 2131 /* 2132 * Normally when this is called, the bitmap is completely empty. However, 2133 * if we are blowing up the free space cache for one reason or another 2134 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2135 * we may leave stats on the table. 2136 */ 2137 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2138 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2139 bitmap_info->bitmap_extents; 2140 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2141 2142 } 2143 unlink_free_space(ctl, bitmap_info, true); 2144 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2145 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2146 ctl->total_bitmaps--; 2147 recalculate_thresholds(ctl); 2148 } 2149 2150 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2151 struct btrfs_free_space *bitmap_info, 2152 u64 *offset, u64 *bytes) 2153 { 2154 u64 end; 2155 u64 search_start, search_bytes; 2156 int ret; 2157 2158 again: 2159 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2160 2161 /* 2162 * We need to search for bits in this bitmap. We could only cover some 2163 * of the extent in this bitmap thanks to how we add space, so we need 2164 * to search for as much of it as we can and clear that amount, and then 2165 go searching for the next bit.
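 * (For example, a free range added near the end of one bitmap spills over
 * into the next one, because add_bytes_to_bitmap() caps what it sets at the
 * bitmap boundary; removing that range therefore has to walk more than one
 * bitmap entry.)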
2166 */ 2167 search_start = *offset; 2168 search_bytes = ctl->unit; 2169 search_bytes = min(search_bytes, end - search_start + 1); 2170 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2171 false); 2172 if (ret < 0 || search_start != *offset) 2173 return -EINVAL; 2174 2175 /* We may have found more bits than what we need */ 2176 search_bytes = min(search_bytes, *bytes); 2177 2178 /* Cannot clear past the end of the bitmap */ 2179 search_bytes = min(search_bytes, end - search_start + 1); 2180 2181 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2182 *offset += search_bytes; 2183 *bytes -= search_bytes; 2184 2185 if (*bytes) { 2186 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2187 if (!bitmap_info->bytes) 2188 free_bitmap(ctl, bitmap_info); 2189 2190 /* 2191 * no entry after this bitmap, but we still have bytes to 2192 * remove, so something has gone wrong. 2193 */ 2194 if (!next) 2195 return -EINVAL; 2196 2197 bitmap_info = rb_entry(next, struct btrfs_free_space, 2198 offset_index); 2199 2200 /* 2201 * if the next entry isn't a bitmap we need to return to let the 2202 * extent stuff do its work. 2203 */ 2204 if (!bitmap_info->bitmap) 2205 return -EAGAIN; 2206 2207 /* 2208 * Ok the next item is a bitmap, but it may not actually hold 2209 * the information for the rest of this free space stuff, so 2210 * look for it, and if we don't find it return so we can try 2211 * everything over again. 2212 */ 2213 search_start = *offset; 2214 search_bytes = ctl->unit; 2215 ret = search_bitmap(ctl, bitmap_info, &search_start, 2216 &search_bytes, false); 2217 if (ret < 0 || search_start != *offset) 2218 return -EAGAIN; 2219 2220 goto again; 2221 } else if (!bitmap_info->bytes) 2222 free_bitmap(ctl, bitmap_info); 2223 2224 return 0; 2225 } 2226 2227 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2228 struct btrfs_free_space *info, u64 offset, 2229 u64 bytes, enum btrfs_trim_state trim_state) 2230 { 2231 u64 bytes_to_set = 0; 2232 u64 end; 2233 2234 /* 2235 * This is a tradeoff to make bitmap trim state minimal. We mark the 2236 * whole bitmap untrimmed if at any point we add untrimmed regions. 2237 */ 2238 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2239 if (btrfs_free_space_trimmed(info)) { 2240 ctl->discardable_extents[BTRFS_STAT_CURR] += 2241 info->bitmap_extents; 2242 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2243 } 2244 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2245 } 2246 2247 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2248 2249 bytes_to_set = min(end - offset, bytes); 2250 2251 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2252 2253 return bytes_to_set; 2254 2255 } 2256 2257 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2258 struct btrfs_free_space *info) 2259 { 2260 struct btrfs_block_group *block_group = ctl->block_group; 2261 struct btrfs_fs_info *fs_info = block_group->fs_info; 2262 bool forced = false; 2263 2264 #ifdef CONFIG_BTRFS_DEBUG 2265 if (btrfs_should_fragment_free_space(block_group)) 2266 forced = true; 2267 #endif 2268 2269 /* This is a way to reclaim large regions from the bitmaps. 
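 * Returning false here keeps any free region of FORCE_EXTENT_THRESHOLD or
 * more as a regular extent entry instead of folding it into a bitmap.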
*/ 2270 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2271 return false; 2272 2273 /* 2274 * If we are below the extents threshold then we can add this as an 2275 * extent, and don't have to deal with the bitmap 2276 */ 2277 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2278 /* 2279 * If this block group has some small extents we don't want to 2280 * use up all of our free slots in the cache with them, we want 2281 * to reserve them for larger extents, however if we have plenty 2282 * of cache left then go ahead and add them, no sense in adding 2283 * the overhead of a bitmap if we don't have to. 2284 */ 2285 if (info->bytes <= fs_info->sectorsize * 8) { 2286 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2287 return false; 2288 } else { 2289 return false; 2290 } 2291 } 2292 2293 /* 2294 * The original block groups from mkfs can be really small, like 8 2295 * megabytes, so don't bother with a bitmap for those entries. However 2296 * some block groups can be smaller than what a bitmap would cover but 2297 * are still large enough that they could overflow the 32k memory limit, 2298 * so still allow those block groups to have a bitmap 2299 * entry. 2300 */ 2301 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2302 return false; 2303 2304 return true; 2305 } 2306 2307 static const struct btrfs_free_space_op free_space_op = { 2308 .use_bitmap = use_bitmap, 2309 }; 2310 2311 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2312 struct btrfs_free_space *info) 2313 { 2314 struct btrfs_free_space *bitmap_info; 2315 struct btrfs_block_group *block_group = NULL; 2316 int added = 0; 2317 u64 bytes, offset, bytes_added; 2318 enum btrfs_trim_state trim_state; 2319 int ret; 2320 2321 bytes = info->bytes; 2322 offset = info->offset; 2323 trim_state = info->trim_state; 2324 2325 if (!ctl->op->use_bitmap(ctl, info)) 2326 return 0; 2327 2328 if (ctl->op == &free_space_op) 2329 block_group = ctl->block_group; 2330 again: 2331 /* 2332 * Since we link bitmaps right into the cluster we need to see if we 2333 * have a cluster here, and if so and it has our bitmap we need to add 2334 the free space to that bitmap.
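 * (A bitmap owned by a cluster lives in the cluster's private rbtree rather
 * than in ctl->free_space_offset, so the tree_search_offset() call further
 * down would never find it; that is why the cluster is checked first.)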
2335 */ 2336 if (block_group && !list_empty(&block_group->cluster_list)) { 2337 struct btrfs_free_cluster *cluster; 2338 struct rb_node *node; 2339 struct btrfs_free_space *entry; 2340 2341 cluster = list_entry(block_group->cluster_list.next, 2342 struct btrfs_free_cluster, 2343 block_group_list); 2344 spin_lock(&cluster->lock); 2345 node = rb_first(&cluster->root); 2346 if (!node) { 2347 spin_unlock(&cluster->lock); 2348 goto no_cluster_bitmap; 2349 } 2350 2351 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2352 if (!entry->bitmap) { 2353 spin_unlock(&cluster->lock); 2354 goto no_cluster_bitmap; 2355 } 2356 2357 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2358 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2359 bytes, trim_state); 2360 bytes -= bytes_added; 2361 offset += bytes_added; 2362 } 2363 spin_unlock(&cluster->lock); 2364 if (!bytes) { 2365 ret = 1; 2366 goto out; 2367 } 2368 } 2369 2370 no_cluster_bitmap: 2371 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2372 1, 0); 2373 if (!bitmap_info) { 2374 ASSERT(added == 0); 2375 goto new_bitmap; 2376 } 2377 2378 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2379 trim_state); 2380 bytes -= bytes_added; 2381 offset += bytes_added; 2382 added = 0; 2383 2384 if (!bytes) { 2385 ret = 1; 2386 goto out; 2387 } else 2388 goto again; 2389 2390 new_bitmap: 2391 if (info && info->bitmap) { 2392 add_new_bitmap(ctl, info, offset); 2393 added = 1; 2394 info = NULL; 2395 goto again; 2396 } else { 2397 spin_unlock(&ctl->tree_lock); 2398 2399 /* no pre-allocated info, allocate a new one */ 2400 if (!info) { 2401 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2402 GFP_NOFS); 2403 if (!info) { 2404 spin_lock(&ctl->tree_lock); 2405 ret = -ENOMEM; 2406 goto out; 2407 } 2408 } 2409 2410 /* allocate the bitmap */ 2411 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2412 GFP_NOFS); 2413 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2414 spin_lock(&ctl->tree_lock); 2415 if (!info->bitmap) { 2416 ret = -ENOMEM; 2417 goto out; 2418 } 2419 goto again; 2420 } 2421 2422 out: 2423 if (info) { 2424 if (info->bitmap) 2425 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2426 info->bitmap); 2427 kmem_cache_free(btrfs_free_space_cachep, info); 2428 } 2429 2430 return ret; 2431 } 2432 2433 /* 2434 * Free space merging rules: 2435 * 1) Merge trimmed areas together 2436 * 2) Let untrimmed areas coalesce with trimmed areas 2437 * 3) Always pull neighboring regions from bitmaps 2438 * 2439 * The above rules are for when we merge free space based on btrfs_trim_state. 2440 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2441 * same reason: to promote larger extent regions which makes life easier for 2442 * find_free_extent(). Rule 2 enables coalescing based on the common path 2443 * being returning free space from btrfs_finish_extent_commit(). So when free 2444 * space is trimmed, it will prevent aggregating trimmed new region and 2445 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2446 * and provide find_free_extent() with the largest extents possible hoping for 2447 * the reuse path. 
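 * As an illustration of rule 2 (sizes are made up): freeing an untrimmed 64K
 * region right next to an already trimmed 64K extent produces one 128K extent
 * that is marked untrimmed, trading a repeat discard of the first half for a
 * larger contiguous allocation target.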
2448 */ 2449 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2450 struct btrfs_free_space *info, bool update_stat) 2451 { 2452 struct btrfs_free_space *left_info = NULL; 2453 struct btrfs_free_space *right_info; 2454 bool merged = false; 2455 u64 offset = info->offset; 2456 u64 bytes = info->bytes; 2457 const bool is_trimmed = btrfs_free_space_trimmed(info); 2458 struct rb_node *right_prev = NULL; 2459 2460 /* 2461 * first we want to see if there is free space adjacent to the range we 2462 * are adding, if there is remove that struct and add a new one to 2463 * cover the entire range 2464 */ 2465 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2466 if (right_info) 2467 right_prev = rb_prev(&right_info->offset_index); 2468 2469 if (right_prev) 2470 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2471 else if (!right_info) 2472 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2473 2474 /* See try_merge_free_space() comment. */ 2475 if (right_info && !right_info->bitmap && 2476 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2477 unlink_free_space(ctl, right_info, update_stat); 2478 info->bytes += right_info->bytes; 2479 kmem_cache_free(btrfs_free_space_cachep, right_info); 2480 merged = true; 2481 } 2482 2483 /* See try_merge_free_space() comment. */ 2484 if (left_info && !left_info->bitmap && 2485 left_info->offset + left_info->bytes == offset && 2486 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2487 unlink_free_space(ctl, left_info, update_stat); 2488 info->offset = left_info->offset; 2489 info->bytes += left_info->bytes; 2490 kmem_cache_free(btrfs_free_space_cachep, left_info); 2491 merged = true; 2492 } 2493 2494 return merged; 2495 } 2496 2497 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2498 struct btrfs_free_space *info, 2499 bool update_stat) 2500 { 2501 struct btrfs_free_space *bitmap; 2502 unsigned long i; 2503 unsigned long j; 2504 const u64 end = info->offset + info->bytes; 2505 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2506 u64 bytes; 2507 2508 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2509 if (!bitmap) 2510 return false; 2511 2512 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2513 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2514 if (j == i) 2515 return false; 2516 bytes = (j - i) * ctl->unit; 2517 info->bytes += bytes; 2518 2519 /* See try_merge_free_space() comment. */ 2520 if (!btrfs_free_space_trimmed(bitmap)) 2521 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2522 2523 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2524 2525 if (!bitmap->bytes) 2526 free_bitmap(ctl, bitmap); 2527 2528 return true; 2529 } 2530 2531 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2532 struct btrfs_free_space *info, 2533 bool update_stat) 2534 { 2535 struct btrfs_free_space *bitmap; 2536 u64 bitmap_offset; 2537 unsigned long i; 2538 unsigned long j; 2539 unsigned long prev_j; 2540 u64 bytes; 2541 2542 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2543 /* If we're on a boundary, try the previous logical bitmap. 
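 * A bitmap that starts exactly at info->offset cannot hold any bits in front
 * of it, so the space to steal, if any, is in the bitmap covering
 * info->offset - 1.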
*/ 2544 if (bitmap_offset == info->offset) { 2545 if (info->offset == 0) 2546 return false; 2547 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2548 } 2549 2550 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2551 if (!bitmap) 2552 return false; 2553 2554 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2555 j = 0; 2556 prev_j = (unsigned long)-1; 2557 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2558 if (j > i) 2559 break; 2560 prev_j = j; 2561 } 2562 if (prev_j == i) 2563 return false; 2564 2565 if (prev_j == (unsigned long)-1) 2566 bytes = (i + 1) * ctl->unit; 2567 else 2568 bytes = (i - prev_j) * ctl->unit; 2569 2570 info->offset -= bytes; 2571 info->bytes += bytes; 2572 2573 /* See try_merge_free_space() comment. */ 2574 if (!btrfs_free_space_trimmed(bitmap)) 2575 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2576 2577 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2578 2579 if (!bitmap->bytes) 2580 free_bitmap(ctl, bitmap); 2581 2582 return true; 2583 } 2584 2585 /* 2586 * We prefer always to allocate from extent entries, both for clustered and 2587 * non-clustered allocation requests. So when attempting to add a new extent 2588 * entry, try to see if there's adjacent free space in bitmap entries, and if 2589 * there is, migrate that space from the bitmaps to the extent. 2590 * Like this we get better chances of satisfying space allocation requests 2591 * because we attempt to satisfy them based on a single cache entry, and never 2592 * on 2 or more entries - even if the entries represent a contiguous free space 2593 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2594 * ends). 2595 */ 2596 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2597 struct btrfs_free_space *info, 2598 bool update_stat) 2599 { 2600 /* 2601 * Only work with disconnected entries, as we can change their offset, 2602 * and must be extent entries. 
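 * (Changing info->offset while the entry is still linked would corrupt the
 * offset-sorted rbtree, hence the RB_EMPTY_NODE() assertion below.)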
2603 */ 2604 ASSERT(!info->bitmap); 2605 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2606 2607 if (ctl->total_bitmaps > 0) { 2608 bool stole_end; 2609 bool stole_front = false; 2610 2611 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2612 if (ctl->total_bitmaps > 0) 2613 stole_front = steal_from_bitmap_to_front(ctl, info, 2614 update_stat); 2615 2616 if (stole_end || stole_front) 2617 try_merge_free_space(ctl, info, update_stat); 2618 } 2619 } 2620 2621 int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2622 u64 offset, u64 bytes, 2623 enum btrfs_trim_state trim_state) 2624 { 2625 struct btrfs_fs_info *fs_info = block_group->fs_info; 2626 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2627 struct btrfs_free_space *info; 2628 int ret = 0; 2629 u64 filter_bytes = bytes; 2630 2631 ASSERT(!btrfs_is_zoned(fs_info)); 2632 2633 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2634 if (!info) 2635 return -ENOMEM; 2636 2637 info->offset = offset; 2638 info->bytes = bytes; 2639 info->trim_state = trim_state; 2640 RB_CLEAR_NODE(&info->offset_index); 2641 RB_CLEAR_NODE(&info->bytes_index); 2642 2643 spin_lock(&ctl->tree_lock); 2644 2645 if (try_merge_free_space(ctl, info, true)) 2646 goto link; 2647 2648 /* 2649 * There was no extent directly to the left or right of this new 2650 * extent then we know we're going to have to allocate a new extent, so 2651 * before we do that see if we need to drop this into a bitmap 2652 */ 2653 ret = insert_into_bitmap(ctl, info); 2654 if (ret < 0) { 2655 goto out; 2656 } else if (ret) { 2657 ret = 0; 2658 goto out; 2659 } 2660 link: 2661 /* 2662 * Only steal free space from adjacent bitmaps if we're sure we're not 2663 * going to add the new free space to existing bitmap entries - because 2664 * that would mean unnecessary work that would be reverted. Therefore 2665 * attempt to steal space from bitmaps if we're adding an extent entry. 
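 * (info is guaranteed to be a plain extent entry at this point: either it
 * was merged with neighbouring extents above, or insert_into_bitmap()
 * declined to absorb it.)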
2666 */ 2667 steal_from_bitmap(ctl, info, true); 2668 2669 filter_bytes = max(filter_bytes, info->bytes); 2670 2671 ret = link_free_space(ctl, info); 2672 if (ret) 2673 kmem_cache_free(btrfs_free_space_cachep, info); 2674 out: 2675 btrfs_discard_update_discardable(block_group); 2676 spin_unlock(&ctl->tree_lock); 2677 2678 if (ret) { 2679 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2680 ASSERT(ret != -EEXIST); 2681 } 2682 2683 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2684 btrfs_discard_check_filter(block_group, filter_bytes); 2685 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2686 } 2687 2688 return ret; 2689 } 2690 2691 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2692 u64 bytenr, u64 size, bool used) 2693 { 2694 struct btrfs_space_info *sinfo = block_group->space_info; 2695 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2696 u64 offset = bytenr - block_group->start; 2697 u64 to_free, to_unusable; 2698 int bg_reclaim_threshold = 0; 2699 bool initial; 2700 u64 reclaimable_unusable; 2701 2702 spin_lock(&block_group->lock); 2703 2704 initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2705 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2706 if (!initial) 2707 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2708 2709 if (!used) 2710 to_free = size; 2711 else if (initial) 2712 to_free = block_group->zone_capacity; 2713 else if (offset >= block_group->alloc_offset) 2714 to_free = size; 2715 else if (offset + size <= block_group->alloc_offset) 2716 to_free = 0; 2717 else 2718 to_free = offset + size - block_group->alloc_offset; 2719 to_unusable = size - to_free; 2720 2721 spin_lock(&ctl->tree_lock); 2722 ctl->free_space += to_free; 2723 spin_unlock(&ctl->tree_lock); 2724 /* 2725 * If the block group is read-only, we should account freed space into 2726 * bytes_readonly. 2727 */ 2728 if (!block_group->ro) { 2729 block_group->zone_unusable += to_unusable; 2730 WARN_ON(block_group->zone_unusable > block_group->length); 2731 } 2732 if (!used) { 2733 block_group->alloc_offset -= size; 2734 } 2735 2736 reclaimable_unusable = block_group->zone_unusable - 2737 (block_group->length - block_group->zone_capacity); 2738 /* All the region is now unusable. 
Mark it as unused and reclaim */ 2739 if (block_group->zone_unusable == block_group->length) { 2740 btrfs_mark_bg_unused(block_group); 2741 } else if (bg_reclaim_threshold && 2742 reclaimable_unusable >= 2743 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2744 btrfs_mark_bg_to_reclaim(block_group); 2745 } 2746 2747 spin_unlock(&block_group->lock); 2748 2749 return 0; 2750 } 2751 2752 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2753 u64 bytenr, u64 size) 2754 { 2755 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2756 2757 if (btrfs_is_zoned(block_group->fs_info)) 2758 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2759 true); 2760 2761 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2762 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2763 2764 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2765 } 2766 2767 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2768 u64 bytenr, u64 size) 2769 { 2770 if (btrfs_is_zoned(block_group->fs_info)) 2771 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2772 false); 2773 2774 return btrfs_add_free_space(block_group, bytenr, size); 2775 } 2776 2777 /* 2778 * This is a subtle distinction because when adding free space back in general, 2779 * we want it to be added as untrimmed for async. But in the case where we add 2780 * it on loading of a block group, we want to consider it trimmed. 2781 */ 2782 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2783 u64 bytenr, u64 size) 2784 { 2785 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2786 2787 if (btrfs_is_zoned(block_group->fs_info)) 2788 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2789 true); 2790 2791 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2792 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2793 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2794 2795 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2796 } 2797 2798 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2799 u64 offset, u64 bytes) 2800 { 2801 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2802 struct btrfs_free_space *info; 2803 int ret; 2804 bool re_search = false; 2805 2806 if (btrfs_is_zoned(block_group->fs_info)) { 2807 /* 2808 * This can happen with conventional zones when replaying log. 2809 * Since the allocation info of tree-log nodes are not recorded 2810 * to the extent-tree, calculate_alloc_pointer() failed to 2811 * advance the allocation pointer after last allocated tree log 2812 * node blocks. 2813 * 2814 * This function is called from 2815 * btrfs_pin_extent_for_log_replay() when replaying the log. 2816 * Advance the pointer not to overwrite the tree-log nodes. 
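 * For example (hypothetical numbers): if alloc_offset computed at mount is
 * 4M but a pinned tree-log node ends at 4M + 16K, alloc_offset is bumped to
 * 4M + 16K so that ordinary allocations resume past the node.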
2817 */ 2818 if (block_group->start + block_group->alloc_offset < 2819 offset + bytes) { 2820 block_group->alloc_offset = 2821 offset + bytes - block_group->start; 2822 } 2823 return 0; 2824 } 2825 2826 spin_lock(&ctl->tree_lock); 2827 2828 again: 2829 ret = 0; 2830 if (!bytes) 2831 goto out_lock; 2832 2833 info = tree_search_offset(ctl, offset, 0, 0); 2834 if (!info) { 2835 /* 2836 * oops didn't find an extent that matched the space we wanted 2837 * to remove, look for a bitmap instead 2838 */ 2839 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2840 1, 0); 2841 if (!info) { 2842 /* 2843 * If we found a partial bit of our free space in a 2844 * bitmap but then couldn't find the other part this may 2845 * be a problem, so WARN about it. 2846 */ 2847 WARN_ON(re_search); 2848 goto out_lock; 2849 } 2850 } 2851 2852 re_search = false; 2853 if (!info->bitmap) { 2854 unlink_free_space(ctl, info, true); 2855 if (offset == info->offset) { 2856 u64 to_free = min(bytes, info->bytes); 2857 2858 info->bytes -= to_free; 2859 info->offset += to_free; 2860 if (info->bytes) { 2861 ret = link_free_space(ctl, info); 2862 WARN_ON(ret); 2863 } else { 2864 kmem_cache_free(btrfs_free_space_cachep, info); 2865 } 2866 2867 offset += to_free; 2868 bytes -= to_free; 2869 goto again; 2870 } else { 2871 u64 old_end = info->bytes + info->offset; 2872 2873 info->bytes = offset - info->offset; 2874 ret = link_free_space(ctl, info); 2875 WARN_ON(ret); 2876 if (ret) 2877 goto out_lock; 2878 2879 /* Not enough bytes in this entry to satisfy us */ 2880 if (old_end < offset + bytes) { 2881 bytes -= old_end - offset; 2882 offset = old_end; 2883 goto again; 2884 } else if (old_end == offset + bytes) { 2885 /* all done */ 2886 goto out_lock; 2887 } 2888 spin_unlock(&ctl->tree_lock); 2889 2890 ret = __btrfs_add_free_space(block_group, 2891 offset + bytes, 2892 old_end - (offset + bytes), 2893 info->trim_state); 2894 WARN_ON(ret); 2895 goto out; 2896 } 2897 } 2898 2899 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2900 if (ret == -EAGAIN) { 2901 re_search = true; 2902 goto again; 2903 } 2904 out_lock: 2905 btrfs_discard_update_discardable(block_group); 2906 spin_unlock(&ctl->tree_lock); 2907 out: 2908 return ret; 2909 } 2910 2911 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2912 u64 bytes) 2913 { 2914 struct btrfs_fs_info *fs_info = block_group->fs_info; 2915 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2916 struct btrfs_free_space *info; 2917 struct rb_node *n; 2918 int count = 0; 2919 2920 /* 2921 * Zoned btrfs does not use free space tree and cluster. Just print 2922 * out the free space after the allocation offset. 2923 */ 2924 if (btrfs_is_zoned(fs_info)) { 2925 btrfs_info(fs_info, "free space %llu active %d", 2926 block_group->zone_capacity - block_group->alloc_offset, 2927 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2928 &block_group->runtime_flags)); 2929 return; 2930 } 2931 2932 spin_lock(&ctl->tree_lock); 2933 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2934 info = rb_entry(n, struct btrfs_free_space, offset_index); 2935 if (info->bytes >= bytes && !block_group->ro) 2936 count++; 2937 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2938 info->offset, info->bytes, 2939 (info->bitmap) ? "yes" : "no"); 2940 } 2941 spin_unlock(&ctl->tree_lock); 2942 btrfs_info(fs_info, "block group has cluster?: %s", 2943 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2944 btrfs_info(fs_info, 2945 "%d free space entries at or bigger than %llu bytes", 2946 count, bytes); 2947 } 2948 2949 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2950 struct btrfs_free_space_ctl *ctl) 2951 { 2952 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 2954 spin_lock_init(&ctl->tree_lock); 2955 ctl->unit = fs_info->sectorsize; 2956 ctl->start = block_group->start; 2957 ctl->block_group = block_group; 2958 ctl->op = &free_space_op; 2959 ctl->free_space_bytes = RB_ROOT_CACHED; 2960 INIT_LIST_HEAD(&ctl->trimming_ranges); 2961 mutex_init(&ctl->cache_writeout_mutex); 2962 2963 /* 2964 * we only want to have 32k of ram per block group for keeping 2965 * track of free space, and if we pass 1/2 of that we want to 2966 * start converting things over to using bitmaps 2967 */ 2968 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2969 } 2970 2971 /* 2972 * for a given cluster, put all of its extents back into the free 2973 * space cache. If the block group passed doesn't match the block group 2974 * pointed to by the cluster, someone else raced in and freed the 2975 * cluster already. In that case, we just return without changing anything 2976 */ 2977 static void __btrfs_return_cluster_to_free_space( 2978 struct btrfs_block_group *block_group, 2979 struct btrfs_free_cluster *cluster) 2980 { 2981 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2982 struct rb_node *node; 2983 2984 lockdep_assert_held(&ctl->tree_lock); 2985 2986 spin_lock(&cluster->lock); 2987 if (cluster->block_group != block_group) { 2988 spin_unlock(&cluster->lock); 2989 return; 2990 } 2991 2992 cluster->block_group = NULL; 2993 cluster->window_start = 0; 2994 list_del_init(&cluster->block_group_list); 2995 2996 node = rb_first(&cluster->root); 2997 while (node) { 2998 struct btrfs_free_space *entry; 2999 3000 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3001 node = rb_next(&entry->offset_index); 3002 rb_erase(&entry->offset_index, &cluster->root); 3003 RB_CLEAR_NODE(&entry->offset_index); 3004 3005 if (!entry->bitmap) { 3006 /* Merging treats extents as if they were new */ 3007 if (!btrfs_free_space_trimmed(entry)) { 3008 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3009 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3010 entry->bytes; 3011 } 3012 3013 try_merge_free_space(ctl, entry, false); 3014 steal_from_bitmap(ctl, entry, false); 3015 3016 /* As we insert directly, update these statistics */ 3017 if (!btrfs_free_space_trimmed(entry)) { 3018 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3019 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3020 entry->bytes; 3021 } 3022 } 3023 tree_insert_offset(ctl, NULL, entry); 3024 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3025 entry_less); 3026 } 3027 cluster->root = RB_ROOT; 3028 spin_unlock(&cluster->lock); 3029 btrfs_put_block_group(block_group); 3030 } 3031 3032 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3033 { 3034 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3035 struct btrfs_free_cluster *cluster; 3036 struct list_head *head; 3037 3038 spin_lock(&ctl->tree_lock); 3039 while ((head = block_group->cluster_list.next) != 3040 &block_group->cluster_list) { 3041 cluster = list_entry(head, struct btrfs_free_cluster, 3042 block_group_list); 3043 3044 WARN_ON(cluster->block_group != block_group); 3045 __btrfs_return_cluster_to_free_space(block_group, cluster); 3046 3047 cond_resched_lock(&ctl->tree_lock); 3048 } 3049 
__btrfs_remove_free_space_cache(ctl); 3050 btrfs_discard_update_discardable(block_group); 3051 spin_unlock(&ctl->tree_lock); 3052 3053 } 3054 3055 /* 3056 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3057 */ 3058 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3059 { 3060 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3061 struct btrfs_free_space *info; 3062 struct rb_node *node; 3063 bool ret = true; 3064 3065 spin_lock(&ctl->tree_lock); 3066 node = rb_first(&ctl->free_space_offset); 3067 3068 while (node) { 3069 info = rb_entry(node, struct btrfs_free_space, offset_index); 3070 3071 if (!btrfs_free_space_trimmed(info)) { 3072 ret = false; 3073 break; 3074 } 3075 3076 node = rb_next(node); 3077 } 3078 3079 spin_unlock(&ctl->tree_lock); 3080 return ret; 3081 } 3082 3083 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3084 u64 offset, u64 bytes, u64 empty_size, 3085 u64 *max_extent_size) 3086 { 3087 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3088 struct btrfs_discard_ctl *discard_ctl = 3089 &block_group->fs_info->discard_ctl; 3090 struct btrfs_free_space *entry = NULL; 3091 u64 bytes_search = bytes + empty_size; 3092 u64 ret = 0; 3093 u64 align_gap = 0; 3094 u64 align_gap_len = 0; 3095 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3096 bool use_bytes_index = (offset == block_group->start); 3097 3098 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3099 3100 spin_lock(&ctl->tree_lock); 3101 entry = find_free_space(ctl, &offset, &bytes_search, 3102 block_group->full_stripe_len, max_extent_size, 3103 use_bytes_index); 3104 if (!entry) 3105 goto out; 3106 3107 ret = offset; 3108 if (entry->bitmap) { 3109 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3110 3111 if (!btrfs_free_space_trimmed(entry)) 3112 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3113 3114 if (!entry->bytes) 3115 free_bitmap(ctl, entry); 3116 } else { 3117 unlink_free_space(ctl, entry, true); 3118 align_gap_len = offset - entry->offset; 3119 align_gap = entry->offset; 3120 align_gap_trim_state = entry->trim_state; 3121 3122 if (!btrfs_free_space_trimmed(entry)) 3123 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3124 3125 entry->offset = offset + bytes; 3126 WARN_ON(entry->bytes < bytes + align_gap_len); 3127 3128 entry->bytes -= bytes + align_gap_len; 3129 if (!entry->bytes) 3130 kmem_cache_free(btrfs_free_space_cachep, entry); 3131 else 3132 link_free_space(ctl, entry); 3133 } 3134 out: 3135 btrfs_discard_update_discardable(block_group); 3136 spin_unlock(&ctl->tree_lock); 3137 3138 if (align_gap_len) 3139 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3140 align_gap_trim_state); 3141 return ret; 3142 } 3143 3144 /* 3145 * given a cluster, put all of its extents back into the free space 3146 * cache. If a block group is passed, this function will only free 3147 * a cluster that belongs to the passed block group. 3148 * 3149 * Otherwise, it'll get a reference on the block group pointed to by the 3150 * cluster and remove the cluster from it. 
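 *
 * Illustrative use (not tied to a particular caller): the block group
 * argument may be NULL, e.g.
 *
 *   btrfs_return_cluster_to_free_space(NULL, cluster);
 *
 * in which case the function takes the group from cluster->block_group
 * under cluster->lock.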
3151 */ 3152 void btrfs_return_cluster_to_free_space( 3153 struct btrfs_block_group *block_group, 3154 struct btrfs_free_cluster *cluster) 3155 { 3156 struct btrfs_free_space_ctl *ctl; 3157 3158 /* first, get a safe pointer to the block group */ 3159 spin_lock(&cluster->lock); 3160 if (!block_group) { 3161 block_group = cluster->block_group; 3162 if (!block_group) { 3163 spin_unlock(&cluster->lock); 3164 return; 3165 } 3166 } else if (cluster->block_group != block_group) { 3167 /* someone else has already freed it don't redo their work */ 3168 spin_unlock(&cluster->lock); 3169 return; 3170 } 3171 btrfs_get_block_group(block_group); 3172 spin_unlock(&cluster->lock); 3173 3174 ctl = block_group->free_space_ctl; 3175 3176 /* now return any extents the cluster had on it */ 3177 spin_lock(&ctl->tree_lock); 3178 __btrfs_return_cluster_to_free_space(block_group, cluster); 3179 spin_unlock(&ctl->tree_lock); 3180 3181 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3182 3183 /* finally drop our ref */ 3184 btrfs_put_block_group(block_group); 3185 } 3186 3187 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3188 struct btrfs_free_cluster *cluster, 3189 struct btrfs_free_space *entry, 3190 u64 bytes, u64 min_start, 3191 u64 *max_extent_size) 3192 { 3193 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3194 int err; 3195 u64 search_start = cluster->window_start; 3196 u64 search_bytes = bytes; 3197 u64 ret = 0; 3198 3199 search_start = min_start; 3200 search_bytes = bytes; 3201 3202 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3203 if (err) { 3204 *max_extent_size = max(get_max_extent_size(entry), 3205 *max_extent_size); 3206 return 0; 3207 } 3208 3209 ret = search_start; 3210 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3211 3212 return ret; 3213 } 3214 3215 /* 3216 * given a cluster, try to allocate 'bytes' from it, returns 0 3217 * if it couldn't find anything suitably large, or a logical disk offset 3218 * if things worked out 3219 */ 3220 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3221 struct btrfs_free_cluster *cluster, u64 bytes, 3222 u64 min_start, u64 *max_extent_size) 3223 { 3224 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3225 struct btrfs_discard_ctl *discard_ctl = 3226 &block_group->fs_info->discard_ctl; 3227 struct btrfs_free_space *entry = NULL; 3228 struct rb_node *node; 3229 u64 ret = 0; 3230 3231 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3232 3233 spin_lock(&cluster->lock); 3234 if (bytes > cluster->max_size) 3235 goto out; 3236 3237 if (cluster->block_group != block_group) 3238 goto out; 3239 3240 node = rb_first(&cluster->root); 3241 if (!node) 3242 goto out; 3243 3244 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3245 while (1) { 3246 if (entry->bytes < bytes) 3247 *max_extent_size = max(get_max_extent_size(entry), 3248 *max_extent_size); 3249 3250 if (entry->bytes < bytes || 3251 (!entry->bitmap && entry->offset < min_start)) { 3252 node = rb_next(&entry->offset_index); 3253 if (!node) 3254 break; 3255 entry = rb_entry(node, struct btrfs_free_space, 3256 offset_index); 3257 continue; 3258 } 3259 3260 if (entry->bitmap) { 3261 ret = btrfs_alloc_from_bitmap(block_group, 3262 cluster, entry, bytes, 3263 cluster->window_start, 3264 max_extent_size); 3265 if (ret == 0) { 3266 node = rb_next(&entry->offset_index); 3267 if (!node) 3268 break; 3269 entry = rb_entry(node, struct btrfs_free_space, 3270 offset_index); 3271 
continue; 3272 } 3273 cluster->window_start += bytes; 3274 } else { 3275 ret = entry->offset; 3276 3277 entry->offset += bytes; 3278 entry->bytes -= bytes; 3279 } 3280 3281 break; 3282 } 3283 out: 3284 spin_unlock(&cluster->lock); 3285 3286 if (!ret) 3287 return 0; 3288 3289 spin_lock(&ctl->tree_lock); 3290 3291 if (!btrfs_free_space_trimmed(entry)) 3292 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3293 3294 ctl->free_space -= bytes; 3295 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3296 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3297 3298 spin_lock(&cluster->lock); 3299 if (entry->bytes == 0) { 3300 rb_erase(&entry->offset_index, &cluster->root); 3301 ctl->free_extents--; 3302 if (entry->bitmap) { 3303 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3304 entry->bitmap); 3305 ctl->total_bitmaps--; 3306 recalculate_thresholds(ctl); 3307 } else if (!btrfs_free_space_trimmed(entry)) { 3308 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3309 } 3310 kmem_cache_free(btrfs_free_space_cachep, entry); 3311 } 3312 3313 spin_unlock(&cluster->lock); 3314 spin_unlock(&ctl->tree_lock); 3315 3316 return ret; 3317 } 3318 3319 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3320 struct btrfs_free_space *entry, 3321 struct btrfs_free_cluster *cluster, 3322 u64 offset, u64 bytes, 3323 u64 cont1_bytes, u64 min_bytes) 3324 { 3325 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3326 unsigned long next_zero; 3327 unsigned long i; 3328 unsigned long want_bits; 3329 unsigned long min_bits; 3330 unsigned long found_bits; 3331 unsigned long max_bits = 0; 3332 unsigned long start = 0; 3333 unsigned long total_found = 0; 3334 int ret; 3335 3336 lockdep_assert_held(&ctl->tree_lock); 3337 3338 i = offset_to_bit(entry->offset, ctl->unit, 3339 max_t(u64, offset, entry->offset)); 3340 want_bits = bytes_to_bits(bytes, ctl->unit); 3341 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3342 3343 /* 3344 * Don't bother looking for a cluster in this bitmap if it's heavily 3345 * fragmented. 3346 */ 3347 if (entry->max_extent_size && 3348 entry->max_extent_size < cont1_bytes) 3349 return -ENOSPC; 3350 again: 3351 found_bits = 0; 3352 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3353 next_zero = find_next_zero_bit(entry->bitmap, 3354 BITS_PER_BITMAP, i); 3355 if (next_zero - i >= min_bits) { 3356 found_bits = next_zero - i; 3357 if (found_bits > max_bits) 3358 max_bits = found_bits; 3359 break; 3360 } 3361 if (next_zero - i > max_bits) 3362 max_bits = next_zero - i; 3363 i = next_zero; 3364 } 3365 3366 if (!found_bits) { 3367 entry->max_extent_size = (u64)max_bits * ctl->unit; 3368 return -ENOSPC; 3369 } 3370 3371 if (!total_found) { 3372 start = i; 3373 cluster->max_size = 0; 3374 } 3375 3376 total_found += found_bits; 3377 3378 if (cluster->max_size < found_bits * ctl->unit) 3379 cluster->max_size = found_bits * ctl->unit; 3380 3381 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3382 i = next_zero + 1; 3383 goto again; 3384 } 3385 3386 cluster->window_start = start * ctl->unit + entry->offset; 3387 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3388 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3389 3390 /* 3391 * We need to know if we're currently on the normal space index when we 3392 * manipulate the bitmap so that we know we need to remove and re-insert 3393 * it into the space_index tree. 
Clear the bytes_index node here so the 3394 * bitmap manipulation helpers know not to mess with the space_index 3395 * until this bitmap entry is added back into the normal cache. 3396 */ 3397 RB_CLEAR_NODE(&entry->bytes_index); 3398 3399 ret = tree_insert_offset(ctl, cluster, entry); 3400 ASSERT(!ret); /* -EEXIST; Logic error */ 3401 3402 trace_btrfs_setup_cluster(block_group, cluster, 3403 total_found * ctl->unit, 1); 3404 return 0; 3405 } 3406 3407 /* 3408 * This searches the block group for just extents to fill the cluster with. 3409 * Try to find a cluster with at least bytes total bytes, at least one 3410 * extent of cont1_bytes, and other clusters of at least min_bytes. 3411 */ 3412 static noinline int 3413 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3414 struct btrfs_free_cluster *cluster, 3415 struct list_head *bitmaps, u64 offset, u64 bytes, 3416 u64 cont1_bytes, u64 min_bytes) 3417 { 3418 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3419 struct btrfs_free_space *first = NULL; 3420 struct btrfs_free_space *entry = NULL; 3421 struct btrfs_free_space *last; 3422 struct rb_node *node; 3423 u64 window_free; 3424 u64 max_extent; 3425 u64 total_size = 0; 3426 3427 lockdep_assert_held(&ctl->tree_lock); 3428 3429 entry = tree_search_offset(ctl, offset, 0, 1); 3430 if (!entry) 3431 return -ENOSPC; 3432 3433 /* 3434 * We don't want bitmaps, so just move along until we find a normal 3435 * extent entry. 3436 */ 3437 while (entry->bitmap || entry->bytes < min_bytes) { 3438 if (entry->bitmap && list_empty(&entry->list)) 3439 list_add_tail(&entry->list, bitmaps); 3440 node = rb_next(&entry->offset_index); 3441 if (!node) 3442 return -ENOSPC; 3443 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3444 } 3445 3446 window_free = entry->bytes; 3447 max_extent = entry->bytes; 3448 first = entry; 3449 last = entry; 3450 3451 for (node = rb_next(&entry->offset_index); node; 3452 node = rb_next(&entry->offset_index)) { 3453 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3454 3455 if (entry->bitmap) { 3456 if (list_empty(&entry->list)) 3457 list_add_tail(&entry->list, bitmaps); 3458 continue; 3459 } 3460 3461 if (entry->bytes < min_bytes) 3462 continue; 3463 3464 last = entry; 3465 window_free += entry->bytes; 3466 if (entry->bytes > max_extent) 3467 max_extent = entry->bytes; 3468 } 3469 3470 if (window_free < bytes || max_extent < cont1_bytes) 3471 return -ENOSPC; 3472 3473 cluster->window_start = first->offset; 3474 3475 node = &first->offset_index; 3476 3477 /* 3478 * now we've found our entries, pull them out of the free space 3479 * cache and put them into the cluster rbtree 3480 */ 3481 do { 3482 int ret; 3483 3484 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3485 node = rb_next(&entry->offset_index); 3486 if (entry->bitmap || entry->bytes < min_bytes) 3487 continue; 3488 3489 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3490 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3491 ret = tree_insert_offset(ctl, cluster, entry); 3492 total_size += entry->bytes; 3493 ASSERT(!ret); /* -EEXIST; Logic error */ 3494 } while (node && entry != last); 3495 3496 cluster->max_size = max_extent; 3497 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3498 return 0; 3499 } 3500 3501 /* 3502 * This specifically looks for bitmaps that may work in the cluster, we assume 3503 * that we have already failed to find extents that will work. 
3504 */ 3505 static noinline int 3506 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3507 struct btrfs_free_cluster *cluster, 3508 struct list_head *bitmaps, u64 offset, u64 bytes, 3509 u64 cont1_bytes, u64 min_bytes) 3510 { 3511 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3512 struct btrfs_free_space *entry = NULL; 3513 int ret = -ENOSPC; 3514 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3515 3516 if (ctl->total_bitmaps == 0) 3517 return -ENOSPC; 3518 3519 /* 3520 * The bitmap that covers offset won't be in the list unless offset 3521 * is just its start offset. 3522 */ 3523 if (!list_empty(bitmaps)) 3524 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3525 3526 if (!entry || entry->offset != bitmap_offset) { 3527 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3528 if (entry && list_empty(&entry->list)) 3529 list_add(&entry->list, bitmaps); 3530 } 3531 3532 list_for_each_entry(entry, bitmaps, list) { 3533 if (entry->bytes < bytes) 3534 continue; 3535 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3536 bytes, cont1_bytes, min_bytes); 3537 if (!ret) 3538 return 0; 3539 } 3540 3541 /* 3542 * The bitmaps list has all the bitmaps that record free space 3543 * starting after offset, so no more search is required. 3544 */ 3545 return -ENOSPC; 3546 } 3547 3548 /* 3549 * here we try to find a cluster of blocks in a block group. The goal 3550 * is to find at least bytes+empty_size. 3551 * We might not find them all in one contiguous area. 3552 * 3553 * returns zero and sets up cluster if things worked out, otherwise 3554 * it returns -enospc 3555 */ 3556 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3557 struct btrfs_free_cluster *cluster, 3558 u64 offset, u64 bytes, u64 empty_size) 3559 { 3560 struct btrfs_fs_info *fs_info = block_group->fs_info; 3561 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3562 struct btrfs_free_space *entry, *tmp; 3563 LIST_HEAD(bitmaps); 3564 u64 min_bytes; 3565 u64 cont1_bytes; 3566 int ret; 3567 3568 /* 3569 * Choose the minimum extent size we'll require for this 3570 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3571 * For metadata, allow allocates with smaller extents. For 3572 * data, keep it dense. 3573 */ 3574 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3575 cont1_bytes = bytes + empty_size; 3576 min_bytes = cont1_bytes; 3577 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3578 cont1_bytes = bytes; 3579 min_bytes = fs_info->sectorsize; 3580 } else { 3581 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3582 min_bytes = fs_info->sectorsize; 3583 } 3584 3585 spin_lock(&ctl->tree_lock); 3586 3587 /* 3588 * If we know we don't have enough space to make a cluster don't even 3589 * bother doing all the work to try and find one. 
3590 */ 3591 if (ctl->free_space < bytes) { 3592 spin_unlock(&ctl->tree_lock); 3593 return -ENOSPC; 3594 } 3595 3596 spin_lock(&cluster->lock); 3597 3598 /* someone already found a cluster, hooray */ 3599 if (cluster->block_group) { 3600 ret = 0; 3601 goto out; 3602 } 3603 3604 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3605 min_bytes); 3606 3607 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3608 bytes + empty_size, 3609 cont1_bytes, min_bytes); 3610 if (ret) 3611 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3612 offset, bytes + empty_size, 3613 cont1_bytes, min_bytes); 3614 3615 /* Clear our temporary list */ 3616 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3617 list_del_init(&entry->list); 3618 3619 if (!ret) { 3620 btrfs_get_block_group(block_group); 3621 list_add_tail(&cluster->block_group_list, 3622 &block_group->cluster_list); 3623 cluster->block_group = block_group; 3624 } else { 3625 trace_btrfs_failed_cluster_setup(block_group); 3626 } 3627 out: 3628 spin_unlock(&cluster->lock); 3629 spin_unlock(&ctl->tree_lock); 3630 3631 return ret; 3632 } 3633 3634 /* 3635 * simple code to zero out a cluster 3636 */ 3637 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3638 { 3639 spin_lock_init(&cluster->lock); 3640 spin_lock_init(&cluster->refill_lock); 3641 cluster->root = RB_ROOT; 3642 cluster->max_size = 0; 3643 cluster->fragmented = false; 3644 INIT_LIST_HEAD(&cluster->block_group_list); 3645 cluster->block_group = NULL; 3646 } 3647 3648 static int do_trimming(struct btrfs_block_group *block_group, 3649 u64 *total_trimmed, u64 start, u64 bytes, 3650 u64 reserved_start, u64 reserved_bytes, 3651 enum btrfs_trim_state reserved_trim_state, 3652 struct btrfs_trim_range *trim_entry) 3653 { 3654 struct btrfs_space_info *space_info = block_group->space_info; 3655 struct btrfs_fs_info *fs_info = block_group->fs_info; 3656 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3657 int ret; 3658 int update = 0; 3659 const u64 end = start + bytes; 3660 const u64 reserved_end = reserved_start + reserved_bytes; 3661 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3662 u64 trimmed = 0; 3663 3664 spin_lock(&space_info->lock); 3665 spin_lock(&block_group->lock); 3666 if (!block_group->ro) { 3667 block_group->reserved += reserved_bytes; 3668 space_info->bytes_reserved += reserved_bytes; 3669 update = 1; 3670 } 3671 spin_unlock(&block_group->lock); 3672 spin_unlock(&space_info->lock); 3673 3674 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3675 if (!ret) { 3676 *total_trimmed += trimmed; 3677 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3678 } 3679 3680 mutex_lock(&ctl->cache_writeout_mutex); 3681 if (reserved_start < start) 3682 __btrfs_add_free_space(block_group, reserved_start, 3683 start - reserved_start, 3684 reserved_trim_state); 3685 if (end < reserved_end) 3686 __btrfs_add_free_space(block_group, end, reserved_end - end, 3687 reserved_trim_state); 3688 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3689 list_del(&trim_entry->list); 3690 mutex_unlock(&ctl->cache_writeout_mutex); 3691 3692 if (update) { 3693 spin_lock(&space_info->lock); 3694 spin_lock(&block_group->lock); 3695 if (block_group->ro) 3696 space_info->bytes_readonly += reserved_bytes; 3697 block_group->reserved -= reserved_bytes; 3698 space_info->bytes_reserved -= reserved_bytes; 3699 spin_unlock(&block_group->lock); 3700 spin_unlock(&space_info->lock); 3701 } 3702 3703 return ret; 3704 } 3705 3706 /* 
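 * Trim the regular (non-bitmap) free space extents that intersect the given
 * range.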
3707 * If @async is set, then we will trim 1 region and return. 3708 */ 3709 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3710 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3711 bool async) 3712 { 3713 struct btrfs_discard_ctl *discard_ctl = 3714 &block_group->fs_info->discard_ctl; 3715 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3716 struct btrfs_free_space *entry; 3717 struct rb_node *node; 3718 int ret = 0; 3719 u64 extent_start; 3720 u64 extent_bytes; 3721 enum btrfs_trim_state extent_trim_state; 3722 u64 bytes; 3723 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3724 3725 while (start < end) { 3726 struct btrfs_trim_range trim_entry; 3727 3728 mutex_lock(&ctl->cache_writeout_mutex); 3729 spin_lock(&ctl->tree_lock); 3730 3731 if (ctl->free_space < minlen) 3732 goto out_unlock; 3733 3734 entry = tree_search_offset(ctl, start, 0, 1); 3735 if (!entry) 3736 goto out_unlock; 3737 3738 /* Skip bitmaps and if async, already trimmed entries */ 3739 while (entry->bitmap || 3740 (async && btrfs_free_space_trimmed(entry))) { 3741 node = rb_next(&entry->offset_index); 3742 if (!node) 3743 goto out_unlock; 3744 entry = rb_entry(node, struct btrfs_free_space, 3745 offset_index); 3746 } 3747 3748 if (entry->offset >= end) 3749 goto out_unlock; 3750 3751 extent_start = entry->offset; 3752 extent_bytes = entry->bytes; 3753 extent_trim_state = entry->trim_state; 3754 if (async) { 3755 start = entry->offset; 3756 bytes = entry->bytes; 3757 if (bytes < minlen) { 3758 spin_unlock(&ctl->tree_lock); 3759 mutex_unlock(&ctl->cache_writeout_mutex); 3760 goto next; 3761 } 3762 unlink_free_space(ctl, entry, true); 3763 /* 3764 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3765 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3766 * X when we come back around. So trim it now. 
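 * (In other words: if the extent is only slightly longer than
 * max_discard_size, splitting it would strand a tail too small for the async
 * filter, so the whole extent is discarded in one go; otherwise only
 * max_discard_size is trimmed now and the tail is relinked for a later pass.)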
3767 */ 3768 if (max_discard_size && 3769 bytes >= (max_discard_size + 3770 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3771 bytes = max_discard_size; 3772 extent_bytes = max_discard_size; 3773 entry->offset += max_discard_size; 3774 entry->bytes -= max_discard_size; 3775 link_free_space(ctl, entry); 3776 } else { 3777 kmem_cache_free(btrfs_free_space_cachep, entry); 3778 } 3779 } else { 3780 start = max(start, extent_start); 3781 bytes = min(extent_start + extent_bytes, end) - start; 3782 if (bytes < minlen) { 3783 spin_unlock(&ctl->tree_lock); 3784 mutex_unlock(&ctl->cache_writeout_mutex); 3785 goto next; 3786 } 3787 3788 unlink_free_space(ctl, entry, true); 3789 kmem_cache_free(btrfs_free_space_cachep, entry); 3790 } 3791 3792 spin_unlock(&ctl->tree_lock); 3793 trim_entry.start = extent_start; 3794 trim_entry.bytes = extent_bytes; 3795 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3796 mutex_unlock(&ctl->cache_writeout_mutex); 3797 3798 ret = do_trimming(block_group, total_trimmed, start, bytes, 3799 extent_start, extent_bytes, extent_trim_state, 3800 &trim_entry); 3801 if (ret) { 3802 block_group->discard_cursor = start + bytes; 3803 break; 3804 } 3805 next: 3806 start += bytes; 3807 block_group->discard_cursor = start; 3808 if (async && *total_trimmed) 3809 break; 3810 3811 if (fatal_signal_pending(current)) { 3812 ret = -ERESTARTSYS; 3813 break; 3814 } 3815 3816 cond_resched(); 3817 } 3818 3819 return ret; 3820 3821 out_unlock: 3822 block_group->discard_cursor = btrfs_block_group_end(block_group); 3823 spin_unlock(&ctl->tree_lock); 3824 mutex_unlock(&ctl->cache_writeout_mutex); 3825 3826 return ret; 3827 } 3828 3829 /* 3830 * If we break out of trimming a bitmap prematurely, we should reset the 3831 * trimming bit. In a rather contrieved case, it's possible to race here so 3832 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3833 * 3834 * start = start of bitmap 3835 * end = near end of bitmap 3836 * 3837 * Thread 1: Thread 2: 3838 * trim_bitmaps(start) 3839 * trim_bitmaps(end) 3840 * end_trimming_bitmap() 3841 * reset_trimming_bitmap() 3842 */ 3843 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3844 { 3845 struct btrfs_free_space *entry; 3846 3847 spin_lock(&ctl->tree_lock); 3848 entry = tree_search_offset(ctl, offset, 1, 0); 3849 if (entry) { 3850 if (btrfs_free_space_trimmed(entry)) { 3851 ctl->discardable_extents[BTRFS_STAT_CURR] += 3852 entry->bitmap_extents; 3853 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3854 } 3855 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3856 } 3857 3858 spin_unlock(&ctl->tree_lock); 3859 } 3860 3861 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3862 struct btrfs_free_space *entry) 3863 { 3864 if (btrfs_free_space_trimming_bitmap(entry)) { 3865 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3866 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3867 entry->bitmap_extents; 3868 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3869 } 3870 } 3871 3872 /* 3873 * If @async is set, then we will trim 1 region and return. 
3874 */ 3875 static int trim_bitmaps(struct btrfs_block_group *block_group, 3876 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3877 u64 maxlen, bool async) 3878 { 3879 struct btrfs_discard_ctl *discard_ctl = 3880 &block_group->fs_info->discard_ctl; 3881 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3882 struct btrfs_free_space *entry; 3883 int ret = 0; 3884 int ret2; 3885 u64 bytes; 3886 u64 offset = offset_to_bitmap(ctl, start); 3887 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3888 3889 while (offset < end) { 3890 bool next_bitmap = false; 3891 struct btrfs_trim_range trim_entry; 3892 3893 mutex_lock(&ctl->cache_writeout_mutex); 3894 spin_lock(&ctl->tree_lock); 3895 3896 if (ctl->free_space < minlen) { 3897 block_group->discard_cursor = 3898 btrfs_block_group_end(block_group); 3899 spin_unlock(&ctl->tree_lock); 3900 mutex_unlock(&ctl->cache_writeout_mutex); 3901 break; 3902 } 3903 3904 entry = tree_search_offset(ctl, offset, 1, 0); 3905 /* 3906 * Bitmaps are marked trimmed lossily now to prevent constant 3907 * discarding of the same bitmap (the reason why we are bound 3908 * by the filters). So, retrim the block group bitmaps when we 3909 * are preparing to punt to the unused_bgs list. This uses 3910 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED, 3911 * which is the only discard index that sets minlen to 0. 3912 */ 3913 if (!entry || (async && minlen && start == offset && 3914 btrfs_free_space_trimmed(entry))) { 3915 spin_unlock(&ctl->tree_lock); 3916 mutex_unlock(&ctl->cache_writeout_mutex); 3917 next_bitmap = true; 3918 goto next; 3919 } 3920 3921 /* 3922 * Async discard bitmap trimming begins by setting the start 3923 * to be key.objectid, and offset_to_bitmap() aligns to the 3924 * start of the bitmap. This lets us know we are fully 3925 * scanning the bitmap rather than only some portion of it. 3926 */ 3927 if (start == offset) 3928 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3929 3930 bytes = minlen; 3931 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3932 if (ret2 || start >= end) { 3933 /* 3934 * We lossily consider a bitmap trimmed if we only skip 3935 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3936 */ 3937 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3938 end_trimming_bitmap(ctl, entry); 3939 else 3940 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3941 spin_unlock(&ctl->tree_lock); 3942 mutex_unlock(&ctl->cache_writeout_mutex); 3943 next_bitmap = true; 3944 goto next; 3945 } 3946 3947 /* 3948 * We already trimmed a region, but are using the locking above 3949 * to reset the trim_state. 3950 */ 3951 if (async && *total_trimmed) { 3952 spin_unlock(&ctl->tree_lock); 3953 mutex_unlock(&ctl->cache_writeout_mutex); 3954 goto out; 3955 } 3956 3957 bytes = min(bytes, end - start); 3958 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3959 spin_unlock(&ctl->tree_lock); 3960 mutex_unlock(&ctl->cache_writeout_mutex); 3961 goto next; 3962 } 3963 3964 /* 3965 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3966 * If X < @minlen, we won't trim X when we come back around. 3967 * So trim it now. We differ here from trimming extents as we 3968 * don't keep individual state per bit.
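 * (Unlike trim_no_bitmap(), there is nothing to re-link here: any bits we
 * do not clear simply stay set in the bitmap, and a leftover of at least
 * @minlen will be found again on a later pass.)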
3969 */ 3970 if (async && 3971 max_discard_size && 3972 bytes > (max_discard_size + minlen)) 3973 bytes = max_discard_size; 3974 3975 bitmap_clear_bits(ctl, entry, start, bytes, true); 3976 if (entry->bytes == 0) 3977 free_bitmap(ctl, entry); 3978 3979 spin_unlock(&ctl->tree_lock); 3980 trim_entry.start = start; 3981 trim_entry.bytes = bytes; 3982 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3983 mutex_unlock(&ctl->cache_writeout_mutex); 3984 3985 ret = do_trimming(block_group, total_trimmed, start, bytes, 3986 start, bytes, 0, &trim_entry); 3987 if (ret) { 3988 reset_trimming_bitmap(ctl, offset); 3989 block_group->discard_cursor = 3990 btrfs_block_group_end(block_group); 3991 break; 3992 } 3993 next: 3994 if (next_bitmap) { 3995 offset += BITS_PER_BITMAP * ctl->unit; 3996 start = offset; 3997 } else { 3998 start += bytes; 3999 } 4000 block_group->discard_cursor = start; 4001 4002 if (fatal_signal_pending(current)) { 4003 if (start != offset) 4004 reset_trimming_bitmap(ctl, offset); 4005 ret = -ERESTARTSYS; 4006 break; 4007 } 4008 4009 cond_resched(); 4010 } 4011 4012 if (offset >= end) 4013 block_group->discard_cursor = end; 4014 4015 out: 4016 return ret; 4017 } 4018 4019 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 4020 u64 *trimmed, u64 start, u64 end, u64 minlen) 4021 { 4022 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4023 int ret; 4024 u64 rem = 0; 4025 4026 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 4027 4028 *trimmed = 0; 4029 4030 spin_lock(&block_group->lock); 4031 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4032 spin_unlock(&block_group->lock); 4033 return 0; 4034 } 4035 btrfs_freeze_block_group(block_group); 4036 spin_unlock(&block_group->lock); 4037 4038 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 4039 if (ret) 4040 goto out; 4041 4042 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 4043 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 4044 /* If we ended in the middle of a bitmap, reset the trimming flag */ 4045 if (rem) 4046 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 4047 out: 4048 btrfs_unfreeze_block_group(block_group); 4049 return ret; 4050 } 4051 4052 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 4053 u64 *trimmed, u64 start, u64 end, u64 minlen, 4054 bool async) 4055 { 4056 int ret; 4057 4058 *trimmed = 0; 4059 4060 spin_lock(&block_group->lock); 4061 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4062 spin_unlock(&block_group->lock); 4063 return 0; 4064 } 4065 btrfs_freeze_block_group(block_group); 4066 spin_unlock(&block_group->lock); 4067 4068 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 4069 btrfs_unfreeze_block_group(block_group); 4070 4071 return ret; 4072 } 4073 4074 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 4075 u64 *trimmed, u64 start, u64 end, u64 minlen, 4076 u64 maxlen, bool async) 4077 { 4078 int ret; 4079 4080 *trimmed = 0; 4081 4082 spin_lock(&block_group->lock); 4083 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4084 spin_unlock(&block_group->lock); 4085 return 0; 4086 } 4087 btrfs_freeze_block_group(block_group); 4088 spin_unlock(&block_group->lock); 4089 4090 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 4091 async); 4092 4093 btrfs_unfreeze_block_group(block_group); 4094 4095 return ret; 4096 } 4097 4098 bool 
btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 4099 { 4100 return btrfs_super_cache_generation(fs_info->super_copy); 4101 } 4102 4103 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 4104 struct btrfs_trans_handle *trans) 4105 { 4106 struct btrfs_block_group *block_group; 4107 struct rb_node *node; 4108 int ret = 0; 4109 4110 btrfs_info(fs_info, "cleaning free space cache v1"); 4111 4112 node = rb_first_cached(&fs_info->block_group_cache_tree); 4113 while (node) { 4114 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4115 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4116 if (ret) 4117 goto out; 4118 node = rb_next(node); 4119 } 4120 out: 4121 return ret; 4122 } 4123 4124 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 4125 { 4126 struct btrfs_trans_handle *trans; 4127 int ret; 4128 4129 /* 4130 * update_super_roots will appropriately set or unset 4131 * super_copy->cache_generation based on SPACE_CACHE and 4132 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 4133 * transaction commit whether we are enabling space cache v1 and don't 4134 * have any other work to do, or are disabling it and removing free 4135 * space inodes. 4136 */ 4137 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4138 if (IS_ERR(trans)) 4139 return PTR_ERR(trans); 4140 4141 if (!active) { 4142 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4143 ret = cleanup_free_space_cache_v1(fs_info, trans); 4144 if (ret) { 4145 btrfs_abort_transaction(trans, ret); 4146 btrfs_end_transaction(trans); 4147 goto out; 4148 } 4149 } 4150 4151 ret = btrfs_commit_transaction(trans); 4152 out: 4153 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4154 4155 return ret; 4156 } 4157 4158 int __init btrfs_free_space_init(void) 4159 { 4160 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", 4161 sizeof(struct btrfs_free_space), 0, 4162 SLAB_MEM_SPREAD, NULL); 4163 if (!btrfs_free_space_cachep) 4164 return -ENOMEM; 4165 4166 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 4167 PAGE_SIZE, PAGE_SIZE, 4168 SLAB_MEM_SPREAD, NULL); 4169 if (!btrfs_free_space_bitmap_cachep) { 4170 kmem_cache_destroy(btrfs_free_space_cachep); 4171 return -ENOMEM; 4172 } 4173 4174 return 0; 4175 } 4176 4177 void __cold btrfs_free_space_exit(void) 4178 { 4179 kmem_cache_destroy(btrfs_free_space_cachep); 4180 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 4181 } 4182 4183 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4184 /* 4185 * Use this if you need to make a bitmap or extent entry specifically, it 4186 * doesn't do any of the merging that add_free_space does, this acts a lot like 4187 * how the free space cache loading stuff works, so you can get really weird 4188 * configurations. 
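 * For the bitmap case the requested range may not fit in one bitmap, so we
 * loop back to the "again:" label and spill the remainder into the next
 * bitmap until all of @bytes has been added.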
4189 */ 4190 int test_add_free_space_entry(struct btrfs_block_group *cache, 4191 u64 offset, u64 bytes, bool bitmap) 4192 { 4193 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4194 struct btrfs_free_space *info = NULL, *bitmap_info; 4195 void *map = NULL; 4196 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 4197 u64 bytes_added; 4198 int ret; 4199 4200 again: 4201 if (!info) { 4202 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 4203 if (!info) 4204 return -ENOMEM; 4205 } 4206 4207 if (!bitmap) { 4208 spin_lock(&ctl->tree_lock); 4209 info->offset = offset; 4210 info->bytes = bytes; 4211 info->max_extent_size = 0; 4212 ret = link_free_space(ctl, info); 4213 spin_unlock(&ctl->tree_lock); 4214 if (ret) 4215 kmem_cache_free(btrfs_free_space_cachep, info); 4216 return ret; 4217 } 4218 4219 if (!map) { 4220 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 4221 if (!map) { 4222 kmem_cache_free(btrfs_free_space_cachep, info); 4223 return -ENOMEM; 4224 } 4225 } 4226 4227 spin_lock(&ctl->tree_lock); 4228 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4229 1, 0); 4230 if (!bitmap_info) { 4231 info->bitmap = map; 4232 map = NULL; 4233 add_new_bitmap(ctl, info, offset); 4234 bitmap_info = info; 4235 info = NULL; 4236 } 4237 4238 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 4239 trim_state); 4240 4241 bytes -= bytes_added; 4242 offset += bytes_added; 4243 spin_unlock(&ctl->tree_lock); 4244 4245 if (bytes) 4246 goto again; 4247 4248 if (info) 4249 kmem_cache_free(btrfs_free_space_cachep, info); 4250 if (map) 4251 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 4252 return 0; 4253 } 4254 4255 /* 4256 * Checks to see if the given range is in the free space cache. This is really 4257 * just used to check the absence of space, so if there is free space in the 4258 * range at all we will return 1. 
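 * If the lookup lands on a bitmap whose set bits do not overlap the range,
 * the neighbouring entries (via rb_prev()/rb_next()) are checked as well,
 * since part of the range may be covered by an adjacent entry.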
4259 */ 4260 int test_check_exists(struct btrfs_block_group *cache, 4261 u64 offset, u64 bytes) 4262 { 4263 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4264 struct btrfs_free_space *info; 4265 int ret = 0; 4266 4267 spin_lock(&ctl->tree_lock); 4268 info = tree_search_offset(ctl, offset, 0, 0); 4269 if (!info) { 4270 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4271 1, 0); 4272 if (!info) 4273 goto out; 4274 } 4275 4276 have_info: 4277 if (info->bitmap) { 4278 u64 bit_off, bit_bytes; 4279 struct rb_node *n; 4280 struct btrfs_free_space *tmp; 4281 4282 bit_off = offset; 4283 bit_bytes = ctl->unit; 4284 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 4285 if (!ret) { 4286 if (bit_off == offset) { 4287 ret = 1; 4288 goto out; 4289 } else if (bit_off > offset && 4290 offset + bytes > bit_off) { 4291 ret = 1; 4292 goto out; 4293 } 4294 } 4295 4296 n = rb_prev(&info->offset_index); 4297 while (n) { 4298 tmp = rb_entry(n, struct btrfs_free_space, 4299 offset_index); 4300 if (tmp->offset + tmp->bytes < offset) 4301 break; 4302 if (offset + bytes < tmp->offset) { 4303 n = rb_prev(&tmp->offset_index); 4304 continue; 4305 } 4306 info = tmp; 4307 goto have_info; 4308 } 4309 4310 n = rb_next(&info->offset_index); 4311 while (n) { 4312 tmp = rb_entry(n, struct btrfs_free_space, 4313 offset_index); 4314 if (offset + bytes < tmp->offset) 4315 break; 4316 if (tmp->offset + tmp->bytes < offset) { 4317 n = rb_next(&tmp->offset_index); 4318 continue; 4319 } 4320 info = tmp; 4321 goto have_info; 4322 } 4323 4324 ret = 0; 4325 goto out; 4326 } 4327 4328 if (info->offset == offset) { 4329 ret = 1; 4330 goto out; 4331 } 4332 4333 if (offset > info->offset && offset < info->offset + info->bytes) 4334 ret = 1; 4335 out: 4336 spin_unlock(&ctl->tree_lock); 4337 return ret; 4338 } 4339 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4340