// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
				      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
		block_group->inode = igrab(inode);
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	return 0;
}
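
/*
 * Create the free space cache inode for a block group: allocate a new
 * objectid in the tree root and insert the inode item and the free space
 * header keyed by the block group's start offset.
 */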
int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(BTRFS_I(inode));
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(BTRFS_I(inode));

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.inode = BTRFS_I(vfs_inode),
		.new_size = 0,
		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * now that we've truncated the cache away, it's no longer
		 * setup or written
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, &control);

	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_page_clear_checked(io_ctl->fs_info,
						 io_ctl->pages[i],
						 page_offset(io_ctl->pages[i]),
						 PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_read_folio(NULL, page_folio(page));
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache"); 484 io_ctl_drop_pages(io_ctl); 485 return -EIO; 486 } 487 } 488 } 489 490 for (i = 0; i < io_ctl->num_pages; i++) 491 clear_page_dirty_for_io(io_ctl->pages[i]); 492 493 return 0; 494 } 495 496 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 497 { 498 io_ctl_map_page(io_ctl, 1); 499 500 /* 501 * Skip the csum areas. If we don't check crcs then we just have a 502 * 64bit chunk at the front of the first page. 503 */ 504 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 505 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 506 507 put_unaligned_le64(generation, io_ctl->cur); 508 io_ctl->cur += sizeof(u64); 509 } 510 511 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 512 { 513 u64 cache_gen; 514 515 /* 516 * Skip the crc area. If we don't check crcs then we just have a 64bit 517 * chunk at the front of the first page. 518 */ 519 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 520 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 521 522 cache_gen = get_unaligned_le64(io_ctl->cur); 523 if (cache_gen != generation) { 524 btrfs_err_rl(io_ctl->fs_info, 525 "space cache generation (%llu) does not match inode (%llu)", 526 cache_gen, generation); 527 io_ctl_unmap_page(io_ctl); 528 return -EIO; 529 } 530 io_ctl->cur += sizeof(u64); 531 return 0; 532 } 533 534 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 535 { 536 u32 *tmp; 537 u32 crc = ~(u32)0; 538 unsigned offset = 0; 539 540 if (index == 0) 541 offset = sizeof(u32) * io_ctl->num_pages; 542 543 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 544 btrfs_crc32c_final(crc, (u8 *)&crc); 545 io_ctl_unmap_page(io_ctl); 546 tmp = page_address(io_ctl->pages[0]); 547 tmp += index; 548 *tmp = crc; 549 } 550 551 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 552 { 553 u32 *tmp, val; 554 u32 crc = ~(u32)0; 555 unsigned offset = 0; 556 557 if (index == 0) 558 offset = sizeof(u32) * io_ctl->num_pages; 559 560 tmp = page_address(io_ctl->pages[0]); 561 tmp += index; 562 val = *tmp; 563 564 io_ctl_map_page(io_ctl, 0); 565 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 566 btrfs_crc32c_final(crc, (u8 *)&crc); 567 if (val != crc) { 568 btrfs_err_rl(io_ctl->fs_info, 569 "csum mismatch on free space cache"); 570 io_ctl_unmap_page(io_ctl); 571 return -EIO; 572 } 573 574 return 0; 575 } 576 577 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 578 void *bitmap) 579 { 580 struct btrfs_free_space_entry *entry; 581 582 if (!io_ctl->cur) 583 return -ENOSPC; 584 585 entry = io_ctl->cur; 586 put_unaligned_le64(offset, &entry->offset); 587 put_unaligned_le64(bytes, &entry->bytes); 588 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 589 BTRFS_FREE_SPACE_EXTENT; 590 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 591 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 592 593 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 594 return 0; 595 596 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 597 598 /* No more pages to map */ 599 if (io_ctl->index >= io_ctl->num_pages) 600 return 0; 601 602 /* map the next page */ 603 io_ctl_map_page(io_ctl, 1); 604 return 0; 605 } 606 607 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 608 { 609 if (!io_ctl->cur) 610 return -ENOSPC; 611 612 /* 613 * If we aren't at the start of the current page, unmap this one and 614 * map the next one if there is any left. 
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->block_group;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	if (ctl->total_bitmaps > max_bitmaps)
		btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
			  block_group->start, block_group->length,
			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
			  bytes_per_bg);
	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			if (ret) {
				spin_unlock(&ctl->tree_lock);
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * We add the bitmaps at the end of the entries in order that
	 * the bitmap entries are added to the cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);

	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache(ctl);
	spin_unlock(&ctl->tree_lock);
	goto out;
}

static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			const u64 offset = info->offset;
			const u64 bytes = info->bytes;

			unlink_free_space(ctl, info, true);
			spin_unlock(&ctl->tree_lock);
			kmem_cache_free(btrfs_free_space_cachep, info);
			ret = btrfs_add_free_space(block_group, offset, bytes);
			spin_lock(&ctl->tree_lock);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			ret = search_bitmap(ctl, info, &offset, &bytes, false);
			if (ret == 0) {
				bitmap_clear_bits(ctl, info, offset, bytes, true);
				spin_unlock(&ctl->tree_lock);
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				spin_lock(&ctl->tree_lock);
			} else {
				free_bitmap(ctl, info);
				ret = 0;
			}
		}
		cond_resched_lock(&ctl->tree_lock);
	}
	return ret;
}

static struct lock_class_key btrfs_free_space_inode_key;

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
	 * a block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree.
	 * If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	/*
	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
	 * free space inodes to prevent false positives related to locks for normal
	 * inodes.
	 */
	lockdep_set_class(&(&inode->i_data)->invalidate_lock,
			  &btrfs_free_space_inode_key);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		spin_lock(&tmp_ctl.tree_lock);
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		/*
		 * We need to call the _locked variant so we don't try to update
		 * the discard counters.
		 */
		spin_lock(&tmp_ctl.tree_lock);
		__btrfs_remove_free_space_cache(&tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			       struct btrfs_free_space_ctl *ctl,
			       struct btrfs_block_group *block_group,
			       int *entries, int *bitmaps,
			       struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC,
					 NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		if (!find_first_extent_bit(unpin, start,
					   &extent_start, &extent_end,
					   EXTENT_DIRTY, NULL))
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list,
				 list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      cached_state);
}

static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
				    "failed to write free space cache for block group %llu error %d",
				    block_group->start, ret);
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO. Otherwise our
		 * cache state won't be right, and we won't get written again
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;

}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/*
 * Write out cached info to an inode.
 *
 * @root:        root the inode belongs to
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount. This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		    &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned;
	 * they will be added into the free space cache after the transaction
	 * is committed, so we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it. No locking needed
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/*
	 * Everything is written out, now we dirty the pages in the file.
	 */
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
				&cached_state, false);
	if (ret)
		goto out_nospc;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      &cached_state);

	/*
	 * At this point the pages are under IO and we're happy. The caller is
	 * responsible for waiting on them and updating the cache and the inode.
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
				      block_group, &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
			    "failed to write free space cache for block group %llu error %d",
			    block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_cluster *cluster,
			      struct btrfs_free_space *new_entry)
{
	struct rb_root *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	if (cluster) {
		lockdep_assert_held(&cluster->lock);
		root = &cluster->root;
	} else {
		root = &ctl->free_space_offset;
	}

	p = &root->rb_node;

	while (*p) {
		struct btrfs_free_space *info;

		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (new_entry->offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (new_entry->offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap. So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically. If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (new_entry->bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(&new_entry->offset_index, parent, p);
	rb_insert_color(&new_entry->offset_index, root);

	return 0;
}

/*
 * This is a little subtle. We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0. In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes. So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have. For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not. If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set. We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry = NULL, *prev = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	/* find entry that is closest to the 'offset' */
	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;

		entry = NULL;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share the same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = rb_prev(&entry->offset_index);
			if (n) {
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			ASSERT(entry->offset <= offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}
	}
	return entry;
}

static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	lockdep_assert_held(&ctl->tree_lock);

	rb_erase(&info->offset_index, &ctl->free_space_offset);
	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	ctl->free_extents--;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}

	if (update_stat)
		ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	lockdep_assert_held(&ctl->tree_lock);

	ASSERT(info->bytes || info->bitmap);
	ret = tree_insert_offset(ctl, NULL, info);
	if (ret)
		return ret;

	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *info)
{
	ASSERT(info->bitmap);

	/*
	 * If our entry is empty it's because we're on a cluster and we don't
	 * want to re-link it into our ctl bytes index.
	 */
	if (RB_EMPTY_NODE(&info->bytes_index))
		return;

	lockdep_assert_held(&ctl->tree_lock);

	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
}

static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     u64 offset, u64 bytes, bool update_stat)
{
	unsigned long start, count, end;
	int extent_delta = -1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}

	if (update_stat)
		ctl->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = 1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;
	info->bytes += bytes;
	ctl->free_space += bytes;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
}

/*
 * If we cannot find a suitable extent, we will use bytes to record
 * the size of the max extent.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
1966 */ 1967 if (for_alloc && 1968 bitmap_info->max_extent_size && 1969 bitmap_info->max_extent_size < *bytes) { 1970 *bytes = bitmap_info->max_extent_size; 1971 return -1; 1972 } 1973 1974 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1975 max_t(u64, *offset, bitmap_info->offset)); 1976 bits = bytes_to_bits(*bytes, ctl->unit); 1977 1978 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1979 if (for_alloc && bits == 1) { 1980 found_bits = 1; 1981 break; 1982 } 1983 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1984 BITS_PER_BITMAP, i); 1985 extent_bits = next_zero - i; 1986 if (extent_bits >= bits) { 1987 found_bits = extent_bits; 1988 break; 1989 } else if (extent_bits > max_bits) { 1990 max_bits = extent_bits; 1991 } 1992 i = next_zero; 1993 } 1994 1995 if (found_bits) { 1996 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1997 *bytes = (u64)(found_bits) * ctl->unit; 1998 return 0; 1999 } 2000 2001 *bytes = (u64)(max_bits) * ctl->unit; 2002 bitmap_info->max_extent_size = *bytes; 2003 relink_bitmap_entry(ctl, bitmap_info); 2004 return -1; 2005 } 2006 2007 /* Cache the size of the max extent in bytes */ 2008 static struct btrfs_free_space * 2009 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2010 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2011 { 2012 struct btrfs_free_space *entry; 2013 struct rb_node *node; 2014 u64 tmp; 2015 u64 align_off; 2016 int ret; 2017 2018 if (!ctl->free_space_offset.rb_node) 2019 goto out; 2020 again: 2021 if (use_bytes_index) { 2022 node = rb_first_cached(&ctl->free_space_bytes); 2023 } else { 2024 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2025 0, 1); 2026 if (!entry) 2027 goto out; 2028 node = &entry->offset_index; 2029 } 2030 2031 for (; node; node = rb_next(node)) { 2032 if (use_bytes_index) 2033 entry = rb_entry(node, struct btrfs_free_space, 2034 bytes_index); 2035 else 2036 entry = rb_entry(node, struct btrfs_free_space, 2037 offset_index); 2038 2039 /* 2040 * If we are using the bytes index then all subsequent entries 2041 * in this tree are going to be < bytes, so simply set the max 2042 * extent size and exit the loop. 2043 * 2044 * If we're using the offset index then we need to keep going 2045 * through the rest of the tree. 2046 */ 2047 if (entry->bytes < *bytes) { 2048 *max_extent_size = max(get_max_extent_size(entry), 2049 *max_extent_size); 2050 if (use_bytes_index) 2051 break; 2052 continue; 2053 } 2054 2055 /* make sure the space returned is big enough 2056 * to match our requested alignment 2057 */ 2058 if (*bytes >= align) { 2059 tmp = entry->offset - ctl->start + align - 1; 2060 tmp = div64_u64(tmp, align); 2061 tmp = tmp * align + ctl->start; 2062 align_off = tmp - entry->offset; 2063 } else { 2064 align_off = 0; 2065 tmp = entry->offset; 2066 } 2067 2068 /* 2069 * We don't break here if we're using the bytes index because we 2070 * may have another entry that has the correct alignment that is 2071 * the right size, so we don't want to miss that possibility. 2072 * At worst this adds another loop through the logic, but if we 2073 * broke here we could prematurely ENOSPC. 
2074 */ 2075 if (entry->bytes < *bytes + align_off) { 2076 *max_extent_size = max(get_max_extent_size(entry), 2077 *max_extent_size); 2078 continue; 2079 } 2080 2081 if (entry->bitmap) { 2082 struct rb_node *old_next = rb_next(node); 2083 u64 size = *bytes; 2084 2085 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2086 if (!ret) { 2087 *offset = tmp; 2088 *bytes = size; 2089 return entry; 2090 } else { 2091 *max_extent_size = 2092 max(get_max_extent_size(entry), 2093 *max_extent_size); 2094 } 2095 2096 /* 2097 * The bitmap may have gotten re-arranged in the space 2098 * index here because the max_extent_size may have been 2099 * updated. Start from the beginning again if this 2100 * happened. 2101 */ 2102 if (use_bytes_index && old_next != rb_next(node)) 2103 goto again; 2104 continue; 2105 } 2106 2107 *offset = tmp; 2108 *bytes = entry->bytes - align_off; 2109 return entry; 2110 } 2111 out: 2112 return NULL; 2113 } 2114 2115 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2116 struct btrfs_free_space *info, u64 offset) 2117 { 2118 info->offset = offset_to_bitmap(ctl, offset); 2119 info->bytes = 0; 2120 info->bitmap_extents = 0; 2121 INIT_LIST_HEAD(&info->list); 2122 link_free_space(ctl, info); 2123 ctl->total_bitmaps++; 2124 recalculate_thresholds(ctl); 2125 } 2126 2127 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2128 struct btrfs_free_space *bitmap_info) 2129 { 2130 /* 2131 * Normally when this is called, the bitmap is completely empty. However, 2132 * if we are blowing up the free space cache for one reason or another 2133 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2134 * we may leave stats on the table. 2135 */ 2136 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2137 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2138 bitmap_info->bitmap_extents; 2139 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2140 2141 } 2142 unlink_free_space(ctl, bitmap_info, true); 2143 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2144 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2145 ctl->total_bitmaps--; 2146 recalculate_thresholds(ctl); 2147 } 2148 2149 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2150 struct btrfs_free_space *bitmap_info, 2151 u64 *offset, u64 *bytes) 2152 { 2153 u64 end; 2154 u64 search_start, search_bytes; 2155 int ret; 2156 2157 again: 2158 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2159 2160 /* 2161 * We need to search for bits in this bitmap. We could only cover some 2162 * of the extent in this bitmap thanks to how we add space, so we need 2163 * to search for as much of it as we can and clear that amount, and then 2164 * go searching for the next bit. 
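* For example, if the range being removed starts in this bitmap but extends past the end of its coverage, only the portion covered here is cleared on this pass; the code below then steps to the next entry and loops back via the again label for the rest.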
2165 */ 2166 search_start = *offset; 2167 search_bytes = ctl->unit; 2168 search_bytes = min(search_bytes, end - search_start + 1); 2169 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2170 false); 2171 if (ret < 0 || search_start != *offset) 2172 return -EINVAL; 2173 2174 /* We may have found more bits than what we need */ 2175 search_bytes = min(search_bytes, *bytes); 2176 2177 /* Cannot clear past the end of the bitmap */ 2178 search_bytes = min(search_bytes, end - search_start + 1); 2179 2180 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2181 *offset += search_bytes; 2182 *bytes -= search_bytes; 2183 2184 if (*bytes) { 2185 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2186 if (!bitmap_info->bytes) 2187 free_bitmap(ctl, bitmap_info); 2188 2189 /* 2190 * no entry after this bitmap, but we still have bytes to 2191 * remove, so something has gone wrong. 2192 */ 2193 if (!next) 2194 return -EINVAL; 2195 2196 bitmap_info = rb_entry(next, struct btrfs_free_space, 2197 offset_index); 2198 2199 /* 2200 * if the next entry isn't a bitmap we need to return to let the 2201 * extent stuff do its work. 2202 */ 2203 if (!bitmap_info->bitmap) 2204 return -EAGAIN; 2205 2206 /* 2207 * Ok the next item is a bitmap, but it may not actually hold 2208 * the information for the rest of this free space stuff, so 2209 * look for it, and if we don't find it return so we can try 2210 * everything over again. 2211 */ 2212 search_start = *offset; 2213 search_bytes = ctl->unit; 2214 ret = search_bitmap(ctl, bitmap_info, &search_start, 2215 &search_bytes, false); 2216 if (ret < 0 || search_start != *offset) 2217 return -EAGAIN; 2218 2219 goto again; 2220 } else if (!bitmap_info->bytes) 2221 free_bitmap(ctl, bitmap_info); 2222 2223 return 0; 2224 } 2225 2226 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2227 struct btrfs_free_space *info, u64 offset, 2228 u64 bytes, enum btrfs_trim_state trim_state) 2229 { 2230 u64 bytes_to_set = 0; 2231 u64 end; 2232 2233 /* 2234 * This is a tradeoff to make bitmap trim state minimal. We mark the 2235 * whole bitmap untrimmed if at any point we add untrimmed regions. 2236 */ 2237 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2238 if (btrfs_free_space_trimmed(info)) { 2239 ctl->discardable_extents[BTRFS_STAT_CURR] += 2240 info->bitmap_extents; 2241 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2242 } 2243 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2244 } 2245 2246 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2247 2248 bytes_to_set = min(end - offset, bytes); 2249 2250 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2251 2252 return bytes_to_set; 2253 2254 } 2255 2256 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2257 struct btrfs_free_space *info) 2258 { 2259 struct btrfs_block_group *block_group = ctl->block_group; 2260 struct btrfs_fs_info *fs_info = block_group->fs_info; 2261 bool forced = false; 2262 2263 #ifdef CONFIG_BTRFS_DEBUG 2264 if (btrfs_should_fragment_free_space(block_group)) 2265 forced = true; 2266 #endif 2267 2268 /* This is a way to reclaim large regions from the bitmaps. 
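* Regions at or above FORCE_EXTENT_THRESHOLD therefore always stay as extent entries, unless debug fragmentation forces bitmaps.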
*/ 2269 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2270 return false; 2271 2272 /* 2273 * If we are below the extents threshold then we can add this as an 2274 * extent, and don't have to deal with the bitmap 2275 */ 2276 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2277 /* 2278 * If this block group has some small extents we don't want to 2279 * use up all of our free slots in the cache with them, we want 2280 * to reserve them to larger extents, however if we have plenty 2281 * of cache left then go ahead and add them, no sense in adding 2282 * the overhead of a bitmap if we don't have to. 2283 */ 2284 if (info->bytes <= fs_info->sectorsize * 8) { 2285 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2286 return false; 2287 } else { 2288 return false; 2289 } 2290 } 2291 2292 /* 2293 * The original block groups from mkfs can be really small, like 8 2294 * megabytes, so don't bother with a bitmap for those entries. However 2295 * some block groups can be smaller than what a bitmap would cover but 2296 * are still large enough that they could overflow the 32k memory limit, 2297 * so still allow those block groups to have a bitmap entry (e.g. with 2298 * 4K pages and a 4K sectorsize one bitmap covers 32768 * 4K = 128M, so the check below only refuses a bitmap to block groups smaller than 64M). 2299 */ 2300 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2301 return false; 2302 2303 return true; 2304 } 2305 2306 static const struct btrfs_free_space_op free_space_op = { 2307 .use_bitmap = use_bitmap, 2308 }; 2309 2310 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2311 struct btrfs_free_space *info) 2312 { 2313 struct btrfs_free_space *bitmap_info; 2314 struct btrfs_block_group *block_group = NULL; 2315 int added = 0; 2316 u64 bytes, offset, bytes_added; 2317 enum btrfs_trim_state trim_state; 2318 int ret; 2319 2320 bytes = info->bytes; 2321 offset = info->offset; 2322 trim_state = info->trim_state; 2323 2324 if (!ctl->op->use_bitmap(ctl, info)) 2325 return 0; 2326 2327 if (ctl->op == &free_space_op) 2328 block_group = ctl->block_group; 2329 again: 2330 /* 2331 * Since we link bitmaps right into the cluster we need to see if we 2332 * have a cluster here, and if so and it has our bitmap we need to add 2333 * the free space to that bitmap. 
2334 */ 2335 if (block_group && !list_empty(&block_group->cluster_list)) { 2336 struct btrfs_free_cluster *cluster; 2337 struct rb_node *node; 2338 struct btrfs_free_space *entry; 2339 2340 cluster = list_entry(block_group->cluster_list.next, 2341 struct btrfs_free_cluster, 2342 block_group_list); 2343 spin_lock(&cluster->lock); 2344 node = rb_first(&cluster->root); 2345 if (!node) { 2346 spin_unlock(&cluster->lock); 2347 goto no_cluster_bitmap; 2348 } 2349 2350 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2351 if (!entry->bitmap) { 2352 spin_unlock(&cluster->lock); 2353 goto no_cluster_bitmap; 2354 } 2355 2356 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2357 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2358 bytes, trim_state); 2359 bytes -= bytes_added; 2360 offset += bytes_added; 2361 } 2362 spin_unlock(&cluster->lock); 2363 if (!bytes) { 2364 ret = 1; 2365 goto out; 2366 } 2367 } 2368 2369 no_cluster_bitmap: 2370 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2371 1, 0); 2372 if (!bitmap_info) { 2373 ASSERT(added == 0); 2374 goto new_bitmap; 2375 } 2376 2377 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2378 trim_state); 2379 bytes -= bytes_added; 2380 offset += bytes_added; 2381 added = 0; 2382 2383 if (!bytes) { 2384 ret = 1; 2385 goto out; 2386 } else 2387 goto again; 2388 2389 new_bitmap: 2390 if (info && info->bitmap) { 2391 add_new_bitmap(ctl, info, offset); 2392 added = 1; 2393 info = NULL; 2394 goto again; 2395 } else { 2396 spin_unlock(&ctl->tree_lock); 2397 2398 /* no pre-allocated info, allocate a new one */ 2399 if (!info) { 2400 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2401 GFP_NOFS); 2402 if (!info) { 2403 spin_lock(&ctl->tree_lock); 2404 ret = -ENOMEM; 2405 goto out; 2406 } 2407 } 2408 2409 /* allocate the bitmap */ 2410 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2411 GFP_NOFS); 2412 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2413 spin_lock(&ctl->tree_lock); 2414 if (!info->bitmap) { 2415 ret = -ENOMEM; 2416 goto out; 2417 } 2418 goto again; 2419 } 2420 2421 out: 2422 if (info) { 2423 if (info->bitmap) 2424 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2425 info->bitmap); 2426 kmem_cache_free(btrfs_free_space_cachep, info); 2427 } 2428 2429 return ret; 2430 } 2431 2432 /* 2433 * Free space merging rules: 2434 * 1) Merge trimmed areas together 2435 * 2) Let untrimmed areas coalesce with trimmed areas 2436 * 3) Always pull neighboring regions from bitmaps 2437 * 2438 * The above rules are for when we merge free space based on btrfs_trim_state. 2439 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2440 * same reason: to promote larger extent regions which makes life easier for 2441 * find_free_extent(). Rule 2 enables coalescing based on the common path 2442 * being returning free space from btrfs_finish_extent_commit(). So when free 2443 * space is trimmed, it will prevent aggregating trimmed new region and 2444 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2445 * and provide find_free_extent() with the largest extents possible hoping for 2446 * the reuse path. 
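* For example, an untrimmed extent added next to a trimmed one is merged with it (rule 2) and the merged entry stays untrimmed, while a trimmed extent added next to an untrimmed one is left as a separate entry.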
2447 */ 2448 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2449 struct btrfs_free_space *info, bool update_stat) 2450 { 2451 struct btrfs_free_space *left_info = NULL; 2452 struct btrfs_free_space *right_info; 2453 bool merged = false; 2454 u64 offset = info->offset; 2455 u64 bytes = info->bytes; 2456 const bool is_trimmed = btrfs_free_space_trimmed(info); 2457 struct rb_node *right_prev = NULL; 2458 2459 /* 2460 * first we want to see if there is free space adjacent to the range we 2461 * are adding, if there is remove that struct and add a new one to 2462 * cover the entire range 2463 */ 2464 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2465 if (right_info) 2466 right_prev = rb_prev(&right_info->offset_index); 2467 2468 if (right_prev) 2469 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2470 else if (!right_info) 2471 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2472 2473 /* See try_merge_free_space() comment. */ 2474 if (right_info && !right_info->bitmap && 2475 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2476 unlink_free_space(ctl, right_info, update_stat); 2477 info->bytes += right_info->bytes; 2478 kmem_cache_free(btrfs_free_space_cachep, right_info); 2479 merged = true; 2480 } 2481 2482 /* See try_merge_free_space() comment. */ 2483 if (left_info && !left_info->bitmap && 2484 left_info->offset + left_info->bytes == offset && 2485 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2486 unlink_free_space(ctl, left_info, update_stat); 2487 info->offset = left_info->offset; 2488 info->bytes += left_info->bytes; 2489 kmem_cache_free(btrfs_free_space_cachep, left_info); 2490 merged = true; 2491 } 2492 2493 return merged; 2494 } 2495 2496 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2497 struct btrfs_free_space *info, 2498 bool update_stat) 2499 { 2500 struct btrfs_free_space *bitmap; 2501 unsigned long i; 2502 unsigned long j; 2503 const u64 end = info->offset + info->bytes; 2504 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2505 u64 bytes; 2506 2507 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2508 if (!bitmap) 2509 return false; 2510 2511 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2512 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2513 if (j == i) 2514 return false; 2515 bytes = (j - i) * ctl->unit; 2516 info->bytes += bytes; 2517 2518 /* See try_merge_free_space() comment. */ 2519 if (!btrfs_free_space_trimmed(bitmap)) 2520 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2521 2522 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2523 2524 if (!bitmap->bytes) 2525 free_bitmap(ctl, bitmap); 2526 2527 return true; 2528 } 2529 2530 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2531 struct btrfs_free_space *info, 2532 bool update_stat) 2533 { 2534 struct btrfs_free_space *bitmap; 2535 u64 bitmap_offset; 2536 unsigned long i; 2537 unsigned long j; 2538 unsigned long prev_j; 2539 u64 bytes; 2540 2541 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2542 /* If we're on a boundary, try the previous logical bitmap. 
*/ 2543 if (bitmap_offset == info->offset) { 2544 if (info->offset == 0) 2545 return false; 2546 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2547 } 2548 2549 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2550 if (!bitmap) 2551 return false; 2552 2553 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2554 j = 0; 2555 prev_j = (unsigned long)-1; 2556 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2557 if (j > i) 2558 break; 2559 prev_j = j; 2560 } 2561 if (prev_j == i) 2562 return false; 2563 2564 if (prev_j == (unsigned long)-1) 2565 bytes = (i + 1) * ctl->unit; 2566 else 2567 bytes = (i - prev_j) * ctl->unit; 2568 2569 info->offset -= bytes; 2570 info->bytes += bytes; 2571 2572 /* See try_merge_free_space() comment. */ 2573 if (!btrfs_free_space_trimmed(bitmap)) 2574 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2575 2576 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2577 2578 if (!bitmap->bytes) 2579 free_bitmap(ctl, bitmap); 2580 2581 return true; 2582 } 2583 2584 /* 2585 * We prefer always to allocate from extent entries, both for clustered and 2586 * non-clustered allocation requests. So when attempting to add a new extent 2587 * entry, try to see if there's adjacent free space in bitmap entries, and if 2588 * there is, migrate that space from the bitmaps to the extent. 2589 * Like this we get better chances of satisfying space allocation requests 2590 * because we attempt to satisfy them based on a single cache entry, and never 2591 * on 2 or more entries - even if the entries represent a contiguous free space 2592 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2593 * ends). 2594 */ 2595 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2596 struct btrfs_free_space *info, 2597 bool update_stat) 2598 { 2599 /* 2600 * Only work with disconnected entries, as we can change their offset, 2601 * and must be extent entries. 
2602 */ 2603 ASSERT(!info->bitmap); 2604 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2605 2606 if (ctl->total_bitmaps > 0) { 2607 bool stole_end; 2608 bool stole_front = false; 2609 2610 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2611 if (ctl->total_bitmaps > 0) 2612 stole_front = steal_from_bitmap_to_front(ctl, info, 2613 update_stat); 2614 2615 if (stole_end || stole_front) 2616 try_merge_free_space(ctl, info, update_stat); 2617 } 2618 } 2619 2620 int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2621 u64 offset, u64 bytes, 2622 enum btrfs_trim_state trim_state) 2623 { 2624 struct btrfs_fs_info *fs_info = block_group->fs_info; 2625 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2626 struct btrfs_free_space *info; 2627 int ret = 0; 2628 u64 filter_bytes = bytes; 2629 2630 ASSERT(!btrfs_is_zoned(fs_info)); 2631 2632 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2633 if (!info) 2634 return -ENOMEM; 2635 2636 info->offset = offset; 2637 info->bytes = bytes; 2638 info->trim_state = trim_state; 2639 RB_CLEAR_NODE(&info->offset_index); 2640 RB_CLEAR_NODE(&info->bytes_index); 2641 2642 spin_lock(&ctl->tree_lock); 2643 2644 if (try_merge_free_space(ctl, info, true)) 2645 goto link; 2646 2647 /* 2648 * If there was no extent directly to the left or right of this new 2649 * extent then we know we're going to have to allocate a new extent, so 2650 * before we do that see if we need to drop this into a bitmap 2651 */ 2652 ret = insert_into_bitmap(ctl, info); 2653 if (ret < 0) { 2654 goto out; 2655 } else if (ret) { 2656 ret = 0; 2657 goto out; 2658 } 2659 link: 2660 /* 2661 * Only steal free space from adjacent bitmaps if we're sure we're not 2662 * going to add the new free space to existing bitmap entries - because 2663 * that would mean unnecessary work that would be reverted. Therefore 2664 * attempt to steal space from bitmaps if we're adding an extent entry. 
2665 */ 2666 steal_from_bitmap(ctl, info, true); 2667 2668 filter_bytes = max(filter_bytes, info->bytes); 2669 2670 ret = link_free_space(ctl, info); 2671 if (ret) 2672 kmem_cache_free(btrfs_free_space_cachep, info); 2673 out: 2674 btrfs_discard_update_discardable(block_group); 2675 spin_unlock(&ctl->tree_lock); 2676 2677 if (ret) { 2678 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2679 ASSERT(ret != -EEXIST); 2680 } 2681 2682 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2683 btrfs_discard_check_filter(block_group, filter_bytes); 2684 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2685 } 2686 2687 return ret; 2688 } 2689 2690 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2691 u64 bytenr, u64 size, bool used) 2692 { 2693 struct btrfs_space_info *sinfo = block_group->space_info; 2694 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2695 u64 offset = bytenr - block_group->start; 2696 u64 to_free, to_unusable; 2697 int bg_reclaim_threshold = 0; 2698 bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2699 u64 reclaimable_unusable; 2700 2701 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2702 2703 if (!initial) 2704 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2705 2706 spin_lock(&ctl->tree_lock); 2707 if (!used) 2708 to_free = size; 2709 else if (initial) 2710 to_free = block_group->zone_capacity; 2711 else if (offset >= block_group->alloc_offset) 2712 to_free = size; 2713 else if (offset + size <= block_group->alloc_offset) 2714 to_free = 0; 2715 else 2716 to_free = offset + size - block_group->alloc_offset; 2717 to_unusable = size - to_free; 2718 2719 ctl->free_space += to_free; 2720 /* 2721 * If the block group is read-only, we should account freed space into 2722 * bytes_readonly. 2723 */ 2724 if (!block_group->ro) 2725 block_group->zone_unusable += to_unusable; 2726 spin_unlock(&ctl->tree_lock); 2727 if (!used) { 2728 spin_lock(&block_group->lock); 2729 block_group->alloc_offset -= size; 2730 spin_unlock(&block_group->lock); 2731 } 2732 2733 reclaimable_unusable = block_group->zone_unusable - 2734 (block_group->length - block_group->zone_capacity); 2735 /* All the region is now unusable. 
Mark it as unused and reclaim */ 2736 if (block_group->zone_unusable == block_group->length) { 2737 btrfs_mark_bg_unused(block_group); 2738 } else if (bg_reclaim_threshold && 2739 reclaimable_unusable >= 2740 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2741 btrfs_mark_bg_to_reclaim(block_group); 2742 } 2743 2744 return 0; 2745 } 2746 2747 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2748 u64 bytenr, u64 size) 2749 { 2750 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2751 2752 if (btrfs_is_zoned(block_group->fs_info)) 2753 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2754 true); 2755 2756 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2757 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2758 2759 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2760 } 2761 2762 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2763 u64 bytenr, u64 size) 2764 { 2765 if (btrfs_is_zoned(block_group->fs_info)) 2766 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2767 false); 2768 2769 return btrfs_add_free_space(block_group, bytenr, size); 2770 } 2771 2772 /* 2773 * This is a subtle distinction because when adding free space back in general, 2774 * we want it to be added as untrimmed for async. But in the case where we add 2775 * it on loading of a block group, we want to consider it trimmed. 2776 */ 2777 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2778 u64 bytenr, u64 size) 2779 { 2780 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2781 2782 if (btrfs_is_zoned(block_group->fs_info)) 2783 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2784 true); 2785 2786 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2787 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2788 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2789 2790 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2791 } 2792 2793 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2794 u64 offset, u64 bytes) 2795 { 2796 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2797 struct btrfs_free_space *info; 2798 int ret; 2799 bool re_search = false; 2800 2801 if (btrfs_is_zoned(block_group->fs_info)) { 2802 /* 2803 * This can happen with conventional zones when replaying log. 2804 * Since the allocation info of tree-log nodes are not recorded 2805 * to the extent-tree, calculate_alloc_pointer() failed to 2806 * advance the allocation pointer after last allocated tree log 2807 * node blocks. 2808 * 2809 * This function is called from 2810 * btrfs_pin_extent_for_log_replay() when replaying the log. 2811 * Advance the pointer not to overwrite the tree-log nodes. 2812 */ 2813 if (block_group->start + block_group->alloc_offset < 2814 offset + bytes) { 2815 block_group->alloc_offset = 2816 offset + bytes - block_group->start; 2817 } 2818 return 0; 2819 } 2820 2821 spin_lock(&ctl->tree_lock); 2822 2823 again: 2824 ret = 0; 2825 if (!bytes) 2826 goto out_lock; 2827 2828 info = tree_search_offset(ctl, offset, 0, 0); 2829 if (!info) { 2830 /* 2831 * oops didn't find an extent that matched the space we wanted 2832 * to remove, look for a bitmap instead 2833 */ 2834 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2835 1, 0); 2836 if (!info) { 2837 /* 2838 * If we found a partial bit of our free space in a 2839 * bitmap but then couldn't find the other part this may 2840 * be a problem, so WARN about it. 
2841 */ 2842 WARN_ON(re_search); 2843 goto out_lock; 2844 } 2845 } 2846 2847 re_search = false; 2848 if (!info->bitmap) { 2849 unlink_free_space(ctl, info, true); 2850 if (offset == info->offset) { 2851 u64 to_free = min(bytes, info->bytes); 2852 2853 info->bytes -= to_free; 2854 info->offset += to_free; 2855 if (info->bytes) { 2856 ret = link_free_space(ctl, info); 2857 WARN_ON(ret); 2858 } else { 2859 kmem_cache_free(btrfs_free_space_cachep, info); 2860 } 2861 2862 offset += to_free; 2863 bytes -= to_free; 2864 goto again; 2865 } else { 2866 u64 old_end = info->bytes + info->offset; 2867 2868 info->bytes = offset - info->offset; 2869 ret = link_free_space(ctl, info); 2870 WARN_ON(ret); 2871 if (ret) 2872 goto out_lock; 2873 2874 /* Not enough bytes in this entry to satisfy us */ 2875 if (old_end < offset + bytes) { 2876 bytes -= old_end - offset; 2877 offset = old_end; 2878 goto again; 2879 } else if (old_end == offset + bytes) { 2880 /* all done */ 2881 goto out_lock; 2882 } 2883 spin_unlock(&ctl->tree_lock); 2884 2885 ret = __btrfs_add_free_space(block_group, 2886 offset + bytes, 2887 old_end - (offset + bytes), 2888 info->trim_state); 2889 WARN_ON(ret); 2890 goto out; 2891 } 2892 } 2893 2894 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2895 if (ret == -EAGAIN) { 2896 re_search = true; 2897 goto again; 2898 } 2899 out_lock: 2900 btrfs_discard_update_discardable(block_group); 2901 spin_unlock(&ctl->tree_lock); 2902 out: 2903 return ret; 2904 } 2905 2906 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2907 u64 bytes) 2908 { 2909 struct btrfs_fs_info *fs_info = block_group->fs_info; 2910 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2911 struct btrfs_free_space *info; 2912 struct rb_node *n; 2913 int count = 0; 2914 2915 /* 2916 * Zoned btrfs does not use free space tree and cluster. Just print 2917 * out the free space after the allocation offset. 2918 */ 2919 if (btrfs_is_zoned(fs_info)) { 2920 btrfs_info(fs_info, "free space %llu active %d", 2921 block_group->zone_capacity - block_group->alloc_offset, 2922 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2923 &block_group->runtime_flags)); 2924 return; 2925 } 2926 2927 spin_lock(&ctl->tree_lock); 2928 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2929 info = rb_entry(n, struct btrfs_free_space, offset_index); 2930 if (info->bytes >= bytes && !block_group->ro) 2931 count++; 2932 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2933 info->offset, info->bytes, 2934 (info->bitmap) ? "yes" : "no"); 2935 } 2936 spin_unlock(&ctl->tree_lock); 2937 btrfs_info(fs_info, "block group has cluster?: %s", 2938 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2939 btrfs_info(fs_info, 2940 "%d free space entries at or bigger than %llu bytes", 2941 count, bytes); 2942 } 2943 2944 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2945 struct btrfs_free_space_ctl *ctl) 2946 { 2947 struct btrfs_fs_info *fs_info = block_group->fs_info; 2948 2949 spin_lock_init(&ctl->tree_lock); 2950 ctl->unit = fs_info->sectorsize; 2951 ctl->start = block_group->start; 2952 ctl->block_group = block_group; 2953 ctl->op = &free_space_op; 2954 ctl->free_space_bytes = RB_ROOT_CACHED; 2955 INIT_LIST_HEAD(&ctl->trimming_ranges); 2956 mutex_init(&ctl->cache_writeout_mutex); 2957 2958 /* 2959 * we only want to have 32k of ram per block group for keeping 2960 * track of free space, and if we pass 1/2 of that we want to 2961 * start converting things over to using bitmaps 2962 */ 2963 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2964 } 2965 2966 /* 2967 * for a given cluster, put all of its extents back into the free 2968 * space cache. If the block group passed doesn't match the block group 2969 * pointed to by the cluster, someone else raced in and freed the 2970 * cluster already. In that case, we just return without changing anything 2971 */ 2972 static void __btrfs_return_cluster_to_free_space( 2973 struct btrfs_block_group *block_group, 2974 struct btrfs_free_cluster *cluster) 2975 { 2976 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2977 struct rb_node *node; 2978 2979 lockdep_assert_held(&ctl->tree_lock); 2980 2981 spin_lock(&cluster->lock); 2982 if (cluster->block_group != block_group) { 2983 spin_unlock(&cluster->lock); 2984 return; 2985 } 2986 2987 cluster->block_group = NULL; 2988 cluster->window_start = 0; 2989 list_del_init(&cluster->block_group_list); 2990 2991 node = rb_first(&cluster->root); 2992 while (node) { 2993 struct btrfs_free_space *entry; 2994 2995 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2996 node = rb_next(&entry->offset_index); 2997 rb_erase(&entry->offset_index, &cluster->root); 2998 RB_CLEAR_NODE(&entry->offset_index); 2999 3000 if (!entry->bitmap) { 3001 /* Merging treats extents as if they were new */ 3002 if (!btrfs_free_space_trimmed(entry)) { 3003 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3004 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3005 entry->bytes; 3006 } 3007 3008 try_merge_free_space(ctl, entry, false); 3009 steal_from_bitmap(ctl, entry, false); 3010 3011 /* As we insert directly, update these statistics */ 3012 if (!btrfs_free_space_trimmed(entry)) { 3013 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3014 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3015 entry->bytes; 3016 } 3017 } 3018 tree_insert_offset(ctl, NULL, entry); 3019 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3020 entry_less); 3021 } 3022 cluster->root = RB_ROOT; 3023 spin_unlock(&cluster->lock); 3024 btrfs_put_block_group(block_group); 3025 } 3026 3027 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3028 { 3029 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3030 struct btrfs_free_cluster *cluster; 3031 struct list_head *head; 3032 3033 spin_lock(&ctl->tree_lock); 3034 while ((head = block_group->cluster_list.next) != 3035 &block_group->cluster_list) { 3036 cluster = list_entry(head, struct btrfs_free_cluster, 3037 block_group_list); 3038 3039 WARN_ON(cluster->block_group != block_group); 3040 __btrfs_return_cluster_to_free_space(block_group, cluster); 3041 3042 cond_resched_lock(&ctl->tree_lock); 3043 } 3044 
__btrfs_remove_free_space_cache(ctl); 3045 btrfs_discard_update_discardable(block_group); 3046 spin_unlock(&ctl->tree_lock); 3047 3048 } 3049 3050 /* 3051 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3052 */ 3053 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3054 { 3055 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3056 struct btrfs_free_space *info; 3057 struct rb_node *node; 3058 bool ret = true; 3059 3060 spin_lock(&ctl->tree_lock); 3061 node = rb_first(&ctl->free_space_offset); 3062 3063 while (node) { 3064 info = rb_entry(node, struct btrfs_free_space, offset_index); 3065 3066 if (!btrfs_free_space_trimmed(info)) { 3067 ret = false; 3068 break; 3069 } 3070 3071 node = rb_next(node); 3072 } 3073 3074 spin_unlock(&ctl->tree_lock); 3075 return ret; 3076 } 3077 3078 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3079 u64 offset, u64 bytes, u64 empty_size, 3080 u64 *max_extent_size) 3081 { 3082 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3083 struct btrfs_discard_ctl *discard_ctl = 3084 &block_group->fs_info->discard_ctl; 3085 struct btrfs_free_space *entry = NULL; 3086 u64 bytes_search = bytes + empty_size; 3087 u64 ret = 0; 3088 u64 align_gap = 0; 3089 u64 align_gap_len = 0; 3090 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3091 bool use_bytes_index = (offset == block_group->start); 3092 3093 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3094 3095 spin_lock(&ctl->tree_lock); 3096 entry = find_free_space(ctl, &offset, &bytes_search, 3097 block_group->full_stripe_len, max_extent_size, 3098 use_bytes_index); 3099 if (!entry) 3100 goto out; 3101 3102 ret = offset; 3103 if (entry->bitmap) { 3104 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3105 3106 if (!btrfs_free_space_trimmed(entry)) 3107 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3108 3109 if (!entry->bytes) 3110 free_bitmap(ctl, entry); 3111 } else { 3112 unlink_free_space(ctl, entry, true); 3113 align_gap_len = offset - entry->offset; 3114 align_gap = entry->offset; 3115 align_gap_trim_state = entry->trim_state; 3116 3117 if (!btrfs_free_space_trimmed(entry)) 3118 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3119 3120 entry->offset = offset + bytes; 3121 WARN_ON(entry->bytes < bytes + align_gap_len); 3122 3123 entry->bytes -= bytes + align_gap_len; 3124 if (!entry->bytes) 3125 kmem_cache_free(btrfs_free_space_cachep, entry); 3126 else 3127 link_free_space(ctl, entry); 3128 } 3129 out: 3130 btrfs_discard_update_discardable(block_group); 3131 spin_unlock(&ctl->tree_lock); 3132 3133 if (align_gap_len) 3134 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3135 align_gap_trim_state); 3136 return ret; 3137 } 3138 3139 /* 3140 * given a cluster, put all of its extents back into the free space 3141 * cache. If a block group is passed, this function will only free 3142 * a cluster that belongs to the passed block group. 3143 * 3144 * Otherwise, it'll get a reference on the block group pointed to by the 3145 * cluster and remove the cluster from it. 
3146 */ 3147 void btrfs_return_cluster_to_free_space( 3148 struct btrfs_block_group *block_group, 3149 struct btrfs_free_cluster *cluster) 3150 { 3151 struct btrfs_free_space_ctl *ctl; 3152 3153 /* first, get a safe pointer to the block group */ 3154 spin_lock(&cluster->lock); 3155 if (!block_group) { 3156 block_group = cluster->block_group; 3157 if (!block_group) { 3158 spin_unlock(&cluster->lock); 3159 return; 3160 } 3161 } else if (cluster->block_group != block_group) { 3162 /* someone else has already freed it don't redo their work */ 3163 spin_unlock(&cluster->lock); 3164 return; 3165 } 3166 btrfs_get_block_group(block_group); 3167 spin_unlock(&cluster->lock); 3168 3169 ctl = block_group->free_space_ctl; 3170 3171 /* now return any extents the cluster had on it */ 3172 spin_lock(&ctl->tree_lock); 3173 __btrfs_return_cluster_to_free_space(block_group, cluster); 3174 spin_unlock(&ctl->tree_lock); 3175 3176 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3177 3178 /* finally drop our ref */ 3179 btrfs_put_block_group(block_group); 3180 } 3181 3182 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3183 struct btrfs_free_cluster *cluster, 3184 struct btrfs_free_space *entry, 3185 u64 bytes, u64 min_start, 3186 u64 *max_extent_size) 3187 { 3188 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3189 int err; 3190 u64 search_start = cluster->window_start; 3191 u64 search_bytes = bytes; 3192 u64 ret = 0; 3193 3194 search_start = min_start; 3195 search_bytes = bytes; 3196 3197 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3198 if (err) { 3199 *max_extent_size = max(get_max_extent_size(entry), 3200 *max_extent_size); 3201 return 0; 3202 } 3203 3204 ret = search_start; 3205 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3206 3207 return ret; 3208 } 3209 3210 /* 3211 * given a cluster, try to allocate 'bytes' from it, returns 0 3212 * if it couldn't find anything suitably large, or a logical disk offset 3213 * if things worked out 3214 */ 3215 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3216 struct btrfs_free_cluster *cluster, u64 bytes, 3217 u64 min_start, u64 *max_extent_size) 3218 { 3219 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3220 struct btrfs_discard_ctl *discard_ctl = 3221 &block_group->fs_info->discard_ctl; 3222 struct btrfs_free_space *entry = NULL; 3223 struct rb_node *node; 3224 u64 ret = 0; 3225 3226 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3227 3228 spin_lock(&cluster->lock); 3229 if (bytes > cluster->max_size) 3230 goto out; 3231 3232 if (cluster->block_group != block_group) 3233 goto out; 3234 3235 node = rb_first(&cluster->root); 3236 if (!node) 3237 goto out; 3238 3239 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3240 while (1) { 3241 if (entry->bytes < bytes) 3242 *max_extent_size = max(get_max_extent_size(entry), 3243 *max_extent_size); 3244 3245 if (entry->bytes < bytes || 3246 (!entry->bitmap && entry->offset < min_start)) { 3247 node = rb_next(&entry->offset_index); 3248 if (!node) 3249 break; 3250 entry = rb_entry(node, struct btrfs_free_space, 3251 offset_index); 3252 continue; 3253 } 3254 3255 if (entry->bitmap) { 3256 ret = btrfs_alloc_from_bitmap(block_group, 3257 cluster, entry, bytes, 3258 cluster->window_start, 3259 max_extent_size); 3260 if (ret == 0) { 3261 node = rb_next(&entry->offset_index); 3262 if (!node) 3263 break; 3264 entry = rb_entry(node, struct btrfs_free_space, 3265 offset_index); 3266 
continue; 3267 } 3268 cluster->window_start += bytes; 3269 } else { 3270 ret = entry->offset; 3271 3272 entry->offset += bytes; 3273 entry->bytes -= bytes; 3274 } 3275 3276 break; 3277 } 3278 out: 3279 spin_unlock(&cluster->lock); 3280 3281 if (!ret) 3282 return 0; 3283 3284 spin_lock(&ctl->tree_lock); 3285 3286 if (!btrfs_free_space_trimmed(entry)) 3287 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3288 3289 ctl->free_space -= bytes; 3290 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3291 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3292 3293 spin_lock(&cluster->lock); 3294 if (entry->bytes == 0) { 3295 rb_erase(&entry->offset_index, &cluster->root); 3296 ctl->free_extents--; 3297 if (entry->bitmap) { 3298 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3299 entry->bitmap); 3300 ctl->total_bitmaps--; 3301 recalculate_thresholds(ctl); 3302 } else if (!btrfs_free_space_trimmed(entry)) { 3303 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3304 } 3305 kmem_cache_free(btrfs_free_space_cachep, entry); 3306 } 3307 3308 spin_unlock(&cluster->lock); 3309 spin_unlock(&ctl->tree_lock); 3310 3311 return ret; 3312 } 3313 3314 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3315 struct btrfs_free_space *entry, 3316 struct btrfs_free_cluster *cluster, 3317 u64 offset, u64 bytes, 3318 u64 cont1_bytes, u64 min_bytes) 3319 { 3320 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3321 unsigned long next_zero; 3322 unsigned long i; 3323 unsigned long want_bits; 3324 unsigned long min_bits; 3325 unsigned long found_bits; 3326 unsigned long max_bits = 0; 3327 unsigned long start = 0; 3328 unsigned long total_found = 0; 3329 int ret; 3330 3331 lockdep_assert_held(&ctl->tree_lock); 3332 3333 i = offset_to_bit(entry->offset, ctl->unit, 3334 max_t(u64, offset, entry->offset)); 3335 want_bits = bytes_to_bits(bytes, ctl->unit); 3336 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3337 3338 /* 3339 * Don't bother looking for a cluster in this bitmap if it's heavily 3340 * fragmented. 3341 */ 3342 if (entry->max_extent_size && 3343 entry->max_extent_size < cont1_bytes) 3344 return -ENOSPC; 3345 again: 3346 found_bits = 0; 3347 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3348 next_zero = find_next_zero_bit(entry->bitmap, 3349 BITS_PER_BITMAP, i); 3350 if (next_zero - i >= min_bits) { 3351 found_bits = next_zero - i; 3352 if (found_bits > max_bits) 3353 max_bits = found_bits; 3354 break; 3355 } 3356 if (next_zero - i > max_bits) 3357 max_bits = next_zero - i; 3358 i = next_zero; 3359 } 3360 3361 if (!found_bits) { 3362 entry->max_extent_size = (u64)max_bits * ctl->unit; 3363 return -ENOSPC; 3364 } 3365 3366 if (!total_found) { 3367 start = i; 3368 cluster->max_size = 0; 3369 } 3370 3371 total_found += found_bits; 3372 3373 if (cluster->max_size < found_bits * ctl->unit) 3374 cluster->max_size = found_bits * ctl->unit; 3375 3376 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3377 i = next_zero + 1; 3378 goto again; 3379 } 3380 3381 cluster->window_start = start * ctl->unit + entry->offset; 3382 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3383 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3384 3385 /* 3386 * We need to know if we're currently on the normal space index when we 3387 * manipulate the bitmap so that we know we need to remove and re-insert 3388 * it into the space_index tree. 
Clear the bytes_index node here so the 3389 * bitmap manipulation helpers know not to mess with the space_index 3390 * until this bitmap entry is added back into the normal cache. 3391 */ 3392 RB_CLEAR_NODE(&entry->bytes_index); 3393 3394 ret = tree_insert_offset(ctl, cluster, entry); 3395 ASSERT(!ret); /* -EEXIST; Logic error */ 3396 3397 trace_btrfs_setup_cluster(block_group, cluster, 3398 total_found * ctl->unit, 1); 3399 return 0; 3400 } 3401 3402 /* 3403 * This searches the block group for just extents to fill the cluster with. 3404 * Try to find a cluster with at least bytes total bytes, at least one 3405 * extent of cont1_bytes, and other clusters of at least min_bytes. 3406 */ 3407 static noinline int 3408 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3409 struct btrfs_free_cluster *cluster, 3410 struct list_head *bitmaps, u64 offset, u64 bytes, 3411 u64 cont1_bytes, u64 min_bytes) 3412 { 3413 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3414 struct btrfs_free_space *first = NULL; 3415 struct btrfs_free_space *entry = NULL; 3416 struct btrfs_free_space *last; 3417 struct rb_node *node; 3418 u64 window_free; 3419 u64 max_extent; 3420 u64 total_size = 0; 3421 3422 lockdep_assert_held(&ctl->tree_lock); 3423 3424 entry = tree_search_offset(ctl, offset, 0, 1); 3425 if (!entry) 3426 return -ENOSPC; 3427 3428 /* 3429 * We don't want bitmaps, so just move along until we find a normal 3430 * extent entry. 3431 */ 3432 while (entry->bitmap || entry->bytes < min_bytes) { 3433 if (entry->bitmap && list_empty(&entry->list)) 3434 list_add_tail(&entry->list, bitmaps); 3435 node = rb_next(&entry->offset_index); 3436 if (!node) 3437 return -ENOSPC; 3438 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3439 } 3440 3441 window_free = entry->bytes; 3442 max_extent = entry->bytes; 3443 first = entry; 3444 last = entry; 3445 3446 for (node = rb_next(&entry->offset_index); node; 3447 node = rb_next(&entry->offset_index)) { 3448 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3449 3450 if (entry->bitmap) { 3451 if (list_empty(&entry->list)) 3452 list_add_tail(&entry->list, bitmaps); 3453 continue; 3454 } 3455 3456 if (entry->bytes < min_bytes) 3457 continue; 3458 3459 last = entry; 3460 window_free += entry->bytes; 3461 if (entry->bytes > max_extent) 3462 max_extent = entry->bytes; 3463 } 3464 3465 if (window_free < bytes || max_extent < cont1_bytes) 3466 return -ENOSPC; 3467 3468 cluster->window_start = first->offset; 3469 3470 node = &first->offset_index; 3471 3472 /* 3473 * now we've found our entries, pull them out of the free space 3474 * cache and put them into the cluster rbtree 3475 */ 3476 do { 3477 int ret; 3478 3479 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3480 node = rb_next(&entry->offset_index); 3481 if (entry->bitmap || entry->bytes < min_bytes) 3482 continue; 3483 3484 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3485 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3486 ret = tree_insert_offset(ctl, cluster, entry); 3487 total_size += entry->bytes; 3488 ASSERT(!ret); /* -EEXIST; Logic error */ 3489 } while (node && entry != last); 3490 3491 cluster->max_size = max_extent; 3492 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3493 return 0; 3494 } 3495 3496 /* 3497 * This specifically looks for bitmaps that may work in the cluster, we assume 3498 * that we have already failed to find extents that will work. 
3499 */ 3500 static noinline int 3501 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3502 struct btrfs_free_cluster *cluster, 3503 struct list_head *bitmaps, u64 offset, u64 bytes, 3504 u64 cont1_bytes, u64 min_bytes) 3505 { 3506 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3507 struct btrfs_free_space *entry = NULL; 3508 int ret = -ENOSPC; 3509 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3510 3511 if (ctl->total_bitmaps == 0) 3512 return -ENOSPC; 3513 3514 /* 3515 * The bitmap that covers offset won't be in the list unless offset 3516 * is just its start offset. 3517 */ 3518 if (!list_empty(bitmaps)) 3519 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3520 3521 if (!entry || entry->offset != bitmap_offset) { 3522 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3523 if (entry && list_empty(&entry->list)) 3524 list_add(&entry->list, bitmaps); 3525 } 3526 3527 list_for_each_entry(entry, bitmaps, list) { 3528 if (entry->bytes < bytes) 3529 continue; 3530 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3531 bytes, cont1_bytes, min_bytes); 3532 if (!ret) 3533 return 0; 3534 } 3535 3536 /* 3537 * The bitmaps list has all the bitmaps that record free space 3538 * starting after offset, so no more search is required. 3539 */ 3540 return -ENOSPC; 3541 } 3542 3543 /* 3544 * here we try to find a cluster of blocks in a block group. The goal 3545 * is to find at least bytes+empty_size. 3546 * We might not find them all in one contiguous area. 3547 * 3548 * returns zero and sets up cluster if things worked out, otherwise 3549 * it returns -enospc 3550 */ 3551 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3552 struct btrfs_free_cluster *cluster, 3553 u64 offset, u64 bytes, u64 empty_size) 3554 { 3555 struct btrfs_fs_info *fs_info = block_group->fs_info; 3556 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3557 struct btrfs_free_space *entry, *tmp; 3558 LIST_HEAD(bitmaps); 3559 u64 min_bytes; 3560 u64 cont1_bytes; 3561 int ret; 3562 3563 /* 3564 * Choose the minimum extent size we'll require for this 3565 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3566 * For metadata, allow allocates with smaller extents. For 3567 * data, keep it dense. 3568 */ 3569 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3570 cont1_bytes = bytes + empty_size; 3571 min_bytes = cont1_bytes; 3572 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3573 cont1_bytes = bytes; 3574 min_bytes = fs_info->sectorsize; 3575 } else { 3576 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3577 min_bytes = fs_info->sectorsize; 3578 } 3579 3580 spin_lock(&ctl->tree_lock); 3581 3582 /* 3583 * If we know we don't have enough space to make a cluster don't even 3584 * bother doing all the work to try and find one. 
3585 */ 3586 if (ctl->free_space < bytes) { 3587 spin_unlock(&ctl->tree_lock); 3588 return -ENOSPC; 3589 } 3590 3591 spin_lock(&cluster->lock); 3592 3593 /* someone already found a cluster, hooray */ 3594 if (cluster->block_group) { 3595 ret = 0; 3596 goto out; 3597 } 3598 3599 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3600 min_bytes); 3601 3602 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3603 bytes + empty_size, 3604 cont1_bytes, min_bytes); 3605 if (ret) 3606 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3607 offset, bytes + empty_size, 3608 cont1_bytes, min_bytes); 3609 3610 /* Clear our temporary list */ 3611 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3612 list_del_init(&entry->list); 3613 3614 if (!ret) { 3615 btrfs_get_block_group(block_group); 3616 list_add_tail(&cluster->block_group_list, 3617 &block_group->cluster_list); 3618 cluster->block_group = block_group; 3619 } else { 3620 trace_btrfs_failed_cluster_setup(block_group); 3621 } 3622 out: 3623 spin_unlock(&cluster->lock); 3624 spin_unlock(&ctl->tree_lock); 3625 3626 return ret; 3627 } 3628 3629 /* 3630 * simple code to zero out a cluster 3631 */ 3632 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3633 { 3634 spin_lock_init(&cluster->lock); 3635 spin_lock_init(&cluster->refill_lock); 3636 cluster->root = RB_ROOT; 3637 cluster->max_size = 0; 3638 cluster->fragmented = false; 3639 INIT_LIST_HEAD(&cluster->block_group_list); 3640 cluster->block_group = NULL; 3641 } 3642 3643 static int do_trimming(struct btrfs_block_group *block_group, 3644 u64 *total_trimmed, u64 start, u64 bytes, 3645 u64 reserved_start, u64 reserved_bytes, 3646 enum btrfs_trim_state reserved_trim_state, 3647 struct btrfs_trim_range *trim_entry) 3648 { 3649 struct btrfs_space_info *space_info = block_group->space_info; 3650 struct btrfs_fs_info *fs_info = block_group->fs_info; 3651 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3652 int ret; 3653 int update = 0; 3654 const u64 end = start + bytes; 3655 const u64 reserved_end = reserved_start + reserved_bytes; 3656 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3657 u64 trimmed = 0; 3658 3659 spin_lock(&space_info->lock); 3660 spin_lock(&block_group->lock); 3661 if (!block_group->ro) { 3662 block_group->reserved += reserved_bytes; 3663 space_info->bytes_reserved += reserved_bytes; 3664 update = 1; 3665 } 3666 spin_unlock(&block_group->lock); 3667 spin_unlock(&space_info->lock); 3668 3669 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3670 if (!ret) { 3671 *total_trimmed += trimmed; 3672 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3673 } 3674 3675 mutex_lock(&ctl->cache_writeout_mutex); 3676 if (reserved_start < start) 3677 __btrfs_add_free_space(block_group, reserved_start, 3678 start - reserved_start, 3679 reserved_trim_state); 3680 if (end < reserved_end) 3681 __btrfs_add_free_space(block_group, end, reserved_end - end, 3682 reserved_trim_state); 3683 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3684 list_del(&trim_entry->list); 3685 mutex_unlock(&ctl->cache_writeout_mutex); 3686 3687 if (update) { 3688 spin_lock(&space_info->lock); 3689 spin_lock(&block_group->lock); 3690 if (block_group->ro) 3691 space_info->bytes_readonly += reserved_bytes; 3692 block_group->reserved -= reserved_bytes; 3693 space_info->bytes_reserved -= reserved_bytes; 3694 spin_unlock(&block_group->lock); 3695 spin_unlock(&space_info->lock); 3696 } 3697 3698 return ret; 3699 } 3700 3701 /* 
3702 * If @async is set, then we will trim 1 region and return. 3703 */ 3704 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3705 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3706 bool async) 3707 { 3708 struct btrfs_discard_ctl *discard_ctl = 3709 &block_group->fs_info->discard_ctl; 3710 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3711 struct btrfs_free_space *entry; 3712 struct rb_node *node; 3713 int ret = 0; 3714 u64 extent_start; 3715 u64 extent_bytes; 3716 enum btrfs_trim_state extent_trim_state; 3717 u64 bytes; 3718 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3719 3720 while (start < end) { 3721 struct btrfs_trim_range trim_entry; 3722 3723 mutex_lock(&ctl->cache_writeout_mutex); 3724 spin_lock(&ctl->tree_lock); 3725 3726 if (ctl->free_space < minlen) 3727 goto out_unlock; 3728 3729 entry = tree_search_offset(ctl, start, 0, 1); 3730 if (!entry) 3731 goto out_unlock; 3732 3733 /* Skip bitmaps and if async, already trimmed entries */ 3734 while (entry->bitmap || 3735 (async && btrfs_free_space_trimmed(entry))) { 3736 node = rb_next(&entry->offset_index); 3737 if (!node) 3738 goto out_unlock; 3739 entry = rb_entry(node, struct btrfs_free_space, 3740 offset_index); 3741 } 3742 3743 if (entry->offset >= end) 3744 goto out_unlock; 3745 3746 extent_start = entry->offset; 3747 extent_bytes = entry->bytes; 3748 extent_trim_state = entry->trim_state; 3749 if (async) { 3750 start = entry->offset; 3751 bytes = entry->bytes; 3752 if (bytes < minlen) { 3753 spin_unlock(&ctl->tree_lock); 3754 mutex_unlock(&ctl->cache_writeout_mutex); 3755 goto next; 3756 } 3757 unlink_free_space(ctl, entry, true); 3758 /* 3759 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3760 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3761 * X when we come back around. So trim it now. 
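* In other words, we only split the entry when the leftover piece would be at least BTRFS_ASYNC_DISCARD_MIN_FILTER; a smaller remainder would never pass the discard filter again, so in that case the whole extent is trimmed in one go even if it slightly exceeds max_discard_size.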
*/ 3763 if (max_discard_size && 3764 bytes >= (max_discard_size + 3765 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3766 bytes = max_discard_size; 3767 extent_bytes = max_discard_size; 3768 entry->offset += max_discard_size; 3769 entry->bytes -= max_discard_size; 3770 link_free_space(ctl, entry); 3771 } else { 3772 kmem_cache_free(btrfs_free_space_cachep, entry); 3773 } 3774 } else { 3775 start = max(start, extent_start); 3776 bytes = min(extent_start + extent_bytes, end) - start; 3777 if (bytes < minlen) { 3778 spin_unlock(&ctl->tree_lock); 3779 mutex_unlock(&ctl->cache_writeout_mutex); 3780 goto next; 3781 } 3782 3783 unlink_free_space(ctl, entry, true); 3784 kmem_cache_free(btrfs_free_space_cachep, entry); 3785 } 3786 3787 spin_unlock(&ctl->tree_lock); 3788 trim_entry.start = extent_start; 3789 trim_entry.bytes = extent_bytes; 3790 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3791 mutex_unlock(&ctl->cache_writeout_mutex); 3792 3793 ret = do_trimming(block_group, total_trimmed, start, bytes, 3794 extent_start, extent_bytes, extent_trim_state, 3795 &trim_entry); 3796 if (ret) { 3797 block_group->discard_cursor = start + bytes; 3798 break; 3799 } 3800 next: 3801 start += bytes; 3802 block_group->discard_cursor = start; 3803 if (async && *total_trimmed) 3804 break; 3805 3806 if (fatal_signal_pending(current)) { 3807 ret = -ERESTARTSYS; 3808 break; 3809 } 3810 3811 cond_resched(); 3812 } 3813 3814 return ret; 3815 3816 out_unlock: 3817 block_group->discard_cursor = btrfs_block_group_end(block_group); 3818 spin_unlock(&ctl->tree_lock); 3819 mutex_unlock(&ctl->cache_writeout_mutex); 3820 3821 return ret; 3822 } 3823 3824 /* 3825 * If we break out of trimming a bitmap prematurely, we should reset the 3826 * trimming bit. In a rather contrived case, it's possible to race here so 3827 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3828 * 3829 * start = start of bitmap 3830 * end = near end of bitmap 3831 * 3832 * Thread 1: Thread 2: 3833 * trim_bitmaps(start) 3834 * trim_bitmaps(end) 3835 * end_trimming_bitmap() 3836 * reset_trimming_bitmap() 3837 */ 3838 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3839 { 3840 struct btrfs_free_space *entry; 3841 3842 spin_lock(&ctl->tree_lock); 3843 entry = tree_search_offset(ctl, offset, 1, 0); 3844 if (entry) { 3845 if (btrfs_free_space_trimmed(entry)) { 3846 ctl->discardable_extents[BTRFS_STAT_CURR] += 3847 entry->bitmap_extents; 3848 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3849 } 3850 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3851 } 3852 3853 spin_unlock(&ctl->tree_lock); 3854 } 3855 3856 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3857 struct btrfs_free_space *entry) 3858 { 3859 if (btrfs_free_space_trimming_bitmap(entry)) { 3860 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3861 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3862 entry->bitmap_extents; 3863 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3864 } 3865 } 3866 3867 /* 3868 * If @async is set, then we will trim 1 region and return. 
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters). So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list. This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start
		 * to be key.objectid and the offset_to_bitmap() aligns to the
		 * start of the bitmap. This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = max_discard_size + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now. We differ here from trimming extents as we
		 * don't keep individual state per bit.
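		 * The untrimmed remainder simply stays set in the bitmap and
		 * is picked up by the next iteration of this loop.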
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically. It
 * doesn't do any of the merging that add_free_space does; it acts a lot like
 * how the free space cache loading works, so you can get really weird
 * configurations.
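 *
 * With @bitmap set, any bytes that do not fit into the bitmap covering
 * @offset spill over into the following bitmap via the "again" loop.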
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache. This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
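 *
 * When the entry covering @offset is a bitmap, neighbouring entries in the
 * rbtree that overlap the range are checked as well.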
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */