1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include "ctree.h" 15 #include "fs.h" 16 #include "messages.h" 17 #include "misc.h" 18 #include "free-space-cache.h" 19 #include "transaction.h" 20 #include "disk-io.h" 21 #include "extent_io.h" 22 #include "volumes.h" 23 #include "space-info.h" 24 #include "delalloc-space.h" 25 #include "block-group.h" 26 #include "discard.h" 27 #include "subpage.h" 28 #include "inode-item.h" 29 #include "accessors.h" 30 #include "file-item.h" 31 #include "file.h" 32 #include "super.h" 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 36 #define FORCE_EXTENT_THRESHOLD SZ_1M 37 38 static struct kmem_cache *btrfs_free_space_cachep; 39 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 40 41 struct btrfs_trim_range { 42 u64 start; 43 u64 bytes; 44 struct list_head list; 45 }; 46 47 static int link_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info); 49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *info, bool update_stat); 51 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 52 struct btrfs_free_space *bitmap_info, u64 *offset, 53 u64 *bytes, bool for_alloc); 54 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *bitmap_info); 56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 57 struct btrfs_free_space *info, u64 offset, 58 u64 bytes, bool update_stats); 59 60 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 61 { 62 struct btrfs_free_space *info; 63 struct rb_node *node; 64 65 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 66 info = rb_entry(node, struct btrfs_free_space, offset_index); 67 if (!info->bitmap) { 68 unlink_free_space(ctl, info, true); 69 kmem_cache_free(btrfs_free_space_cachep, info); 70 } else { 71 free_bitmap(ctl, info); 72 } 73 74 cond_resched_lock(&ctl->tree_lock); 75 } 76 } 77 78 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 79 struct btrfs_path *path, 80 u64 offset) 81 { 82 struct btrfs_fs_info *fs_info = root->fs_info; 83 struct btrfs_key key; 84 struct btrfs_key location; 85 struct btrfs_disk_key disk_key; 86 struct btrfs_free_space_header *header; 87 struct extent_buffer *leaf; 88 struct inode *inode = NULL; 89 unsigned nofs_flag; 90 int ret; 91 92 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 93 key.offset = offset; 94 key.type = 0; 95 96 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 97 if (ret < 0) 98 return ERR_PTR(ret); 99 if (ret > 0) { 100 btrfs_release_path(path); 101 return ERR_PTR(-ENOENT); 102 } 103 104 leaf = path->nodes[0]; 105 header = btrfs_item_ptr(leaf, path->slots[0], 106 struct btrfs_free_space_header); 107 btrfs_free_space_key(leaf, header, &disk_key); 108 btrfs_disk_key_to_cpu(&location, &disk_key); 109 btrfs_release_path(path); 110 111 /* 112 * We are often under a trans handle at this point, so we need to make 113 * sure NOFS is set to keep us from deadlocking. 
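 * The memalloc_nofs_save()/memalloc_nofs_restore() pair below scopes
 * GFP_NOFS around btrfs_iget_path(), so any allocation done while looking
 * the inode up cannot recurse back into filesystem reclaim.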
114 */ 115 nofs_flag = memalloc_nofs_save(); 116 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 117 btrfs_release_path(path); 118 memalloc_nofs_restore(nofs_flag); 119 if (IS_ERR(inode)) 120 return inode; 121 122 mapping_set_gfp_mask(inode->i_mapping, 123 mapping_gfp_constraint(inode->i_mapping, 124 ~(__GFP_FS | __GFP_HIGHMEM))); 125 126 return inode; 127 } 128 129 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 130 struct btrfs_path *path) 131 { 132 struct btrfs_fs_info *fs_info = block_group->fs_info; 133 struct inode *inode = NULL; 134 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 135 136 spin_lock(&block_group->lock); 137 if (block_group->inode) 138 inode = igrab(block_group->inode); 139 spin_unlock(&block_group->lock); 140 if (inode) 141 return inode; 142 143 inode = __lookup_free_space_inode(fs_info->tree_root, path, 144 block_group->start); 145 if (IS_ERR(inode)) 146 return inode; 147 148 spin_lock(&block_group->lock); 149 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 150 btrfs_info(fs_info, "Old style space inode found, converting."); 151 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 152 BTRFS_INODE_NODATACOW; 153 block_group->disk_cache_state = BTRFS_DC_CLEAR; 154 } 155 156 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 157 block_group->inode = igrab(inode); 158 spin_unlock(&block_group->lock); 159 160 return inode; 161 } 162 163 static int __create_free_space_inode(struct btrfs_root *root, 164 struct btrfs_trans_handle *trans, 165 struct btrfs_path *path, 166 u64 ino, u64 offset) 167 { 168 struct btrfs_key key; 169 struct btrfs_disk_key disk_key; 170 struct btrfs_free_space_header *header; 171 struct btrfs_inode_item *inode_item; 172 struct extent_buffer *leaf; 173 /* We inline CRCs for the free disk space cache */ 174 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 175 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 176 int ret; 177 178 ret = btrfs_insert_empty_inode(trans, root, path, ino); 179 if (ret) 180 return ret; 181 182 leaf = path->nodes[0]; 183 inode_item = btrfs_item_ptr(leaf, path->slots[0], 184 struct btrfs_inode_item); 185 btrfs_item_key(leaf, &disk_key, path->slots[0]); 186 memzero_extent_buffer(leaf, (unsigned long)inode_item, 187 sizeof(*inode_item)); 188 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 189 btrfs_set_inode_size(leaf, inode_item, 0); 190 btrfs_set_inode_nbytes(leaf, inode_item, 0); 191 btrfs_set_inode_uid(leaf, inode_item, 0); 192 btrfs_set_inode_gid(leaf, inode_item, 0); 193 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 194 btrfs_set_inode_flags(leaf, inode_item, flags); 195 btrfs_set_inode_nlink(leaf, inode_item, 1); 196 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 197 btrfs_set_inode_block_group(leaf, inode_item, offset); 198 btrfs_mark_buffer_dirty(leaf); 199 btrfs_release_path(path); 200 201 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 202 key.offset = offset; 203 key.type = 0; 204 ret = btrfs_insert_empty_item(trans, root, path, &key, 205 sizeof(struct btrfs_free_space_header)); 206 if (ret < 0) { 207 btrfs_release_path(path); 208 return ret; 209 } 210 211 leaf = path->nodes[0]; 212 header = btrfs_item_ptr(leaf, path->slots[0], 213 struct btrfs_free_space_header); 214 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 215 btrfs_set_free_space_key(leaf, header, &disk_key); 216 btrfs_mark_buffer_dirty(leaf); 217 btrfs_release_path(path); 218 219 return 0; 220 } 221 222 int 
create_free_space_inode(struct btrfs_trans_handle *trans, 223 struct btrfs_block_group *block_group, 224 struct btrfs_path *path) 225 { 226 int ret; 227 u64 ino; 228 229 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 230 if (ret < 0) 231 return ret; 232 233 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 234 ino, block_group->start); 235 } 236 237 /* 238 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 239 * handles lookup, otherwise it takes ownership and iputs the inode. 240 * Don't reuse an inode pointer after passing it into this function. 241 */ 242 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 243 struct inode *inode, 244 struct btrfs_block_group *block_group) 245 { 246 struct btrfs_path *path; 247 struct btrfs_key key; 248 int ret = 0; 249 250 path = btrfs_alloc_path(); 251 if (!path) 252 return -ENOMEM; 253 254 if (!inode) 255 inode = lookup_free_space_inode(block_group, path); 256 if (IS_ERR(inode)) { 257 if (PTR_ERR(inode) != -ENOENT) 258 ret = PTR_ERR(inode); 259 goto out; 260 } 261 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 262 if (ret) { 263 btrfs_add_delayed_iput(BTRFS_I(inode)); 264 goto out; 265 } 266 clear_nlink(inode); 267 /* One for the block groups ref */ 268 spin_lock(&block_group->lock); 269 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 270 block_group->inode = NULL; 271 spin_unlock(&block_group->lock); 272 iput(inode); 273 } else { 274 spin_unlock(&block_group->lock); 275 } 276 /* One for the lookup ref */ 277 btrfs_add_delayed_iput(BTRFS_I(inode)); 278 279 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 280 key.type = 0; 281 key.offset = block_group->start; 282 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 283 -1, 1); 284 if (ret) { 285 if (ret > 0) 286 ret = 0; 287 goto out; 288 } 289 ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); 290 out: 291 btrfs_free_path(path); 292 return ret; 293 } 294 295 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, 296 struct btrfs_block_rsv *rsv) 297 { 298 u64 needed_bytes; 299 int ret; 300 301 /* 1 for slack space, 1 for updating the inode */ 302 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + 303 btrfs_calc_metadata_size(fs_info, 1); 304 305 spin_lock(&rsv->lock); 306 if (rsv->reserved < needed_bytes) 307 ret = -ENOSPC; 308 else 309 ret = 0; 310 spin_unlock(&rsv->lock); 311 return ret; 312 } 313 314 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 315 struct btrfs_block_group *block_group, 316 struct inode *vfs_inode) 317 { 318 struct btrfs_truncate_control control = { 319 .inode = BTRFS_I(vfs_inode), 320 .new_size = 0, 321 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 322 .min_type = BTRFS_EXTENT_DATA_KEY, 323 .clear_extent_range = true, 324 }; 325 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 326 struct btrfs_root *root = inode->root; 327 struct extent_state *cached_state = NULL; 328 int ret = 0; 329 bool locked = false; 330 331 if (block_group) { 332 struct btrfs_path *path = btrfs_alloc_path(); 333 334 if (!path) { 335 ret = -ENOMEM; 336 goto fail; 337 } 338 locked = true; 339 mutex_lock(&trans->transaction->cache_write_mutex); 340 if (!list_empty(&block_group->io_list)) { 341 list_del_init(&block_group->io_list); 342 343 btrfs_wait_cache_io(trans, block_group, path); 344 btrfs_put_block_group(block_group); 345 } 346 347 /* 348 * now that we've truncated the cache away, its no longer 349 * setup or written 350 */ 351 
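	/*
	 * BTRFS_DC_CLEAR tells later transaction commits that the on-disk
	 * cache for this block group is stale and has to be set up and
	 * written out again before it can be trusted.
	 */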
spin_lock(&block_group->lock); 352 block_group->disk_cache_state = BTRFS_DC_CLEAR; 353 spin_unlock(&block_group->lock); 354 btrfs_free_path(path); 355 } 356 357 btrfs_i_size_write(inode, 0); 358 truncate_pagecache(vfs_inode, 0); 359 360 lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 361 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 362 363 /* 364 * We skip the throttling logic for free space cache inodes, so we don't 365 * need to check for -EAGAIN. 366 */ 367 ret = btrfs_truncate_inode_items(trans, root, &control); 368 369 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 370 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 371 372 unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 373 if (ret) 374 goto fail; 375 376 ret = btrfs_update_inode(trans, root, inode); 377 378 fail: 379 if (locked) 380 mutex_unlock(&trans->transaction->cache_write_mutex); 381 if (ret) 382 btrfs_abort_transaction(trans, ret); 383 384 return ret; 385 } 386 387 static void readahead_cache(struct inode *inode) 388 { 389 struct file_ra_state ra; 390 unsigned long last_index; 391 392 file_ra_state_init(&ra, inode->i_mapping); 393 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 394 395 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 396 } 397 398 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 399 int write) 400 { 401 int num_pages; 402 403 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 404 405 /* Make sure we can fit our crcs and generation into the first page */ 406 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 407 return -ENOSPC; 408 409 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 410 411 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 412 if (!io_ctl->pages) 413 return -ENOMEM; 414 415 io_ctl->num_pages = num_pages; 416 io_ctl->fs_info = btrfs_sb(inode->i_sb); 417 io_ctl->inode = inode; 418 419 return 0; 420 } 421 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 422 423 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 424 { 425 kfree(io_ctl->pages); 426 io_ctl->pages = NULL; 427 } 428 429 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 430 { 431 if (io_ctl->cur) { 432 io_ctl->cur = NULL; 433 io_ctl->orig = NULL; 434 } 435 } 436 437 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 438 { 439 ASSERT(io_ctl->index < io_ctl->num_pages); 440 io_ctl->page = io_ctl->pages[io_ctl->index++]; 441 io_ctl->cur = page_address(io_ctl->page); 442 io_ctl->orig = io_ctl->cur; 443 io_ctl->size = PAGE_SIZE; 444 if (clear) 445 clear_page(io_ctl->cur); 446 } 447 448 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 449 { 450 int i; 451 452 io_ctl_unmap_page(io_ctl); 453 454 for (i = 0; i < io_ctl->num_pages; i++) { 455 if (io_ctl->pages[i]) { 456 btrfs_page_clear_checked(io_ctl->fs_info, 457 io_ctl->pages[i], 458 page_offset(io_ctl->pages[i]), 459 PAGE_SIZE); 460 unlock_page(io_ctl->pages[i]); 461 put_page(io_ctl->pages[i]); 462 } 463 } 464 } 465 466 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 467 { 468 struct page *page; 469 struct inode *inode = io_ctl->inode; 470 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 471 int i; 472 473 for (i = 0; i < io_ctl->num_pages; i++) { 474 int ret; 475 476 page = find_or_create_page(inode->i_mapping, i, mask); 477 if (!page) { 478 io_ctl_drop_pages(io_ctl); 479 return -ENOMEM; 480 } 481 482 ret = set_page_extent_mapped(page); 483 if (ret < 0) { 484 unlock_page(page); 485 
put_page(page); 486 io_ctl_drop_pages(io_ctl); 487 return ret; 488 } 489 490 io_ctl->pages[i] = page; 491 if (uptodate && !PageUptodate(page)) { 492 btrfs_read_folio(NULL, page_folio(page)); 493 lock_page(page); 494 if (page->mapping != inode->i_mapping) { 495 btrfs_err(BTRFS_I(inode)->root->fs_info, 496 "free space cache page truncated"); 497 io_ctl_drop_pages(io_ctl); 498 return -EIO; 499 } 500 if (!PageUptodate(page)) { 501 btrfs_err(BTRFS_I(inode)->root->fs_info, 502 "error reading free space cache"); 503 io_ctl_drop_pages(io_ctl); 504 return -EIO; 505 } 506 } 507 } 508 509 for (i = 0; i < io_ctl->num_pages; i++) 510 clear_page_dirty_for_io(io_ctl->pages[i]); 511 512 return 0; 513 } 514 515 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 516 { 517 io_ctl_map_page(io_ctl, 1); 518 519 /* 520 * Skip the csum areas. If we don't check crcs then we just have a 521 * 64bit chunk at the front of the first page. 522 */ 523 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 524 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 525 526 put_unaligned_le64(generation, io_ctl->cur); 527 io_ctl->cur += sizeof(u64); 528 } 529 530 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 531 { 532 u64 cache_gen; 533 534 /* 535 * Skip the crc area. If we don't check crcs then we just have a 64bit 536 * chunk at the front of the first page. 537 */ 538 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 539 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 540 541 cache_gen = get_unaligned_le64(io_ctl->cur); 542 if (cache_gen != generation) { 543 btrfs_err_rl(io_ctl->fs_info, 544 "space cache generation (%llu) does not match inode (%llu)", 545 cache_gen, generation); 546 io_ctl_unmap_page(io_ctl); 547 return -EIO; 548 } 549 io_ctl->cur += sizeof(u64); 550 return 0; 551 } 552 553 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 554 { 555 u32 *tmp; 556 u32 crc = ~(u32)0; 557 unsigned offset = 0; 558 559 if (index == 0) 560 offset = sizeof(u32) * io_ctl->num_pages; 561 562 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 563 btrfs_crc32c_final(crc, (u8 *)&crc); 564 io_ctl_unmap_page(io_ctl); 565 tmp = page_address(io_ctl->pages[0]); 566 tmp += index; 567 *tmp = crc; 568 } 569 570 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 571 { 572 u32 *tmp, val; 573 u32 crc = ~(u32)0; 574 unsigned offset = 0; 575 576 if (index == 0) 577 offset = sizeof(u32) * io_ctl->num_pages; 578 579 tmp = page_address(io_ctl->pages[0]); 580 tmp += index; 581 val = *tmp; 582 583 io_ctl_map_page(io_ctl, 0); 584 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 585 btrfs_crc32c_final(crc, (u8 *)&crc); 586 if (val != crc) { 587 btrfs_err_rl(io_ctl->fs_info, 588 "csum mismatch on free space cache"); 589 io_ctl_unmap_page(io_ctl); 590 return -EIO; 591 } 592 593 return 0; 594 } 595 596 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 597 void *bitmap) 598 { 599 struct btrfs_free_space_entry *entry; 600 601 if (!io_ctl->cur) 602 return -ENOSPC; 603 604 entry = io_ctl->cur; 605 put_unaligned_le64(offset, &entry->offset); 606 put_unaligned_le64(bytes, &entry->bytes); 607 entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : 608 BTRFS_FREE_SPACE_EXTENT; 609 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 610 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 611 612 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 613 return 0; 614 615 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 616 617 /* No more pages to map */ 618 if (io_ctl->index >= io_ctl->num_pages) 619 return 0; 620 621 /* map the next page */ 622 io_ctl_map_page(io_ctl, 1); 623 return 0; 624 } 625 626 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 627 { 628 if (!io_ctl->cur) 629 return -ENOSPC; 630 631 /* 632 * If we aren't at the start of the current page, unmap this one and 633 * map the next one if there is any left. 634 */ 635 if (io_ctl->cur != io_ctl->orig) { 636 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 637 if (io_ctl->index >= io_ctl->num_pages) 638 return -ENOSPC; 639 io_ctl_map_page(io_ctl, 0); 640 } 641 642 copy_page(io_ctl->cur, bitmap); 643 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 644 if (io_ctl->index < io_ctl->num_pages) 645 io_ctl_map_page(io_ctl, 0); 646 return 0; 647 } 648 649 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 650 { 651 /* 652 * If we're not on the boundary we know we've modified the page and we 653 * need to crc the page. 654 */ 655 if (io_ctl->cur != io_ctl->orig) 656 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 657 else 658 io_ctl_unmap_page(io_ctl); 659 660 while (io_ctl->index < io_ctl->num_pages) { 661 io_ctl_map_page(io_ctl, 1); 662 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 663 } 664 } 665 666 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 667 struct btrfs_free_space *entry, u8 *type) 668 { 669 struct btrfs_free_space_entry *e; 670 int ret; 671 672 if (!io_ctl->cur) { 673 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 674 if (ret) 675 return ret; 676 } 677 678 e = io_ctl->cur; 679 entry->offset = get_unaligned_le64(&e->offset); 680 entry->bytes = get_unaligned_le64(&e->bytes); 681 *type = e->type; 682 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 683 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 684 685 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 686 return 0; 687 688 io_ctl_unmap_page(io_ctl); 689 690 return 0; 691 } 692 693 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 694 struct btrfs_free_space *entry) 695 { 696 int ret; 697 698 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 699 if (ret) 700 return ret; 701 702 copy_page(entry->bitmap, io_ctl->cur); 703 io_ctl_unmap_page(io_ctl); 704 705 return 0; 706 } 707 708 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 709 { 710 struct btrfs_block_group *block_group = ctl->block_group; 711 u64 max_bytes; 712 u64 bitmap_bytes; 713 u64 extent_bytes; 714 u64 size = block_group->length; 715 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 716 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 717 718 max_bitmaps = max_t(u64, max_bitmaps, 1); 719 720 if (ctl->total_bitmaps > max_bitmaps) 721 btrfs_err(block_group->fs_info, 722 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 723 block_group->start, block_group->length, 724 ctl->total_bitmaps, ctl->unit, max_bitmaps, 725 bytes_per_bg); 726 ASSERT(ctl->total_bitmaps <= max_bitmaps); 727 728 /* 729 * We are trying to keep the total amount of memory used per 1GiB of 730 * space to be MAX_CACHE_BYTES_PER_GIG. 
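 * (With the defines above, MAX_CACHE_BYTES_PER_GIG is SZ_64K, i.e. 64KiB of
 * in-memory cache metadata per 1GiB of block group space.)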
However, with a reclamation 731 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 732 * bitmaps, we may end up using more memory than this. 733 */ 734 if (size < SZ_1G) 735 max_bytes = MAX_CACHE_BYTES_PER_GIG; 736 else 737 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 738 739 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 740 741 /* 742 * we want the extent entry threshold to always be at most 1/2 the max 743 * bytes we can have, or whatever is less than that. 744 */ 745 extent_bytes = max_bytes - bitmap_bytes; 746 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 747 748 ctl->extents_thresh = 749 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 750 } 751 752 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 753 struct btrfs_free_space_ctl *ctl, 754 struct btrfs_path *path, u64 offset) 755 { 756 struct btrfs_fs_info *fs_info = root->fs_info; 757 struct btrfs_free_space_header *header; 758 struct extent_buffer *leaf; 759 struct btrfs_io_ctl io_ctl; 760 struct btrfs_key key; 761 struct btrfs_free_space *e, *n; 762 LIST_HEAD(bitmaps); 763 u64 num_entries; 764 u64 num_bitmaps; 765 u64 generation; 766 u8 type; 767 int ret = 0; 768 769 /* Nothing in the space cache, goodbye */ 770 if (!i_size_read(inode)) 771 return 0; 772 773 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 774 key.offset = offset; 775 key.type = 0; 776 777 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 778 if (ret < 0) 779 return 0; 780 else if (ret > 0) { 781 btrfs_release_path(path); 782 return 0; 783 } 784 785 ret = -1; 786 787 leaf = path->nodes[0]; 788 header = btrfs_item_ptr(leaf, path->slots[0], 789 struct btrfs_free_space_header); 790 num_entries = btrfs_free_space_entries(leaf, header); 791 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 792 generation = btrfs_free_space_generation(leaf, header); 793 btrfs_release_path(path); 794 795 if (!BTRFS_I(inode)->generation) { 796 btrfs_info(fs_info, 797 "the free space cache file (%llu) is invalid, skip it", 798 offset); 799 return 0; 800 } 801 802 if (BTRFS_I(inode)->generation != generation) { 803 btrfs_err(fs_info, 804 "free space inode generation (%llu) did not match free space cache generation (%llu)", 805 BTRFS_I(inode)->generation, generation); 806 return 0; 807 } 808 809 if (!num_entries) 810 return 0; 811 812 ret = io_ctl_init(&io_ctl, inode, 0); 813 if (ret) 814 return ret; 815 816 readahead_cache(inode); 817 818 ret = io_ctl_prepare_pages(&io_ctl, true); 819 if (ret) 820 goto out; 821 822 ret = io_ctl_check_crc(&io_ctl, 0); 823 if (ret) 824 goto free_cache; 825 826 ret = io_ctl_check_generation(&io_ctl, generation); 827 if (ret) 828 goto free_cache; 829 830 while (num_entries) { 831 e = kmem_cache_zalloc(btrfs_free_space_cachep, 832 GFP_NOFS); 833 if (!e) { 834 ret = -ENOMEM; 835 goto free_cache; 836 } 837 838 ret = io_ctl_read_entry(&io_ctl, e, &type); 839 if (ret) { 840 kmem_cache_free(btrfs_free_space_cachep, e); 841 goto free_cache; 842 } 843 844 if (!e->bytes) { 845 ret = -1; 846 kmem_cache_free(btrfs_free_space_cachep, e); 847 goto free_cache; 848 } 849 850 if (type == BTRFS_FREE_SPACE_EXTENT) { 851 spin_lock(&ctl->tree_lock); 852 ret = link_free_space(ctl, e); 853 spin_unlock(&ctl->tree_lock); 854 if (ret) { 855 btrfs_err(fs_info, 856 "Duplicate entries in free space cache, dumping"); 857 kmem_cache_free(btrfs_free_space_cachep, e); 858 goto free_cache; 859 } 860 } else { 861 ASSERT(num_bitmaps); 862 num_bitmaps--; 863 e->bitmap = kmem_cache_zalloc( 864 
btrfs_free_space_bitmap_cachep, GFP_NOFS); 865 if (!e->bitmap) { 866 ret = -ENOMEM; 867 kmem_cache_free( 868 btrfs_free_space_cachep, e); 869 goto free_cache; 870 } 871 spin_lock(&ctl->tree_lock); 872 ret = link_free_space(ctl, e); 873 if (ret) { 874 spin_unlock(&ctl->tree_lock); 875 btrfs_err(fs_info, 876 "Duplicate entries in free space cache, dumping"); 877 kmem_cache_free(btrfs_free_space_cachep, e); 878 goto free_cache; 879 } 880 ctl->total_bitmaps++; 881 recalculate_thresholds(ctl); 882 spin_unlock(&ctl->tree_lock); 883 list_add_tail(&e->list, &bitmaps); 884 } 885 886 num_entries--; 887 } 888 889 io_ctl_unmap_page(&io_ctl); 890 891 /* 892 * We add the bitmaps at the end of the entries in order that 893 * the bitmap entries are added to the cache. 894 */ 895 list_for_each_entry_safe(e, n, &bitmaps, list) { 896 list_del_init(&e->list); 897 ret = io_ctl_read_bitmap(&io_ctl, e); 898 if (ret) 899 goto free_cache; 900 } 901 902 io_ctl_drop_pages(&io_ctl); 903 ret = 1; 904 out: 905 io_ctl_free(&io_ctl); 906 return ret; 907 free_cache: 908 io_ctl_drop_pages(&io_ctl); 909 910 spin_lock(&ctl->tree_lock); 911 __btrfs_remove_free_space_cache(ctl); 912 spin_unlock(&ctl->tree_lock); 913 goto out; 914 } 915 916 static int copy_free_space_cache(struct btrfs_block_group *block_group, 917 struct btrfs_free_space_ctl *ctl) 918 { 919 struct btrfs_free_space *info; 920 struct rb_node *n; 921 int ret = 0; 922 923 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 924 info = rb_entry(n, struct btrfs_free_space, offset_index); 925 if (!info->bitmap) { 926 unlink_free_space(ctl, info, true); 927 ret = btrfs_add_free_space(block_group, info->offset, 928 info->bytes); 929 kmem_cache_free(btrfs_free_space_cachep, info); 930 } else { 931 u64 offset = info->offset; 932 u64 bytes = ctl->unit; 933 934 while (search_bitmap(ctl, info, &offset, &bytes, 935 false) == 0) { 936 ret = btrfs_add_free_space(block_group, offset, 937 bytes); 938 if (ret) 939 break; 940 bitmap_clear_bits(ctl, info, offset, bytes, true); 941 offset = info->offset; 942 bytes = ctl->unit; 943 } 944 free_bitmap(ctl, info); 945 } 946 cond_resched(); 947 } 948 return ret; 949 } 950 951 static struct lock_class_key btrfs_free_space_inode_key; 952 953 int load_free_space_cache(struct btrfs_block_group *block_group) 954 { 955 struct btrfs_fs_info *fs_info = block_group->fs_info; 956 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 957 struct btrfs_free_space_ctl tmp_ctl = {}; 958 struct inode *inode; 959 struct btrfs_path *path; 960 int ret = 0; 961 bool matched; 962 u64 used = block_group->used; 963 964 /* 965 * Because we could potentially discard our loaded free space, we want 966 * to load everything into a temporary structure first, and then if it's 967 * valid copy it all into the actual free space ctl. 968 */ 969 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 970 971 /* 972 * If this block group has been marked to be cleared for one reason or 973 * another then we can't trust the on disk cache, so just return. 974 */ 975 spin_lock(&block_group->lock); 976 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 977 spin_unlock(&block_group->lock); 978 return 0; 979 } 980 spin_unlock(&block_group->lock); 981 982 path = btrfs_alloc_path(); 983 if (!path) 984 return 0; 985 path->search_commit_root = 1; 986 path->skip_locking = 1; 987 988 /* 989 * We must pass a path with search_commit_root set to btrfs_iget in 990 * order to avoid a deadlock when allocating extents for the tree root. 
991 * 992 * When we are COWing an extent buffer from the tree root, when looking 993 * for a free extent, at extent-tree.c:find_free_extent(), we can find 994 * block group without its free space cache loaded. When we find one 995 * we must load its space cache which requires reading its free space 996 * cache's inode item from the root tree. If this inode item is located 997 * in the same leaf that we started COWing before, then we end up in 998 * deadlock on the extent buffer (trying to read lock it when we 999 * previously write locked it). 1000 * 1001 * It's safe to read the inode item using the commit root because 1002 * block groups, once loaded, stay in memory forever (until they are 1003 * removed) as well as their space caches once loaded. New block groups 1004 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 1005 * we will never try to read their inode item while the fs is mounted. 1006 */ 1007 inode = lookup_free_space_inode(block_group, path); 1008 if (IS_ERR(inode)) { 1009 btrfs_free_path(path); 1010 return 0; 1011 } 1012 1013 /* We may have converted the inode and made the cache invalid. */ 1014 spin_lock(&block_group->lock); 1015 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1016 spin_unlock(&block_group->lock); 1017 btrfs_free_path(path); 1018 goto out; 1019 } 1020 spin_unlock(&block_group->lock); 1021 1022 /* 1023 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1024 * free space inodes to prevent false positives related to locks for normal 1025 * inodes. 1026 */ 1027 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1028 &btrfs_free_space_inode_key); 1029 1030 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1031 path, block_group->start); 1032 btrfs_free_path(path); 1033 if (ret <= 0) 1034 goto out; 1035 1036 matched = (tmp_ctl.free_space == (block_group->length - used - 1037 block_group->bytes_super)); 1038 1039 if (matched) { 1040 ret = copy_free_space_cache(block_group, &tmp_ctl); 1041 /* 1042 * ret == 1 means we successfully loaded the free space cache, 1043 * so we need to re-set it here. 1044 */ 1045 if (ret == 0) 1046 ret = 1; 1047 } else { 1048 /* 1049 * We need to call the _locked variant so we don't try to update 1050 * the discard counters. 
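 * (tmp_ctl is only a private, temporary structure that was never exposed
 * to the discard machinery, so there is nothing to update for it.)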
1051 */ 1052 spin_lock(&tmp_ctl.tree_lock); 1053 __btrfs_remove_free_space_cache(&tmp_ctl); 1054 spin_unlock(&tmp_ctl.tree_lock); 1055 btrfs_warn(fs_info, 1056 "block group %llu has wrong amount of free space", 1057 block_group->start); 1058 ret = -1; 1059 } 1060 out: 1061 if (ret < 0) { 1062 /* This cache is bogus, make sure it gets cleared */ 1063 spin_lock(&block_group->lock); 1064 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1065 spin_unlock(&block_group->lock); 1066 ret = 0; 1067 1068 btrfs_warn(fs_info, 1069 "failed to load free space cache for block group %llu, rebuilding it now", 1070 block_group->start); 1071 } 1072 1073 spin_lock(&ctl->tree_lock); 1074 btrfs_discard_update_discardable(block_group); 1075 spin_unlock(&ctl->tree_lock); 1076 iput(inode); 1077 return ret; 1078 } 1079 1080 static noinline_for_stack 1081 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1082 struct btrfs_free_space_ctl *ctl, 1083 struct btrfs_block_group *block_group, 1084 int *entries, int *bitmaps, 1085 struct list_head *bitmap_list) 1086 { 1087 int ret; 1088 struct btrfs_free_cluster *cluster = NULL; 1089 struct btrfs_free_cluster *cluster_locked = NULL; 1090 struct rb_node *node = rb_first(&ctl->free_space_offset); 1091 struct btrfs_trim_range *trim_entry; 1092 1093 /* Get the cluster for this block_group if it exists */ 1094 if (block_group && !list_empty(&block_group->cluster_list)) { 1095 cluster = list_entry(block_group->cluster_list.next, 1096 struct btrfs_free_cluster, 1097 block_group_list); 1098 } 1099 1100 if (!node && cluster) { 1101 cluster_locked = cluster; 1102 spin_lock(&cluster_locked->lock); 1103 node = rb_first(&cluster->root); 1104 cluster = NULL; 1105 } 1106 1107 /* Write out the extent entries */ 1108 while (node) { 1109 struct btrfs_free_space *e; 1110 1111 e = rb_entry(node, struct btrfs_free_space, offset_index); 1112 *entries += 1; 1113 1114 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1115 e->bitmap); 1116 if (ret) 1117 goto fail; 1118 1119 if (e->bitmap) { 1120 list_add_tail(&e->list, bitmap_list); 1121 *bitmaps += 1; 1122 } 1123 node = rb_next(node); 1124 if (!node && cluster) { 1125 node = rb_first(&cluster->root); 1126 cluster_locked = cluster; 1127 spin_lock(&cluster_locked->lock); 1128 cluster = NULL; 1129 } 1130 } 1131 if (cluster_locked) { 1132 spin_unlock(&cluster_locked->lock); 1133 cluster_locked = NULL; 1134 } 1135 1136 /* 1137 * Make sure we don't miss any range that was removed from our rbtree 1138 * because trimming is running. Otherwise after a umount+mount (or crash 1139 * after committing the transaction) we would leak free space and get 1140 * an inconsistent free space cache report from fsck. 
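 * Such ranges are tracked as struct btrfs_trim_range entries on
 * ctl->trimming_ranges, and each one is written out below as a plain
 * (non-bitmap) extent entry.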
1141 */ 1142 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1143 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1144 trim_entry->bytes, NULL); 1145 if (ret) 1146 goto fail; 1147 *entries += 1; 1148 } 1149 1150 return 0; 1151 fail: 1152 if (cluster_locked) 1153 spin_unlock(&cluster_locked->lock); 1154 return -ENOSPC; 1155 } 1156 1157 static noinline_for_stack int 1158 update_cache_item(struct btrfs_trans_handle *trans, 1159 struct btrfs_root *root, 1160 struct inode *inode, 1161 struct btrfs_path *path, u64 offset, 1162 int entries, int bitmaps) 1163 { 1164 struct btrfs_key key; 1165 struct btrfs_free_space_header *header; 1166 struct extent_buffer *leaf; 1167 int ret; 1168 1169 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1170 key.offset = offset; 1171 key.type = 0; 1172 1173 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1174 if (ret < 0) { 1175 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1176 EXTENT_DELALLOC, NULL); 1177 goto fail; 1178 } 1179 leaf = path->nodes[0]; 1180 if (ret > 0) { 1181 struct btrfs_key found_key; 1182 ASSERT(path->slots[0]); 1183 path->slots[0]--; 1184 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1185 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1186 found_key.offset != offset) { 1187 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1188 inode->i_size - 1, EXTENT_DELALLOC, 1189 NULL); 1190 btrfs_release_path(path); 1191 goto fail; 1192 } 1193 } 1194 1195 BTRFS_I(inode)->generation = trans->transid; 1196 header = btrfs_item_ptr(leaf, path->slots[0], 1197 struct btrfs_free_space_header); 1198 btrfs_set_free_space_entries(leaf, header, entries); 1199 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1200 btrfs_set_free_space_generation(leaf, header, trans->transid); 1201 btrfs_mark_buffer_dirty(leaf); 1202 btrfs_release_path(path); 1203 1204 return 0; 1205 1206 fail: 1207 return -1; 1208 } 1209 1210 static noinline_for_stack int write_pinned_extent_entries( 1211 struct btrfs_trans_handle *trans, 1212 struct btrfs_block_group *block_group, 1213 struct btrfs_io_ctl *io_ctl, 1214 int *entries) 1215 { 1216 u64 start, extent_start, extent_end, len; 1217 struct extent_io_tree *unpin = NULL; 1218 int ret; 1219 1220 if (!block_group) 1221 return 0; 1222 1223 /* 1224 * We want to add any pinned extents to our free space cache 1225 * so we don't leak the space 1226 * 1227 * We shouldn't have switched the pinned extents yet so this is the 1228 * right one 1229 */ 1230 unpin = &trans->transaction->pinned_extents; 1231 1232 start = block_group->start; 1233 1234 while (start < block_group->start + block_group->length) { 1235 ret = find_first_extent_bit(unpin, start, 1236 &extent_start, &extent_end, 1237 EXTENT_DIRTY, NULL); 1238 if (ret) 1239 return 0; 1240 1241 /* This pinned extent is out of our range */ 1242 if (extent_start >= block_group->start + block_group->length) 1243 return 0; 1244 1245 extent_start = max(extent_start, start); 1246 extent_end = min(block_group->start + block_group->length, 1247 extent_end + 1); 1248 len = extent_end - extent_start; 1249 1250 *entries += 1; 1251 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1252 if (ret) 1253 return -ENOSPC; 1254 1255 start = extent_end; 1256 } 1257 1258 return 0; 1259 } 1260 1261 static noinline_for_stack int 1262 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1263 { 1264 struct btrfs_free_space *entry, *next; 1265 int ret; 1266 1267 /* Write out the bitmaps */ 1268 list_for_each_entry_safe(entry, next, 
bitmap_list, list) { 1269 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1270 if (ret) 1271 return -ENOSPC; 1272 list_del_init(&entry->list); 1273 } 1274 1275 return 0; 1276 } 1277 1278 static int flush_dirty_cache(struct inode *inode) 1279 { 1280 int ret; 1281 1282 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1283 if (ret) 1284 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1285 EXTENT_DELALLOC, NULL); 1286 1287 return ret; 1288 } 1289 1290 static void noinline_for_stack 1291 cleanup_bitmap_list(struct list_head *bitmap_list) 1292 { 1293 struct btrfs_free_space *entry, *next; 1294 1295 list_for_each_entry_safe(entry, next, bitmap_list, list) 1296 list_del_init(&entry->list); 1297 } 1298 1299 static void noinline_for_stack 1300 cleanup_write_cache_enospc(struct inode *inode, 1301 struct btrfs_io_ctl *io_ctl, 1302 struct extent_state **cached_state) 1303 { 1304 io_ctl_drop_pages(io_ctl); 1305 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1306 cached_state); 1307 } 1308 1309 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1310 struct btrfs_trans_handle *trans, 1311 struct btrfs_block_group *block_group, 1312 struct btrfs_io_ctl *io_ctl, 1313 struct btrfs_path *path, u64 offset) 1314 { 1315 int ret; 1316 struct inode *inode = io_ctl->inode; 1317 1318 if (!inode) 1319 return 0; 1320 1321 /* Flush the dirty pages in the cache file. */ 1322 ret = flush_dirty_cache(inode); 1323 if (ret) 1324 goto out; 1325 1326 /* Update the cache item to tell everyone this cache file is valid. */ 1327 ret = update_cache_item(trans, root, inode, path, offset, 1328 io_ctl->entries, io_ctl->bitmaps); 1329 out: 1330 if (ret) { 1331 invalidate_inode_pages2(inode->i_mapping); 1332 BTRFS_I(inode)->generation = 0; 1333 if (block_group) 1334 btrfs_debug(root->fs_info, 1335 "failed to write free space cache for block group %llu error %d", 1336 block_group->start, ret); 1337 } 1338 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1339 1340 if (block_group) { 1341 /* the dirty list is protected by the dirty_bgs_lock */ 1342 spin_lock(&trans->transaction->dirty_bgs_lock); 1343 1344 /* the disk_cache_state is protected by the block group lock */ 1345 spin_lock(&block_group->lock); 1346 1347 /* 1348 * only mark this as written if we didn't get put back on 1349 * the dirty list while waiting for IO. Otherwise our 1350 * cache state won't be right, and we won't get written again 1351 */ 1352 if (!ret && list_empty(&block_group->dirty_list)) 1353 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1354 else if (ret) 1355 block_group->disk_cache_state = BTRFS_DC_ERROR; 1356 1357 spin_unlock(&block_group->lock); 1358 spin_unlock(&trans->transaction->dirty_bgs_lock); 1359 io_ctl->inode = NULL; 1360 iput(inode); 1361 } 1362 1363 return ret; 1364 1365 } 1366 1367 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1368 struct btrfs_block_group *block_group, 1369 struct btrfs_path *path) 1370 { 1371 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1372 block_group, &block_group->io_ctl, 1373 path, block_group->start); 1374 } 1375 1376 /* 1377 * Write out cached info to an inode. 
1378 * 1379 * @root: root the inode belongs to 1380 * @inode: freespace inode we are writing out 1381 * @ctl: free space cache we are going to write out 1382 * @block_group: block_group for this cache if it belongs to a block_group 1383 * @io_ctl: holds context for the io 1384 * @trans: the trans handle 1385 * 1386 * This function writes out a free space cache struct to disk for quick recovery 1387 * on mount. This will return 0 if it was successful in writing the cache out, 1388 * or an errno if it was not. 1389 */ 1390 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1391 struct btrfs_free_space_ctl *ctl, 1392 struct btrfs_block_group *block_group, 1393 struct btrfs_io_ctl *io_ctl, 1394 struct btrfs_trans_handle *trans) 1395 { 1396 struct extent_state *cached_state = NULL; 1397 LIST_HEAD(bitmap_list); 1398 int entries = 0; 1399 int bitmaps = 0; 1400 int ret; 1401 int must_iput = 0; 1402 1403 if (!i_size_read(inode)) 1404 return -EIO; 1405 1406 WARN_ON(io_ctl->pages); 1407 ret = io_ctl_init(io_ctl, inode, 1); 1408 if (ret) 1409 return ret; 1410 1411 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1412 down_write(&block_group->data_rwsem); 1413 spin_lock(&block_group->lock); 1414 if (block_group->delalloc_bytes) { 1415 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1416 spin_unlock(&block_group->lock); 1417 up_write(&block_group->data_rwsem); 1418 BTRFS_I(inode)->generation = 0; 1419 ret = 0; 1420 must_iput = 1; 1421 goto out; 1422 } 1423 spin_unlock(&block_group->lock); 1424 } 1425 1426 /* Lock all pages first so we can lock the extent safely. */ 1427 ret = io_ctl_prepare_pages(io_ctl, false); 1428 if (ret) 1429 goto out_unlock; 1430 1431 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1432 &cached_state); 1433 1434 io_ctl_set_generation(io_ctl, trans->transid); 1435 1436 mutex_lock(&ctl->cache_writeout_mutex); 1437 /* Write out the extent entries in the free space cache */ 1438 spin_lock(&ctl->tree_lock); 1439 ret = write_cache_extent_entries(io_ctl, ctl, 1440 block_group, &entries, &bitmaps, 1441 &bitmap_list); 1442 if (ret) 1443 goto out_nospc_locked; 1444 1445 /* 1446 * Some spaces that are freed in the current transaction are pinned, 1447 * they will be added into free space cache after the transaction is 1448 * committed, we shouldn't lose them. 1449 * 1450 * If this changes while we are working we'll get added back to 1451 * the dirty list and redo it. No locking needed 1452 */ 1453 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1454 if (ret) 1455 goto out_nospc_locked; 1456 1457 /* 1458 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1459 * locked while doing it because a concurrent trim can be manipulating 1460 * or freeing the bitmap. 1461 */ 1462 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1463 spin_unlock(&ctl->tree_lock); 1464 mutex_unlock(&ctl->cache_writeout_mutex); 1465 if (ret) 1466 goto out_nospc; 1467 1468 /* Zero out the rest of the pages just to make sure */ 1469 io_ctl_zero_remaining_pages(io_ctl); 1470 1471 /* Everything is written out, now we dirty the pages in the file. 
*/ 1472 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1473 io_ctl->num_pages, 0, i_size_read(inode), 1474 &cached_state, false); 1475 if (ret) 1476 goto out_nospc; 1477 1478 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1479 up_write(&block_group->data_rwsem); 1480 /* 1481 * Release the pages and unlock the extent, we will flush 1482 * them out later 1483 */ 1484 io_ctl_drop_pages(io_ctl); 1485 io_ctl_free(io_ctl); 1486 1487 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1488 &cached_state); 1489 1490 /* 1491 * at this point the pages are under IO and we're happy, 1492 * The caller is responsible for waiting on them and updating 1493 * the cache and the inode 1494 */ 1495 io_ctl->entries = entries; 1496 io_ctl->bitmaps = bitmaps; 1497 1498 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1499 if (ret) 1500 goto out; 1501 1502 return 0; 1503 1504 out_nospc_locked: 1505 cleanup_bitmap_list(&bitmap_list); 1506 spin_unlock(&ctl->tree_lock); 1507 mutex_unlock(&ctl->cache_writeout_mutex); 1508 1509 out_nospc: 1510 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1511 1512 out_unlock: 1513 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1514 up_write(&block_group->data_rwsem); 1515 1516 out: 1517 io_ctl->inode = NULL; 1518 io_ctl_free(io_ctl); 1519 if (ret) { 1520 invalidate_inode_pages2(inode->i_mapping); 1521 BTRFS_I(inode)->generation = 0; 1522 } 1523 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1524 if (must_iput) 1525 iput(inode); 1526 return ret; 1527 } 1528 1529 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1530 struct btrfs_block_group *block_group, 1531 struct btrfs_path *path) 1532 { 1533 struct btrfs_fs_info *fs_info = trans->fs_info; 1534 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1535 struct inode *inode; 1536 int ret = 0; 1537 1538 spin_lock(&block_group->lock); 1539 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1540 spin_unlock(&block_group->lock); 1541 return 0; 1542 } 1543 spin_unlock(&block_group->lock); 1544 1545 inode = lookup_free_space_inode(block_group, path); 1546 if (IS_ERR(inode)) 1547 return 0; 1548 1549 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1550 block_group, &block_group->io_ctl, trans); 1551 if (ret) { 1552 btrfs_debug(fs_info, 1553 "failed to write free space cache for block group %llu error %d", 1554 block_group->start, ret); 1555 spin_lock(&block_group->lock); 1556 block_group->disk_cache_state = BTRFS_DC_ERROR; 1557 spin_unlock(&block_group->lock); 1558 1559 block_group->io_ctl.inode = NULL; 1560 iput(inode); 1561 } 1562 1563 /* 1564 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1565 * to wait for IO and put the inode 1566 */ 1567 1568 return ret; 1569 } 1570 1571 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1572 u64 offset) 1573 { 1574 ASSERT(offset >= bitmap_start); 1575 offset -= bitmap_start; 1576 return (unsigned long)(div_u64(offset, unit)); 1577 } 1578 1579 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1580 { 1581 return (unsigned long)(div_u64(bytes, unit)); 1582 } 1583 1584 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1585 u64 offset) 1586 { 1587 u64 bitmap_start; 1588 u64 bytes_per_bitmap; 1589 1590 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1591 bitmap_start = offset - ctl->start; 1592 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1593 bitmap_start *= bytes_per_bitmap; 1594 bitmap_start += ctl->start; 1595 
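	/*
	 * Worked example (assuming 4KiB pages, so BITS_PER_BITMAP == 32768,
	 * and ctl->unit == 4KiB): each bitmap covers 32768 * 4KiB == 128MiB,
	 * so an offset 200MiB past ctl->start rounds down to the bitmap
	 * starting at ctl->start + 128MiB.
	 */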
1596 return bitmap_start; 1597 } 1598 1599 static int tree_insert_offset(struct rb_root *root, u64 offset, 1600 struct rb_node *node, int bitmap) 1601 { 1602 struct rb_node **p = &root->rb_node; 1603 struct rb_node *parent = NULL; 1604 struct btrfs_free_space *info; 1605 1606 while (*p) { 1607 parent = *p; 1608 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1609 1610 if (offset < info->offset) { 1611 p = &(*p)->rb_left; 1612 } else if (offset > info->offset) { 1613 p = &(*p)->rb_right; 1614 } else { 1615 /* 1616 * we could have a bitmap entry and an extent entry 1617 * share the same offset. If this is the case, we want 1618 * the extent entry to always be found first if we do a 1619 * linear search through the tree, since we want to have 1620 * the quickest allocation time, and allocating from an 1621 * extent is faster than allocating from a bitmap. So 1622 * if we're inserting a bitmap and we find an entry at 1623 * this offset, we want to go right, or after this entry 1624 * logically. If we are inserting an extent and we've 1625 * found a bitmap, we want to go left, or before 1626 * logically. 1627 */ 1628 if (bitmap) { 1629 if (info->bitmap) { 1630 WARN_ON_ONCE(1); 1631 return -EEXIST; 1632 } 1633 p = &(*p)->rb_right; 1634 } else { 1635 if (!info->bitmap) { 1636 WARN_ON_ONCE(1); 1637 return -EEXIST; 1638 } 1639 p = &(*p)->rb_left; 1640 } 1641 } 1642 } 1643 1644 rb_link_node(node, parent, p); 1645 rb_insert_color(node, root); 1646 1647 return 0; 1648 } 1649 1650 /* 1651 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1652 * searched through the bitmap and figured out the largest ->max_extent_size, 1653 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1654 * allocator the wrong thing, we want to use the actual real max_extent_size 1655 * we've found already if it's larger, or we want to use ->bytes. 1656 * 1657 * This matters because find_free_space() will skip entries who's ->bytes is 1658 * less than the required bytes. So if we didn't search down this bitmap, we 1659 * may pick some previous entry that has a smaller ->max_extent_size than we 1660 * have. For example, assume we have two entries, one that has 1661 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1662 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1663 * call into find_free_space(), and return with max_extent_size == 4K, because 1664 * that first bitmap entry had ->max_extent_size set, but the second one did 1665 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1666 * 8K contiguous range. 1667 * 1668 * Consider the other case, we have 2 8K chunks in that second entry and still 1669 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1670 * allocator comes in it'll fully search our second bitmap, and this time it'll 1671 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1672 * right allocation the next loop through. 1673 */ 1674 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1675 { 1676 if (entry->bitmap && entry->max_extent_size) 1677 return entry->max_extent_size; 1678 return entry->bytes; 1679 } 1680 1681 /* 1682 * We want the largest entry to be leftmost, so this is inverted from what you'd 1683 * normally expect. 
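 * entry_less() below returns true when the existing node's max extent size
 * is smaller than the new entry's, so rb_add_cached() keeps the largest
 * entry at rb_first_cached().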
1684 */ 1685 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1686 { 1687 const struct btrfs_free_space *entry, *exist; 1688 1689 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1690 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1691 return get_max_extent_size(exist) < get_max_extent_size(entry); 1692 } 1693 1694 /* 1695 * searches the tree for the given offset. 1696 * 1697 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1698 * want a section that has at least bytes size and comes at or after the given 1699 * offset. 1700 */ 1701 static struct btrfs_free_space * 1702 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1703 u64 offset, int bitmap_only, int fuzzy) 1704 { 1705 struct rb_node *n = ctl->free_space_offset.rb_node; 1706 struct btrfs_free_space *entry = NULL, *prev = NULL; 1707 1708 /* find entry that is closest to the 'offset' */ 1709 while (n) { 1710 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1711 prev = entry; 1712 1713 if (offset < entry->offset) 1714 n = n->rb_left; 1715 else if (offset > entry->offset) 1716 n = n->rb_right; 1717 else 1718 break; 1719 1720 entry = NULL; 1721 } 1722 1723 if (bitmap_only) { 1724 if (!entry) 1725 return NULL; 1726 if (entry->bitmap) 1727 return entry; 1728 1729 /* 1730 * bitmap entry and extent entry may share same offset, 1731 * in that case, bitmap entry comes after extent entry. 1732 */ 1733 n = rb_next(n); 1734 if (!n) 1735 return NULL; 1736 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1737 if (entry->offset != offset) 1738 return NULL; 1739 1740 WARN_ON(!entry->bitmap); 1741 return entry; 1742 } else if (entry) { 1743 if (entry->bitmap) { 1744 /* 1745 * if previous extent entry covers the offset, 1746 * we should return it instead of the bitmap entry 1747 */ 1748 n = rb_prev(&entry->offset_index); 1749 if (n) { 1750 prev = rb_entry(n, struct btrfs_free_space, 1751 offset_index); 1752 if (!prev->bitmap && 1753 prev->offset + prev->bytes > offset) 1754 entry = prev; 1755 } 1756 } 1757 return entry; 1758 } 1759 1760 if (!prev) 1761 return NULL; 1762 1763 /* find last entry before the 'offset' */ 1764 entry = prev; 1765 if (entry->offset > offset) { 1766 n = rb_prev(&entry->offset_index); 1767 if (n) { 1768 entry = rb_entry(n, struct btrfs_free_space, 1769 offset_index); 1770 ASSERT(entry->offset <= offset); 1771 } else { 1772 if (fuzzy) 1773 return entry; 1774 else 1775 return NULL; 1776 } 1777 } 1778 1779 if (entry->bitmap) { 1780 n = rb_prev(&entry->offset_index); 1781 if (n) { 1782 prev = rb_entry(n, struct btrfs_free_space, 1783 offset_index); 1784 if (!prev->bitmap && 1785 prev->offset + prev->bytes > offset) 1786 return prev; 1787 } 1788 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1789 return entry; 1790 } else if (entry->offset + entry->bytes > offset) 1791 return entry; 1792 1793 if (!fuzzy) 1794 return NULL; 1795 1796 while (1) { 1797 n = rb_next(&entry->offset_index); 1798 if (!n) 1799 return NULL; 1800 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1801 if (entry->bitmap) { 1802 if (entry->offset + BITS_PER_BITMAP * 1803 ctl->unit > offset) 1804 break; 1805 } else { 1806 if (entry->offset + entry->bytes > offset) 1807 break; 1808 } 1809 } 1810 return entry; 1811 } 1812 1813 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1814 struct btrfs_free_space *info, 1815 bool update_stat) 1816 { 1817 rb_erase(&info->offset_index, &ctl->free_space_offset); 1818 
rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1819 ctl->free_extents--; 1820 1821 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1822 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1823 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1824 } 1825 1826 if (update_stat) 1827 ctl->free_space -= info->bytes; 1828 } 1829 1830 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1831 struct btrfs_free_space *info) 1832 { 1833 int ret = 0; 1834 1835 ASSERT(info->bytes || info->bitmap); 1836 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1837 &info->offset_index, (info->bitmap != NULL)); 1838 if (ret) 1839 return ret; 1840 1841 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1842 1843 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1844 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1845 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1846 } 1847 1848 ctl->free_space += info->bytes; 1849 ctl->free_extents++; 1850 return ret; 1851 } 1852 1853 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1854 struct btrfs_free_space *info) 1855 { 1856 ASSERT(info->bitmap); 1857 1858 /* 1859 * If our entry is empty it's because we're on a cluster and we don't 1860 * want to re-link it into our ctl bytes index. 1861 */ 1862 if (RB_EMPTY_NODE(&info->bytes_index)) 1863 return; 1864 1865 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1866 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1867 } 1868 1869 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1870 struct btrfs_free_space *info, 1871 u64 offset, u64 bytes, bool update_stat) 1872 { 1873 unsigned long start, count, end; 1874 int extent_delta = -1; 1875 1876 start = offset_to_bit(info->offset, ctl->unit, offset); 1877 count = bytes_to_bits(bytes, ctl->unit); 1878 end = start + count; 1879 ASSERT(end <= BITS_PER_BITMAP); 1880 1881 bitmap_clear(info->bitmap, start, count); 1882 1883 info->bytes -= bytes; 1884 if (info->max_extent_size > ctl->unit) 1885 info->max_extent_size = 0; 1886 1887 relink_bitmap_entry(ctl, info); 1888 1889 if (start && test_bit(start - 1, info->bitmap)) 1890 extent_delta++; 1891 1892 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1893 extent_delta++; 1894 1895 info->bitmap_extents += extent_delta; 1896 if (!btrfs_free_space_trimmed(info)) { 1897 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1898 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1899 } 1900 1901 if (update_stat) 1902 ctl->free_space -= bytes; 1903 } 1904 1905 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1906 struct btrfs_free_space *info, u64 offset, 1907 u64 bytes) 1908 { 1909 unsigned long start, count, end; 1910 int extent_delta = 1; 1911 1912 start = offset_to_bit(info->offset, ctl->unit, offset); 1913 count = bytes_to_bits(bytes, ctl->unit); 1914 end = start + count; 1915 ASSERT(end <= BITS_PER_BITMAP); 1916 1917 bitmap_set(info->bitmap, start, count); 1918 1919 /* 1920 * We set some bytes, we have no idea what the max extent size is 1921 * anymore. 
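 * Invalidating the cached value is cheap and safe: the next
 * search_bitmap() call that fails to find a fit will recompute and
 * re-cache it.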
1922 */ 1923 info->max_extent_size = 0; 1924 info->bytes += bytes; 1925 ctl->free_space += bytes; 1926 1927 relink_bitmap_entry(ctl, info); 1928 1929 if (start && test_bit(start - 1, info->bitmap)) 1930 extent_delta--; 1931 1932 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1933 extent_delta--; 1934 1935 info->bitmap_extents += extent_delta; 1936 if (!btrfs_free_space_trimmed(info)) { 1937 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1938 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1939 } 1940 } 1941 1942 /* 1943 * If we can not find suitable extent, we will use bytes to record 1944 * the size of the max extent. 1945 */ 1946 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1947 struct btrfs_free_space *bitmap_info, u64 *offset, 1948 u64 *bytes, bool for_alloc) 1949 { 1950 unsigned long found_bits = 0; 1951 unsigned long max_bits = 0; 1952 unsigned long bits, i; 1953 unsigned long next_zero; 1954 unsigned long extent_bits; 1955 1956 /* 1957 * Skip searching the bitmap if we don't have a contiguous section that 1958 * is large enough for this allocation. 1959 */ 1960 if (for_alloc && 1961 bitmap_info->max_extent_size && 1962 bitmap_info->max_extent_size < *bytes) { 1963 *bytes = bitmap_info->max_extent_size; 1964 return -1; 1965 } 1966 1967 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1968 max_t(u64, *offset, bitmap_info->offset)); 1969 bits = bytes_to_bits(*bytes, ctl->unit); 1970 1971 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1972 if (for_alloc && bits == 1) { 1973 found_bits = 1; 1974 break; 1975 } 1976 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1977 BITS_PER_BITMAP, i); 1978 extent_bits = next_zero - i; 1979 if (extent_bits >= bits) { 1980 found_bits = extent_bits; 1981 break; 1982 } else if (extent_bits > max_bits) { 1983 max_bits = extent_bits; 1984 } 1985 i = next_zero; 1986 } 1987 1988 if (found_bits) { 1989 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1990 *bytes = (u64)(found_bits) * ctl->unit; 1991 return 0; 1992 } 1993 1994 *bytes = (u64)(max_bits) * ctl->unit; 1995 bitmap_info->max_extent_size = *bytes; 1996 relink_bitmap_entry(ctl, bitmap_info); 1997 return -1; 1998 } 1999 2000 /* Cache the size of the max extent in bytes */ 2001 static struct btrfs_free_space * 2002 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2003 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2004 { 2005 struct btrfs_free_space *entry; 2006 struct rb_node *node; 2007 u64 tmp; 2008 u64 align_off; 2009 int ret; 2010 2011 if (!ctl->free_space_offset.rb_node) 2012 goto out; 2013 again: 2014 if (use_bytes_index) { 2015 node = rb_first_cached(&ctl->free_space_bytes); 2016 } else { 2017 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2018 0, 1); 2019 if (!entry) 2020 goto out; 2021 node = &entry->offset_index; 2022 } 2023 2024 for (; node; node = rb_next(node)) { 2025 if (use_bytes_index) 2026 entry = rb_entry(node, struct btrfs_free_space, 2027 bytes_index); 2028 else 2029 entry = rb_entry(node, struct btrfs_free_space, 2030 offset_index); 2031 2032 /* 2033 * If we are using the bytes index then all subsequent entries 2034 * in this tree are going to be < bytes, so simply set the max 2035 * extent size and exit the loop. 2036 * 2037 * If we're using the offset index then we need to keep going 2038 * through the rest of the tree. 
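 * (The bytes index is kept ordered by get_max_extent_size(), largest
 * first, via entry_less(), while the offset index carries no size
 * ordering at all.)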
2039 */ 2040 if (entry->bytes < *bytes) { 2041 *max_extent_size = max(get_max_extent_size(entry), 2042 *max_extent_size); 2043 if (use_bytes_index) 2044 break; 2045 continue; 2046 } 2047 2048 /* make sure the space returned is big enough 2049 * to match our requested alignment 2050 */ 2051 if (*bytes >= align) { 2052 tmp = entry->offset - ctl->start + align - 1; 2053 tmp = div64_u64(tmp, align); 2054 tmp = tmp * align + ctl->start; 2055 align_off = tmp - entry->offset; 2056 } else { 2057 align_off = 0; 2058 tmp = entry->offset; 2059 } 2060 2061 /* 2062 * We don't break here if we're using the bytes index because we 2063 * may have another entry that has the correct alignment that is 2064 * the right size, so we don't want to miss that possibility. 2065 * At worst this adds another loop through the logic, but if we 2066 * broke here we could prematurely ENOSPC. 2067 */ 2068 if (entry->bytes < *bytes + align_off) { 2069 *max_extent_size = max(get_max_extent_size(entry), 2070 *max_extent_size); 2071 continue; 2072 } 2073 2074 if (entry->bitmap) { 2075 struct rb_node *old_next = rb_next(node); 2076 u64 size = *bytes; 2077 2078 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2079 if (!ret) { 2080 *offset = tmp; 2081 *bytes = size; 2082 return entry; 2083 } else { 2084 *max_extent_size = 2085 max(get_max_extent_size(entry), 2086 *max_extent_size); 2087 } 2088 2089 /* 2090 * The bitmap may have gotten re-arranged in the space 2091 * index here because the max_extent_size may have been 2092 * updated. Start from the beginning again if this 2093 * happened. 2094 */ 2095 if (use_bytes_index && old_next != rb_next(node)) 2096 goto again; 2097 continue; 2098 } 2099 2100 *offset = tmp; 2101 *bytes = entry->bytes - align_off; 2102 return entry; 2103 } 2104 out: 2105 return NULL; 2106 } 2107 2108 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2109 struct btrfs_free_space *info, u64 offset) 2110 { 2111 info->offset = offset_to_bitmap(ctl, offset); 2112 info->bytes = 0; 2113 info->bitmap_extents = 0; 2114 INIT_LIST_HEAD(&info->list); 2115 link_free_space(ctl, info); 2116 ctl->total_bitmaps++; 2117 recalculate_thresholds(ctl); 2118 } 2119 2120 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2121 struct btrfs_free_space *bitmap_info) 2122 { 2123 /* 2124 * Normally when this is called, the bitmap is completely empty. However, 2125 * if we are blowing up the free space cache for one reason or another 2126 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2127 * we may leave stats on the table. 2128 */ 2129 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2130 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2131 bitmap_info->bitmap_extents; 2132 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2133 2134 } 2135 unlink_free_space(ctl, bitmap_info, true); 2136 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2137 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2138 ctl->total_bitmaps--; 2139 recalculate_thresholds(ctl); 2140 } 2141 2142 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2143 struct btrfs_free_space *bitmap_info, 2144 u64 *offset, u64 *bytes) 2145 { 2146 u64 end; 2147 u64 search_start, search_bytes; 2148 int ret; 2149 2150 again: 2151 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2152 2153 /* 2154 * We need to search for bits in this bitmap. 
We could only cover some 2155 * of the extent in this bitmap thanks to how we add space, so we need 2156 * to search for as much as it as we can and clear that amount, and then 2157 * go searching for the next bit. 2158 */ 2159 search_start = *offset; 2160 search_bytes = ctl->unit; 2161 search_bytes = min(search_bytes, end - search_start + 1); 2162 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2163 false); 2164 if (ret < 0 || search_start != *offset) 2165 return -EINVAL; 2166 2167 /* We may have found more bits than what we need */ 2168 search_bytes = min(search_bytes, *bytes); 2169 2170 /* Cannot clear past the end of the bitmap */ 2171 search_bytes = min(search_bytes, end - search_start + 1); 2172 2173 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2174 *offset += search_bytes; 2175 *bytes -= search_bytes; 2176 2177 if (*bytes) { 2178 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2179 if (!bitmap_info->bytes) 2180 free_bitmap(ctl, bitmap_info); 2181 2182 /* 2183 * no entry after this bitmap, but we still have bytes to 2184 * remove, so something has gone wrong. 2185 */ 2186 if (!next) 2187 return -EINVAL; 2188 2189 bitmap_info = rb_entry(next, struct btrfs_free_space, 2190 offset_index); 2191 2192 /* 2193 * if the next entry isn't a bitmap we need to return to let the 2194 * extent stuff do its work. 2195 */ 2196 if (!bitmap_info->bitmap) 2197 return -EAGAIN; 2198 2199 /* 2200 * Ok the next item is a bitmap, but it may not actually hold 2201 * the information for the rest of this free space stuff, so 2202 * look for it, and if we don't find it return so we can try 2203 * everything over again. 2204 */ 2205 search_start = *offset; 2206 search_bytes = ctl->unit; 2207 ret = search_bitmap(ctl, bitmap_info, &search_start, 2208 &search_bytes, false); 2209 if (ret < 0 || search_start != *offset) 2210 return -EAGAIN; 2211 2212 goto again; 2213 } else if (!bitmap_info->bytes) 2214 free_bitmap(ctl, bitmap_info); 2215 2216 return 0; 2217 } 2218 2219 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2220 struct btrfs_free_space *info, u64 offset, 2221 u64 bytes, enum btrfs_trim_state trim_state) 2222 { 2223 u64 bytes_to_set = 0; 2224 u64 end; 2225 2226 /* 2227 * This is a tradeoff to make bitmap trim state minimal. We mark the 2228 * whole bitmap untrimmed if at any point we add untrimmed regions. 2229 */ 2230 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2231 if (btrfs_free_space_trimmed(info)) { 2232 ctl->discardable_extents[BTRFS_STAT_CURR] += 2233 info->bitmap_extents; 2234 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2235 } 2236 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2237 } 2238 2239 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2240 2241 bytes_to_set = min(end - offset, bytes); 2242 2243 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2244 2245 return bytes_to_set; 2246 2247 } 2248 2249 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2250 struct btrfs_free_space *info) 2251 { 2252 struct btrfs_block_group *block_group = ctl->block_group; 2253 struct btrfs_fs_info *fs_info = block_group->fs_info; 2254 bool forced = false; 2255 2256 #ifdef CONFIG_BTRFS_DEBUG 2257 if (btrfs_should_fragment_free_space(block_group)) 2258 forced = true; 2259 #endif 2260 2261 /* This is a way to reclaim large regions from the bitmaps. 
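 *
 * For example, a freed region at or above FORCE_EXTENT_THRESHOLD always
 * gets its own extent entry even when we are past the bitmap threshold,
 * so a later allocation can take it in one piece instead of having to
 * scan a bitmap for it.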
 */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent and don't have to deal with the bitmap.
	 */
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries. However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory limit,
	 * so allow those block groups to still have a bitmap entry.
	 */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_space_op = {
	.use_bitmap = use_bitmap,
};

static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->block_group;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
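	 *
	 * (Bitmap entries are keyed by offset_to_bitmap() of the range they
	 * cover, see add_new_bitmap(), so the equality check against
	 * offset_to_bitmap(ctl, offset) below is enough to tell whether the
	 * cluster's first entry is the bitmap covering this range.)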
2327 */ 2328 if (block_group && !list_empty(&block_group->cluster_list)) { 2329 struct btrfs_free_cluster *cluster; 2330 struct rb_node *node; 2331 struct btrfs_free_space *entry; 2332 2333 cluster = list_entry(block_group->cluster_list.next, 2334 struct btrfs_free_cluster, 2335 block_group_list); 2336 spin_lock(&cluster->lock); 2337 node = rb_first(&cluster->root); 2338 if (!node) { 2339 spin_unlock(&cluster->lock); 2340 goto no_cluster_bitmap; 2341 } 2342 2343 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2344 if (!entry->bitmap) { 2345 spin_unlock(&cluster->lock); 2346 goto no_cluster_bitmap; 2347 } 2348 2349 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2350 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2351 bytes, trim_state); 2352 bytes -= bytes_added; 2353 offset += bytes_added; 2354 } 2355 spin_unlock(&cluster->lock); 2356 if (!bytes) { 2357 ret = 1; 2358 goto out; 2359 } 2360 } 2361 2362 no_cluster_bitmap: 2363 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2364 1, 0); 2365 if (!bitmap_info) { 2366 ASSERT(added == 0); 2367 goto new_bitmap; 2368 } 2369 2370 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2371 trim_state); 2372 bytes -= bytes_added; 2373 offset += bytes_added; 2374 added = 0; 2375 2376 if (!bytes) { 2377 ret = 1; 2378 goto out; 2379 } else 2380 goto again; 2381 2382 new_bitmap: 2383 if (info && info->bitmap) { 2384 add_new_bitmap(ctl, info, offset); 2385 added = 1; 2386 info = NULL; 2387 goto again; 2388 } else { 2389 spin_unlock(&ctl->tree_lock); 2390 2391 /* no pre-allocated info, allocate a new one */ 2392 if (!info) { 2393 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2394 GFP_NOFS); 2395 if (!info) { 2396 spin_lock(&ctl->tree_lock); 2397 ret = -ENOMEM; 2398 goto out; 2399 } 2400 } 2401 2402 /* allocate the bitmap */ 2403 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2404 GFP_NOFS); 2405 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2406 spin_lock(&ctl->tree_lock); 2407 if (!info->bitmap) { 2408 ret = -ENOMEM; 2409 goto out; 2410 } 2411 goto again; 2412 } 2413 2414 out: 2415 if (info) { 2416 if (info->bitmap) 2417 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2418 info->bitmap); 2419 kmem_cache_free(btrfs_free_space_cachep, info); 2420 } 2421 2422 return ret; 2423 } 2424 2425 /* 2426 * Free space merging rules: 2427 * 1) Merge trimmed areas together 2428 * 2) Let untrimmed areas coalesce with trimmed areas 2429 * 3) Always pull neighboring regions from bitmaps 2430 * 2431 * The above rules are for when we merge free space based on btrfs_trim_state. 2432 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2433 * same reason: to promote larger extent regions which makes life easier for 2434 * find_free_extent(). Rule 2 enables coalescing based on the common path 2435 * being returning free space from btrfs_finish_extent_commit(). So when free 2436 * space is trimmed, it will prevent aggregating trimmed new region and 2437 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2438 * and provide find_free_extent() with the largest extents possible hoping for 2439 * the reuse path. 
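 *
 * A worked illustration of the rules as implemented below: an untrimmed
 * region handed back by btrfs_finish_extent_commit() merges with a
 * trimmed neighbour and the combined entry becomes untrimmed (rule 2);
 * two trimmed neighbours merge and stay trimmed (rule 1); a trimmed
 * entry is never glued onto an untrimmed neighbour, because the merged
 * entry would inherit the trimmed state and wrongly mark the untrimmed
 * part as trimmed.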
2440 */ 2441 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2442 struct btrfs_free_space *info, bool update_stat) 2443 { 2444 struct btrfs_free_space *left_info = NULL; 2445 struct btrfs_free_space *right_info; 2446 bool merged = false; 2447 u64 offset = info->offset; 2448 u64 bytes = info->bytes; 2449 const bool is_trimmed = btrfs_free_space_trimmed(info); 2450 2451 /* 2452 * first we want to see if there is free space adjacent to the range we 2453 * are adding, if there is remove that struct and add a new one to 2454 * cover the entire range 2455 */ 2456 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2457 if (right_info && rb_prev(&right_info->offset_index)) 2458 left_info = rb_entry(rb_prev(&right_info->offset_index), 2459 struct btrfs_free_space, offset_index); 2460 else if (!right_info) 2461 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2462 2463 /* See try_merge_free_space() comment. */ 2464 if (right_info && !right_info->bitmap && 2465 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2466 unlink_free_space(ctl, right_info, update_stat); 2467 info->bytes += right_info->bytes; 2468 kmem_cache_free(btrfs_free_space_cachep, right_info); 2469 merged = true; 2470 } 2471 2472 /* See try_merge_free_space() comment. */ 2473 if (left_info && !left_info->bitmap && 2474 left_info->offset + left_info->bytes == offset && 2475 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2476 unlink_free_space(ctl, left_info, update_stat); 2477 info->offset = left_info->offset; 2478 info->bytes += left_info->bytes; 2479 kmem_cache_free(btrfs_free_space_cachep, left_info); 2480 merged = true; 2481 } 2482 2483 return merged; 2484 } 2485 2486 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2487 struct btrfs_free_space *info, 2488 bool update_stat) 2489 { 2490 struct btrfs_free_space *bitmap; 2491 unsigned long i; 2492 unsigned long j; 2493 const u64 end = info->offset + info->bytes; 2494 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2495 u64 bytes; 2496 2497 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2498 if (!bitmap) 2499 return false; 2500 2501 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2502 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2503 if (j == i) 2504 return false; 2505 bytes = (j - i) * ctl->unit; 2506 info->bytes += bytes; 2507 2508 /* See try_merge_free_space() comment. */ 2509 if (!btrfs_free_space_trimmed(bitmap)) 2510 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2511 2512 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2513 2514 if (!bitmap->bytes) 2515 free_bitmap(ctl, bitmap); 2516 2517 return true; 2518 } 2519 2520 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2521 struct btrfs_free_space *info, 2522 bool update_stat) 2523 { 2524 struct btrfs_free_space *bitmap; 2525 u64 bitmap_offset; 2526 unsigned long i; 2527 unsigned long j; 2528 unsigned long prev_j; 2529 u64 bytes; 2530 2531 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2532 /* If we're on a boundary, try the previous logical bitmap. 
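 *
 * (With an illustrative 4KiB page size and 4KiB sectorsize each bitmap
 * covers 32768 * 4KiB = 128MiB, so an extent that starts exactly on a
 * 128MiB window can only have adjacent free bits in the window before
 * it, hence the lookup at info->offset - 1 below.)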
*/ 2533 if (bitmap_offset == info->offset) { 2534 if (info->offset == 0) 2535 return false; 2536 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2537 } 2538 2539 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2540 if (!bitmap) 2541 return false; 2542 2543 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2544 j = 0; 2545 prev_j = (unsigned long)-1; 2546 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2547 if (j > i) 2548 break; 2549 prev_j = j; 2550 } 2551 if (prev_j == i) 2552 return false; 2553 2554 if (prev_j == (unsigned long)-1) 2555 bytes = (i + 1) * ctl->unit; 2556 else 2557 bytes = (i - prev_j) * ctl->unit; 2558 2559 info->offset -= bytes; 2560 info->bytes += bytes; 2561 2562 /* See try_merge_free_space() comment. */ 2563 if (!btrfs_free_space_trimmed(bitmap)) 2564 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2565 2566 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2567 2568 if (!bitmap->bytes) 2569 free_bitmap(ctl, bitmap); 2570 2571 return true; 2572 } 2573 2574 /* 2575 * We prefer always to allocate from extent entries, both for clustered and 2576 * non-clustered allocation requests. So when attempting to add a new extent 2577 * entry, try to see if there's adjacent free space in bitmap entries, and if 2578 * there is, migrate that space from the bitmaps to the extent. 2579 * Like this we get better chances of satisfying space allocation requests 2580 * because we attempt to satisfy them based on a single cache entry, and never 2581 * on 2 or more entries - even if the entries represent a contiguous free space 2582 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2583 * ends). 2584 */ 2585 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2586 struct btrfs_free_space *info, 2587 bool update_stat) 2588 { 2589 /* 2590 * Only work with disconnected entries, as we can change their offset, 2591 * and must be extent entries. 
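 *
 * steal_from_bitmap_to_front() can move info->offset backwards, which
 * would break an rb_tree still keyed by the old offset, hence the
 * ASSERT below that the entry is not linked into the offset tree.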
2592 */ 2593 ASSERT(!info->bitmap); 2594 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2595 2596 if (ctl->total_bitmaps > 0) { 2597 bool stole_end; 2598 bool stole_front = false; 2599 2600 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2601 if (ctl->total_bitmaps > 0) 2602 stole_front = steal_from_bitmap_to_front(ctl, info, 2603 update_stat); 2604 2605 if (stole_end || stole_front) 2606 try_merge_free_space(ctl, info, update_stat); 2607 } 2608 } 2609 2610 int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2611 u64 offset, u64 bytes, 2612 enum btrfs_trim_state trim_state) 2613 { 2614 struct btrfs_fs_info *fs_info = block_group->fs_info; 2615 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2616 struct btrfs_free_space *info; 2617 int ret = 0; 2618 u64 filter_bytes = bytes; 2619 2620 ASSERT(!btrfs_is_zoned(fs_info)); 2621 2622 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2623 if (!info) 2624 return -ENOMEM; 2625 2626 info->offset = offset; 2627 info->bytes = bytes; 2628 info->trim_state = trim_state; 2629 RB_CLEAR_NODE(&info->offset_index); 2630 RB_CLEAR_NODE(&info->bytes_index); 2631 2632 spin_lock(&ctl->tree_lock); 2633 2634 if (try_merge_free_space(ctl, info, true)) 2635 goto link; 2636 2637 /* 2638 * There was no extent directly to the left or right of this new 2639 * extent then we know we're going to have to allocate a new extent, so 2640 * before we do that see if we need to drop this into a bitmap 2641 */ 2642 ret = insert_into_bitmap(ctl, info); 2643 if (ret < 0) { 2644 goto out; 2645 } else if (ret) { 2646 ret = 0; 2647 goto out; 2648 } 2649 link: 2650 /* 2651 * Only steal free space from adjacent bitmaps if we're sure we're not 2652 * going to add the new free space to existing bitmap entries - because 2653 * that would mean unnecessary work that would be reverted. Therefore 2654 * attempt to steal space from bitmaps if we're adding an extent entry. 2655 */ 2656 steal_from_bitmap(ctl, info, true); 2657 2658 filter_bytes = max(filter_bytes, info->bytes); 2659 2660 ret = link_free_space(ctl, info); 2661 if (ret) 2662 kmem_cache_free(btrfs_free_space_cachep, info); 2663 out: 2664 btrfs_discard_update_discardable(block_group); 2665 spin_unlock(&ctl->tree_lock); 2666 2667 if (ret) { 2668 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2669 ASSERT(ret != -EEXIST); 2670 } 2671 2672 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2673 btrfs_discard_check_filter(block_group, filter_bytes); 2674 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2675 } 2676 2677 return ret; 2678 } 2679 2680 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2681 u64 bytenr, u64 size, bool used) 2682 { 2683 struct btrfs_space_info *sinfo = block_group->space_info; 2684 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2685 u64 offset = bytenr - block_group->start; 2686 u64 to_free, to_unusable; 2687 int bg_reclaim_threshold = 0; 2688 bool initial = (size == block_group->length); 2689 u64 reclaimable_unusable; 2690 2691 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2692 2693 if (!initial) 2694 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2695 2696 spin_lock(&ctl->tree_lock); 2697 /* Count initial region as zone_unusable until it gets activated. 
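 *
 * For the partial-overlap branch below, an illustrative example: with
 * alloc_offset at 128MiB, freeing 16MiB that starts at offset 120MiB
 * gives to_free = 120 + 16 - 128 = 8MiB and to_unusable = 8MiB, i.e.
 * only the part beyond the allocation pointer is returned as free.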
*/ 2698 if (!used) 2699 to_free = size; 2700 else if (initial && 2701 test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && 2702 (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) 2703 to_free = 0; 2704 else if (initial) 2705 to_free = block_group->zone_capacity; 2706 else if (offset >= block_group->alloc_offset) 2707 to_free = size; 2708 else if (offset + size <= block_group->alloc_offset) 2709 to_free = 0; 2710 else 2711 to_free = offset + size - block_group->alloc_offset; 2712 to_unusable = size - to_free; 2713 2714 ctl->free_space += to_free; 2715 /* 2716 * If the block group is read-only, we should account freed space into 2717 * bytes_readonly. 2718 */ 2719 if (!block_group->ro) 2720 block_group->zone_unusable += to_unusable; 2721 spin_unlock(&ctl->tree_lock); 2722 if (!used) { 2723 spin_lock(&block_group->lock); 2724 block_group->alloc_offset -= size; 2725 spin_unlock(&block_group->lock); 2726 } 2727 2728 reclaimable_unusable = block_group->zone_unusable - 2729 (block_group->length - block_group->zone_capacity); 2730 /* All the region is now unusable. Mark it as unused and reclaim */ 2731 if (block_group->zone_unusable == block_group->length && 2732 block_group->alloc_offset) { 2733 btrfs_mark_bg_unused(block_group); 2734 } else if (bg_reclaim_threshold && 2735 reclaimable_unusable >= 2736 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2737 btrfs_mark_bg_to_reclaim(block_group); 2738 } 2739 2740 return 0; 2741 } 2742 2743 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2744 u64 bytenr, u64 size) 2745 { 2746 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2747 2748 if (btrfs_is_zoned(block_group->fs_info)) 2749 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2750 true); 2751 2752 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2753 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2754 2755 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2756 } 2757 2758 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2759 u64 bytenr, u64 size) 2760 { 2761 if (btrfs_is_zoned(block_group->fs_info)) 2762 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2763 false); 2764 2765 return btrfs_add_free_space(block_group, bytenr, size); 2766 } 2767 2768 /* 2769 * This is a subtle distinction because when adding free space back in general, 2770 * we want it to be added as untrimmed for async. But in the case where we add 2771 * it on loading of a block group, we want to consider it trimmed. 2772 */ 2773 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2774 u64 bytenr, u64 size) 2775 { 2776 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2777 2778 if (btrfs_is_zoned(block_group->fs_info)) 2779 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2780 true); 2781 2782 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2783 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2784 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2785 2786 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2787 } 2788 2789 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2790 u64 offset, u64 bytes) 2791 { 2792 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2793 struct btrfs_free_space *info; 2794 int ret; 2795 bool re_search = false; 2796 2797 if (btrfs_is_zoned(block_group->fs_info)) { 2798 /* 2799 * This can happen with conventional zones when replaying log. 
2800 * Since the allocation info of tree-log nodes are not recorded 2801 * to the extent-tree, calculate_alloc_pointer() failed to 2802 * advance the allocation pointer after last allocated tree log 2803 * node blocks. 2804 * 2805 * This function is called from 2806 * btrfs_pin_extent_for_log_replay() when replaying the log. 2807 * Advance the pointer not to overwrite the tree-log nodes. 2808 */ 2809 if (block_group->start + block_group->alloc_offset < 2810 offset + bytes) { 2811 block_group->alloc_offset = 2812 offset + bytes - block_group->start; 2813 } 2814 return 0; 2815 } 2816 2817 spin_lock(&ctl->tree_lock); 2818 2819 again: 2820 ret = 0; 2821 if (!bytes) 2822 goto out_lock; 2823 2824 info = tree_search_offset(ctl, offset, 0, 0); 2825 if (!info) { 2826 /* 2827 * oops didn't find an extent that matched the space we wanted 2828 * to remove, look for a bitmap instead 2829 */ 2830 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2831 1, 0); 2832 if (!info) { 2833 /* 2834 * If we found a partial bit of our free space in a 2835 * bitmap but then couldn't find the other part this may 2836 * be a problem, so WARN about it. 2837 */ 2838 WARN_ON(re_search); 2839 goto out_lock; 2840 } 2841 } 2842 2843 re_search = false; 2844 if (!info->bitmap) { 2845 unlink_free_space(ctl, info, true); 2846 if (offset == info->offset) { 2847 u64 to_free = min(bytes, info->bytes); 2848 2849 info->bytes -= to_free; 2850 info->offset += to_free; 2851 if (info->bytes) { 2852 ret = link_free_space(ctl, info); 2853 WARN_ON(ret); 2854 } else { 2855 kmem_cache_free(btrfs_free_space_cachep, info); 2856 } 2857 2858 offset += to_free; 2859 bytes -= to_free; 2860 goto again; 2861 } else { 2862 u64 old_end = info->bytes + info->offset; 2863 2864 info->bytes = offset - info->offset; 2865 ret = link_free_space(ctl, info); 2866 WARN_ON(ret); 2867 if (ret) 2868 goto out_lock; 2869 2870 /* Not enough bytes in this entry to satisfy us */ 2871 if (old_end < offset + bytes) { 2872 bytes -= old_end - offset; 2873 offset = old_end; 2874 goto again; 2875 } else if (old_end == offset + bytes) { 2876 /* all done */ 2877 goto out_lock; 2878 } 2879 spin_unlock(&ctl->tree_lock); 2880 2881 ret = __btrfs_add_free_space(block_group, 2882 offset + bytes, 2883 old_end - (offset + bytes), 2884 info->trim_state); 2885 WARN_ON(ret); 2886 goto out; 2887 } 2888 } 2889 2890 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2891 if (ret == -EAGAIN) { 2892 re_search = true; 2893 goto again; 2894 } 2895 out_lock: 2896 btrfs_discard_update_discardable(block_group); 2897 spin_unlock(&ctl->tree_lock); 2898 out: 2899 return ret; 2900 } 2901 2902 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2903 u64 bytes) 2904 { 2905 struct btrfs_fs_info *fs_info = block_group->fs_info; 2906 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2907 struct btrfs_free_space *info; 2908 struct rb_node *n; 2909 int count = 0; 2910 2911 /* 2912 * Zoned btrfs does not use free space tree and cluster. Just print 2913 * out the free space after the allocation offset. 
2914 */ 2915 if (btrfs_is_zoned(fs_info)) { 2916 btrfs_info(fs_info, "free space %llu active %d", 2917 block_group->zone_capacity - block_group->alloc_offset, 2918 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2919 &block_group->runtime_flags)); 2920 return; 2921 } 2922 2923 spin_lock(&ctl->tree_lock); 2924 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2925 info = rb_entry(n, struct btrfs_free_space, offset_index); 2926 if (info->bytes >= bytes && !block_group->ro) 2927 count++; 2928 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2929 info->offset, info->bytes, 2930 (info->bitmap) ? "yes" : "no"); 2931 } 2932 spin_unlock(&ctl->tree_lock); 2933 btrfs_info(fs_info, "block group has cluster?: %s", 2934 list_empty(&block_group->cluster_list) ? "no" : "yes"); 2935 btrfs_info(fs_info, 2936 "%d blocks of free space at or bigger than bytes is", count); 2937 } 2938 2939 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2940 struct btrfs_free_space_ctl *ctl) 2941 { 2942 struct btrfs_fs_info *fs_info = block_group->fs_info; 2943 2944 spin_lock_init(&ctl->tree_lock); 2945 ctl->unit = fs_info->sectorsize; 2946 ctl->start = block_group->start; 2947 ctl->block_group = block_group; 2948 ctl->op = &free_space_op; 2949 ctl->free_space_bytes = RB_ROOT_CACHED; 2950 INIT_LIST_HEAD(&ctl->trimming_ranges); 2951 mutex_init(&ctl->cache_writeout_mutex); 2952 2953 /* 2954 * we only want to have 32k of ram per block group for keeping 2955 * track of free space, and if we pass 1/2 of that we want to 2956 * start converting things over to using bitmaps 2957 */ 2958 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2959 } 2960 2961 /* 2962 * for a given cluster, put all of its extents back into the free 2963 * space cache. If the block group passed doesn't match the block group 2964 * pointed to by the cluster, someone else raced in and freed the 2965 * cluster already. 
In that case, we just return without changing anything 2966 */ 2967 static void __btrfs_return_cluster_to_free_space( 2968 struct btrfs_block_group *block_group, 2969 struct btrfs_free_cluster *cluster) 2970 { 2971 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2972 struct btrfs_free_space *entry; 2973 struct rb_node *node; 2974 2975 spin_lock(&cluster->lock); 2976 if (cluster->block_group != block_group) { 2977 spin_unlock(&cluster->lock); 2978 return; 2979 } 2980 2981 cluster->block_group = NULL; 2982 cluster->window_start = 0; 2983 list_del_init(&cluster->block_group_list); 2984 2985 node = rb_first(&cluster->root); 2986 while (node) { 2987 bool bitmap; 2988 2989 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2990 node = rb_next(&entry->offset_index); 2991 rb_erase(&entry->offset_index, &cluster->root); 2992 RB_CLEAR_NODE(&entry->offset_index); 2993 2994 bitmap = (entry->bitmap != NULL); 2995 if (!bitmap) { 2996 /* Merging treats extents as if they were new */ 2997 if (!btrfs_free_space_trimmed(entry)) { 2998 ctl->discardable_extents[BTRFS_STAT_CURR]--; 2999 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3000 entry->bytes; 3001 } 3002 3003 try_merge_free_space(ctl, entry, false); 3004 steal_from_bitmap(ctl, entry, false); 3005 3006 /* As we insert directly, update these statistics */ 3007 if (!btrfs_free_space_trimmed(entry)) { 3008 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3009 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3010 entry->bytes; 3011 } 3012 } 3013 tree_insert_offset(&ctl->free_space_offset, 3014 entry->offset, &entry->offset_index, bitmap); 3015 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3016 entry_less); 3017 } 3018 cluster->root = RB_ROOT; 3019 spin_unlock(&cluster->lock); 3020 btrfs_put_block_group(block_group); 3021 } 3022 3023 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3024 { 3025 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3026 struct btrfs_free_cluster *cluster; 3027 struct list_head *head; 3028 3029 spin_lock(&ctl->tree_lock); 3030 while ((head = block_group->cluster_list.next) != 3031 &block_group->cluster_list) { 3032 cluster = list_entry(head, struct btrfs_free_cluster, 3033 block_group_list); 3034 3035 WARN_ON(cluster->block_group != block_group); 3036 __btrfs_return_cluster_to_free_space(block_group, cluster); 3037 3038 cond_resched_lock(&ctl->tree_lock); 3039 } 3040 __btrfs_remove_free_space_cache(ctl); 3041 btrfs_discard_update_discardable(block_group); 3042 spin_unlock(&ctl->tree_lock); 3043 3044 } 3045 3046 /* 3047 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
3048 */ 3049 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3050 { 3051 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3052 struct btrfs_free_space *info; 3053 struct rb_node *node; 3054 bool ret = true; 3055 3056 spin_lock(&ctl->tree_lock); 3057 node = rb_first(&ctl->free_space_offset); 3058 3059 while (node) { 3060 info = rb_entry(node, struct btrfs_free_space, offset_index); 3061 3062 if (!btrfs_free_space_trimmed(info)) { 3063 ret = false; 3064 break; 3065 } 3066 3067 node = rb_next(node); 3068 } 3069 3070 spin_unlock(&ctl->tree_lock); 3071 return ret; 3072 } 3073 3074 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3075 u64 offset, u64 bytes, u64 empty_size, 3076 u64 *max_extent_size) 3077 { 3078 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3079 struct btrfs_discard_ctl *discard_ctl = 3080 &block_group->fs_info->discard_ctl; 3081 struct btrfs_free_space *entry = NULL; 3082 u64 bytes_search = bytes + empty_size; 3083 u64 ret = 0; 3084 u64 align_gap = 0; 3085 u64 align_gap_len = 0; 3086 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3087 bool use_bytes_index = (offset == block_group->start); 3088 3089 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3090 3091 spin_lock(&ctl->tree_lock); 3092 entry = find_free_space(ctl, &offset, &bytes_search, 3093 block_group->full_stripe_len, max_extent_size, 3094 use_bytes_index); 3095 if (!entry) 3096 goto out; 3097 3098 ret = offset; 3099 if (entry->bitmap) { 3100 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3101 3102 if (!btrfs_free_space_trimmed(entry)) 3103 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3104 3105 if (!entry->bytes) 3106 free_bitmap(ctl, entry); 3107 } else { 3108 unlink_free_space(ctl, entry, true); 3109 align_gap_len = offset - entry->offset; 3110 align_gap = entry->offset; 3111 align_gap_trim_state = entry->trim_state; 3112 3113 if (!btrfs_free_space_trimmed(entry)) 3114 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3115 3116 entry->offset = offset + bytes; 3117 WARN_ON(entry->bytes < bytes + align_gap_len); 3118 3119 entry->bytes -= bytes + align_gap_len; 3120 if (!entry->bytes) 3121 kmem_cache_free(btrfs_free_space_cachep, entry); 3122 else 3123 link_free_space(ctl, entry); 3124 } 3125 out: 3126 btrfs_discard_update_discardable(block_group); 3127 spin_unlock(&ctl->tree_lock); 3128 3129 if (align_gap_len) 3130 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3131 align_gap_trim_state); 3132 return ret; 3133 } 3134 3135 /* 3136 * given a cluster, put all of its extents back into the free space 3137 * cache. If a block group is passed, this function will only free 3138 * a cluster that belongs to the passed block group. 3139 * 3140 * Otherwise, it'll get a reference on the block group pointed to by the 3141 * cluster and remove the cluster from it. 
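 *
 * (The reference is taken while cluster->lock is still held, so the
 * block group cannot be freed between dropping that lock and taking
 * ctl->tree_lock below.)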
3142 */ 3143 void btrfs_return_cluster_to_free_space( 3144 struct btrfs_block_group *block_group, 3145 struct btrfs_free_cluster *cluster) 3146 { 3147 struct btrfs_free_space_ctl *ctl; 3148 3149 /* first, get a safe pointer to the block group */ 3150 spin_lock(&cluster->lock); 3151 if (!block_group) { 3152 block_group = cluster->block_group; 3153 if (!block_group) { 3154 spin_unlock(&cluster->lock); 3155 return; 3156 } 3157 } else if (cluster->block_group != block_group) { 3158 /* someone else has already freed it don't redo their work */ 3159 spin_unlock(&cluster->lock); 3160 return; 3161 } 3162 btrfs_get_block_group(block_group); 3163 spin_unlock(&cluster->lock); 3164 3165 ctl = block_group->free_space_ctl; 3166 3167 /* now return any extents the cluster had on it */ 3168 spin_lock(&ctl->tree_lock); 3169 __btrfs_return_cluster_to_free_space(block_group, cluster); 3170 spin_unlock(&ctl->tree_lock); 3171 3172 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3173 3174 /* finally drop our ref */ 3175 btrfs_put_block_group(block_group); 3176 } 3177 3178 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3179 struct btrfs_free_cluster *cluster, 3180 struct btrfs_free_space *entry, 3181 u64 bytes, u64 min_start, 3182 u64 *max_extent_size) 3183 { 3184 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3185 int err; 3186 u64 search_start = cluster->window_start; 3187 u64 search_bytes = bytes; 3188 u64 ret = 0; 3189 3190 search_start = min_start; 3191 search_bytes = bytes; 3192 3193 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3194 if (err) { 3195 *max_extent_size = max(get_max_extent_size(entry), 3196 *max_extent_size); 3197 return 0; 3198 } 3199 3200 ret = search_start; 3201 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3202 3203 return ret; 3204 } 3205 3206 /* 3207 * given a cluster, try to allocate 'bytes' from it, returns 0 3208 * if it couldn't find anything suitably large, or a logical disk offset 3209 * if things worked out 3210 */ 3211 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3212 struct btrfs_free_cluster *cluster, u64 bytes, 3213 u64 min_start, u64 *max_extent_size) 3214 { 3215 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3216 struct btrfs_discard_ctl *discard_ctl = 3217 &block_group->fs_info->discard_ctl; 3218 struct btrfs_free_space *entry = NULL; 3219 struct rb_node *node; 3220 u64 ret = 0; 3221 3222 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3223 3224 spin_lock(&cluster->lock); 3225 if (bytes > cluster->max_size) 3226 goto out; 3227 3228 if (cluster->block_group != block_group) 3229 goto out; 3230 3231 node = rb_first(&cluster->root); 3232 if (!node) 3233 goto out; 3234 3235 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3236 while (1) { 3237 if (entry->bytes < bytes) 3238 *max_extent_size = max(get_max_extent_size(entry), 3239 *max_extent_size); 3240 3241 if (entry->bytes < bytes || 3242 (!entry->bitmap && entry->offset < min_start)) { 3243 node = rb_next(&entry->offset_index); 3244 if (!node) 3245 break; 3246 entry = rb_entry(node, struct btrfs_free_space, 3247 offset_index); 3248 continue; 3249 } 3250 3251 if (entry->bitmap) { 3252 ret = btrfs_alloc_from_bitmap(block_group, 3253 cluster, entry, bytes, 3254 cluster->window_start, 3255 max_extent_size); 3256 if (ret == 0) { 3257 node = rb_next(&entry->offset_index); 3258 if (!node) 3259 break; 3260 entry = rb_entry(node, struct btrfs_free_space, 3261 offset_index); 3262 
continue; 3263 } 3264 cluster->window_start += bytes; 3265 } else { 3266 ret = entry->offset; 3267 3268 entry->offset += bytes; 3269 entry->bytes -= bytes; 3270 } 3271 3272 break; 3273 } 3274 out: 3275 spin_unlock(&cluster->lock); 3276 3277 if (!ret) 3278 return 0; 3279 3280 spin_lock(&ctl->tree_lock); 3281 3282 if (!btrfs_free_space_trimmed(entry)) 3283 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3284 3285 ctl->free_space -= bytes; 3286 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3287 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3288 3289 spin_lock(&cluster->lock); 3290 if (entry->bytes == 0) { 3291 rb_erase(&entry->offset_index, &cluster->root); 3292 ctl->free_extents--; 3293 if (entry->bitmap) { 3294 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3295 entry->bitmap); 3296 ctl->total_bitmaps--; 3297 recalculate_thresholds(ctl); 3298 } else if (!btrfs_free_space_trimmed(entry)) { 3299 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3300 } 3301 kmem_cache_free(btrfs_free_space_cachep, entry); 3302 } 3303 3304 spin_unlock(&cluster->lock); 3305 spin_unlock(&ctl->tree_lock); 3306 3307 return ret; 3308 } 3309 3310 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3311 struct btrfs_free_space *entry, 3312 struct btrfs_free_cluster *cluster, 3313 u64 offset, u64 bytes, 3314 u64 cont1_bytes, u64 min_bytes) 3315 { 3316 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3317 unsigned long next_zero; 3318 unsigned long i; 3319 unsigned long want_bits; 3320 unsigned long min_bits; 3321 unsigned long found_bits; 3322 unsigned long max_bits = 0; 3323 unsigned long start = 0; 3324 unsigned long total_found = 0; 3325 int ret; 3326 3327 i = offset_to_bit(entry->offset, ctl->unit, 3328 max_t(u64, offset, entry->offset)); 3329 want_bits = bytes_to_bits(bytes, ctl->unit); 3330 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3331 3332 /* 3333 * Don't bother looking for a cluster in this bitmap if it's heavily 3334 * fragmented. 3335 */ 3336 if (entry->max_extent_size && 3337 entry->max_extent_size < cont1_bytes) 3338 return -ENOSPC; 3339 again: 3340 found_bits = 0; 3341 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3342 next_zero = find_next_zero_bit(entry->bitmap, 3343 BITS_PER_BITMAP, i); 3344 if (next_zero - i >= min_bits) { 3345 found_bits = next_zero - i; 3346 if (found_bits > max_bits) 3347 max_bits = found_bits; 3348 break; 3349 } 3350 if (next_zero - i > max_bits) 3351 max_bits = next_zero - i; 3352 i = next_zero; 3353 } 3354 3355 if (!found_bits) { 3356 entry->max_extent_size = (u64)max_bits * ctl->unit; 3357 return -ENOSPC; 3358 } 3359 3360 if (!total_found) { 3361 start = i; 3362 cluster->max_size = 0; 3363 } 3364 3365 total_found += found_bits; 3366 3367 if (cluster->max_size < found_bits * ctl->unit) 3368 cluster->max_size = found_bits * ctl->unit; 3369 3370 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3371 i = next_zero + 1; 3372 goto again; 3373 } 3374 3375 cluster->window_start = start * ctl->unit + entry->offset; 3376 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3377 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3378 3379 /* 3380 * We need to know if we're currently on the normal space index when we 3381 * manipulate the bitmap so that we know we need to remove and re-insert 3382 * it into the space_index tree. 
Clear the bytes_index node here so the 3383 * bitmap manipulation helpers know not to mess with the space_index 3384 * until this bitmap entry is added back into the normal cache. 3385 */ 3386 RB_CLEAR_NODE(&entry->bytes_index); 3387 3388 ret = tree_insert_offset(&cluster->root, entry->offset, 3389 &entry->offset_index, 1); 3390 ASSERT(!ret); /* -EEXIST; Logic error */ 3391 3392 trace_btrfs_setup_cluster(block_group, cluster, 3393 total_found * ctl->unit, 1); 3394 return 0; 3395 } 3396 3397 /* 3398 * This searches the block group for just extents to fill the cluster with. 3399 * Try to find a cluster with at least bytes total bytes, at least one 3400 * extent of cont1_bytes, and other clusters of at least min_bytes. 3401 */ 3402 static noinline int 3403 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3404 struct btrfs_free_cluster *cluster, 3405 struct list_head *bitmaps, u64 offset, u64 bytes, 3406 u64 cont1_bytes, u64 min_bytes) 3407 { 3408 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3409 struct btrfs_free_space *first = NULL; 3410 struct btrfs_free_space *entry = NULL; 3411 struct btrfs_free_space *last; 3412 struct rb_node *node; 3413 u64 window_free; 3414 u64 max_extent; 3415 u64 total_size = 0; 3416 3417 entry = tree_search_offset(ctl, offset, 0, 1); 3418 if (!entry) 3419 return -ENOSPC; 3420 3421 /* 3422 * We don't want bitmaps, so just move along until we find a normal 3423 * extent entry. 3424 */ 3425 while (entry->bitmap || entry->bytes < min_bytes) { 3426 if (entry->bitmap && list_empty(&entry->list)) 3427 list_add_tail(&entry->list, bitmaps); 3428 node = rb_next(&entry->offset_index); 3429 if (!node) 3430 return -ENOSPC; 3431 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3432 } 3433 3434 window_free = entry->bytes; 3435 max_extent = entry->bytes; 3436 first = entry; 3437 last = entry; 3438 3439 for (node = rb_next(&entry->offset_index); node; 3440 node = rb_next(&entry->offset_index)) { 3441 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3442 3443 if (entry->bitmap) { 3444 if (list_empty(&entry->list)) 3445 list_add_tail(&entry->list, bitmaps); 3446 continue; 3447 } 3448 3449 if (entry->bytes < min_bytes) 3450 continue; 3451 3452 last = entry; 3453 window_free += entry->bytes; 3454 if (entry->bytes > max_extent) 3455 max_extent = entry->bytes; 3456 } 3457 3458 if (window_free < bytes || max_extent < cont1_bytes) 3459 return -ENOSPC; 3460 3461 cluster->window_start = first->offset; 3462 3463 node = &first->offset_index; 3464 3465 /* 3466 * now we've found our entries, pull them out of the free space 3467 * cache and put them into the cluster rbtree 3468 */ 3469 do { 3470 int ret; 3471 3472 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3473 node = rb_next(&entry->offset_index); 3474 if (entry->bitmap || entry->bytes < min_bytes) 3475 continue; 3476 3477 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3478 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3479 ret = tree_insert_offset(&cluster->root, entry->offset, 3480 &entry->offset_index, 0); 3481 total_size += entry->bytes; 3482 ASSERT(!ret); /* -EEXIST; Logic error */ 3483 } while (node && entry != last); 3484 3485 cluster->max_size = max_extent; 3486 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3487 return 0; 3488 } 3489 3490 /* 3491 * This specifically looks for bitmaps that may work in the cluster, we assume 3492 * that we have already failed to find extents that will work. 
3493 */ 3494 static noinline int 3495 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3496 struct btrfs_free_cluster *cluster, 3497 struct list_head *bitmaps, u64 offset, u64 bytes, 3498 u64 cont1_bytes, u64 min_bytes) 3499 { 3500 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3501 struct btrfs_free_space *entry = NULL; 3502 int ret = -ENOSPC; 3503 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3504 3505 if (ctl->total_bitmaps == 0) 3506 return -ENOSPC; 3507 3508 /* 3509 * The bitmap that covers offset won't be in the list unless offset 3510 * is just its start offset. 3511 */ 3512 if (!list_empty(bitmaps)) 3513 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3514 3515 if (!entry || entry->offset != bitmap_offset) { 3516 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3517 if (entry && list_empty(&entry->list)) 3518 list_add(&entry->list, bitmaps); 3519 } 3520 3521 list_for_each_entry(entry, bitmaps, list) { 3522 if (entry->bytes < bytes) 3523 continue; 3524 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3525 bytes, cont1_bytes, min_bytes); 3526 if (!ret) 3527 return 0; 3528 } 3529 3530 /* 3531 * The bitmaps list has all the bitmaps that record free space 3532 * starting after offset, so no more search is required. 3533 */ 3534 return -ENOSPC; 3535 } 3536 3537 /* 3538 * here we try to find a cluster of blocks in a block group. The goal 3539 * is to find at least bytes+empty_size. 3540 * We might not find them all in one contiguous area. 3541 * 3542 * returns zero and sets up cluster if things worked out, otherwise 3543 * it returns -enospc 3544 */ 3545 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3546 struct btrfs_free_cluster *cluster, 3547 u64 offset, u64 bytes, u64 empty_size) 3548 { 3549 struct btrfs_fs_info *fs_info = block_group->fs_info; 3550 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3551 struct btrfs_free_space *entry, *tmp; 3552 LIST_HEAD(bitmaps); 3553 u64 min_bytes; 3554 u64 cont1_bytes; 3555 int ret; 3556 3557 /* 3558 * Choose the minimum extent size we'll require for this 3559 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3560 * For metadata, allow allocates with smaller extents. For 3561 * data, keep it dense. 3562 */ 3563 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3564 cont1_bytes = bytes + empty_size; 3565 min_bytes = cont1_bytes; 3566 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3567 cont1_bytes = bytes; 3568 min_bytes = fs_info->sectorsize; 3569 } else { 3570 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3571 min_bytes = fs_info->sectorsize; 3572 } 3573 3574 spin_lock(&ctl->tree_lock); 3575 3576 /* 3577 * If we know we don't have enough space to make a cluster don't even 3578 * bother doing all the work to try and find one. 
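 *
 * (ctl->free_space counts every free byte in the group, contiguous or
 * not, so this is only a cheap necessary condition; the setup helpers
 * below still have to find cont1_bytes of it in one piece.)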
3579 */ 3580 if (ctl->free_space < bytes) { 3581 spin_unlock(&ctl->tree_lock); 3582 return -ENOSPC; 3583 } 3584 3585 spin_lock(&cluster->lock); 3586 3587 /* someone already found a cluster, hooray */ 3588 if (cluster->block_group) { 3589 ret = 0; 3590 goto out; 3591 } 3592 3593 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3594 min_bytes); 3595 3596 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3597 bytes + empty_size, 3598 cont1_bytes, min_bytes); 3599 if (ret) 3600 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3601 offset, bytes + empty_size, 3602 cont1_bytes, min_bytes); 3603 3604 /* Clear our temporary list */ 3605 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3606 list_del_init(&entry->list); 3607 3608 if (!ret) { 3609 btrfs_get_block_group(block_group); 3610 list_add_tail(&cluster->block_group_list, 3611 &block_group->cluster_list); 3612 cluster->block_group = block_group; 3613 } else { 3614 trace_btrfs_failed_cluster_setup(block_group); 3615 } 3616 out: 3617 spin_unlock(&cluster->lock); 3618 spin_unlock(&ctl->tree_lock); 3619 3620 return ret; 3621 } 3622 3623 /* 3624 * simple code to zero out a cluster 3625 */ 3626 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3627 { 3628 spin_lock_init(&cluster->lock); 3629 spin_lock_init(&cluster->refill_lock); 3630 cluster->root = RB_ROOT; 3631 cluster->max_size = 0; 3632 cluster->fragmented = false; 3633 INIT_LIST_HEAD(&cluster->block_group_list); 3634 cluster->block_group = NULL; 3635 } 3636 3637 static int do_trimming(struct btrfs_block_group *block_group, 3638 u64 *total_trimmed, u64 start, u64 bytes, 3639 u64 reserved_start, u64 reserved_bytes, 3640 enum btrfs_trim_state reserved_trim_state, 3641 struct btrfs_trim_range *trim_entry) 3642 { 3643 struct btrfs_space_info *space_info = block_group->space_info; 3644 struct btrfs_fs_info *fs_info = block_group->fs_info; 3645 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3646 int ret; 3647 int update = 0; 3648 const u64 end = start + bytes; 3649 const u64 reserved_end = reserved_start + reserved_bytes; 3650 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3651 u64 trimmed = 0; 3652 3653 spin_lock(&space_info->lock); 3654 spin_lock(&block_group->lock); 3655 if (!block_group->ro) { 3656 block_group->reserved += reserved_bytes; 3657 space_info->bytes_reserved += reserved_bytes; 3658 update = 1; 3659 } 3660 spin_unlock(&block_group->lock); 3661 spin_unlock(&space_info->lock); 3662 3663 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3664 if (!ret) { 3665 *total_trimmed += trimmed; 3666 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3667 } 3668 3669 mutex_lock(&ctl->cache_writeout_mutex); 3670 if (reserved_start < start) 3671 __btrfs_add_free_space(block_group, reserved_start, 3672 start - reserved_start, 3673 reserved_trim_state); 3674 if (start + bytes < reserved_start + reserved_bytes) 3675 __btrfs_add_free_space(block_group, end, reserved_end - end, 3676 reserved_trim_state); 3677 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3678 list_del(&trim_entry->list); 3679 mutex_unlock(&ctl->cache_writeout_mutex); 3680 3681 if (update) { 3682 spin_lock(&space_info->lock); 3683 spin_lock(&block_group->lock); 3684 if (block_group->ro) 3685 space_info->bytes_readonly += reserved_bytes; 3686 block_group->reserved -= reserved_bytes; 3687 space_info->bytes_reserved -= reserved_bytes; 3688 spin_unlock(&block_group->lock); 3689 spin_unlock(&space_info->lock); 3690 } 3691 3692 return 
ret; 3693 } 3694 3695 /* 3696 * If @async is set, then we will trim 1 region and return. 3697 */ 3698 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3699 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3700 bool async) 3701 { 3702 struct btrfs_discard_ctl *discard_ctl = 3703 &block_group->fs_info->discard_ctl; 3704 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3705 struct btrfs_free_space *entry; 3706 struct rb_node *node; 3707 int ret = 0; 3708 u64 extent_start; 3709 u64 extent_bytes; 3710 enum btrfs_trim_state extent_trim_state; 3711 u64 bytes; 3712 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3713 3714 while (start < end) { 3715 struct btrfs_trim_range trim_entry; 3716 3717 mutex_lock(&ctl->cache_writeout_mutex); 3718 spin_lock(&ctl->tree_lock); 3719 3720 if (ctl->free_space < minlen) 3721 goto out_unlock; 3722 3723 entry = tree_search_offset(ctl, start, 0, 1); 3724 if (!entry) 3725 goto out_unlock; 3726 3727 /* Skip bitmaps and if async, already trimmed entries */ 3728 while (entry->bitmap || 3729 (async && btrfs_free_space_trimmed(entry))) { 3730 node = rb_next(&entry->offset_index); 3731 if (!node) 3732 goto out_unlock; 3733 entry = rb_entry(node, struct btrfs_free_space, 3734 offset_index); 3735 } 3736 3737 if (entry->offset >= end) 3738 goto out_unlock; 3739 3740 extent_start = entry->offset; 3741 extent_bytes = entry->bytes; 3742 extent_trim_state = entry->trim_state; 3743 if (async) { 3744 start = entry->offset; 3745 bytes = entry->bytes; 3746 if (bytes < minlen) { 3747 spin_unlock(&ctl->tree_lock); 3748 mutex_unlock(&ctl->cache_writeout_mutex); 3749 goto next; 3750 } 3751 unlink_free_space(ctl, entry, true); 3752 /* 3753 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3754 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3755 * X when we come back around. So trim it now. 
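 *
 * Illustrative numbers (not the real defaults): with a 64MiB
 * max_discard_size and a 256KiB minimum filter, an extent of
 * 64MiB + 128KiB is discarded in one go, while a 66MiB extent is
 * split below: 64MiB is trimmed now and the remaining 2MiB is
 * relinked so a later pass picks it up.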
3756 */ 3757 if (max_discard_size && 3758 bytes >= (max_discard_size + 3759 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3760 bytes = max_discard_size; 3761 extent_bytes = max_discard_size; 3762 entry->offset += max_discard_size; 3763 entry->bytes -= max_discard_size; 3764 link_free_space(ctl, entry); 3765 } else { 3766 kmem_cache_free(btrfs_free_space_cachep, entry); 3767 } 3768 } else { 3769 start = max(start, extent_start); 3770 bytes = min(extent_start + extent_bytes, end) - start; 3771 if (bytes < minlen) { 3772 spin_unlock(&ctl->tree_lock); 3773 mutex_unlock(&ctl->cache_writeout_mutex); 3774 goto next; 3775 } 3776 3777 unlink_free_space(ctl, entry, true); 3778 kmem_cache_free(btrfs_free_space_cachep, entry); 3779 } 3780 3781 spin_unlock(&ctl->tree_lock); 3782 trim_entry.start = extent_start; 3783 trim_entry.bytes = extent_bytes; 3784 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3785 mutex_unlock(&ctl->cache_writeout_mutex); 3786 3787 ret = do_trimming(block_group, total_trimmed, start, bytes, 3788 extent_start, extent_bytes, extent_trim_state, 3789 &trim_entry); 3790 if (ret) { 3791 block_group->discard_cursor = start + bytes; 3792 break; 3793 } 3794 next: 3795 start += bytes; 3796 block_group->discard_cursor = start; 3797 if (async && *total_trimmed) 3798 break; 3799 3800 if (fatal_signal_pending(current)) { 3801 ret = -ERESTARTSYS; 3802 break; 3803 } 3804 3805 cond_resched(); 3806 } 3807 3808 return ret; 3809 3810 out_unlock: 3811 block_group->discard_cursor = btrfs_block_group_end(block_group); 3812 spin_unlock(&ctl->tree_lock); 3813 mutex_unlock(&ctl->cache_writeout_mutex); 3814 3815 return ret; 3816 } 3817 3818 /* 3819 * If we break out of trimming a bitmap prematurely, we should reset the 3820 * trimming bit. In a rather contrieved case, it's possible to race here so 3821 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3822 * 3823 * start = start of bitmap 3824 * end = near end of bitmap 3825 * 3826 * Thread 1: Thread 2: 3827 * trim_bitmaps(start) 3828 * trim_bitmaps(end) 3829 * end_trimming_bitmap() 3830 * reset_trimming_bitmap() 3831 */ 3832 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3833 { 3834 struct btrfs_free_space *entry; 3835 3836 spin_lock(&ctl->tree_lock); 3837 entry = tree_search_offset(ctl, offset, 1, 0); 3838 if (entry) { 3839 if (btrfs_free_space_trimmed(entry)) { 3840 ctl->discardable_extents[BTRFS_STAT_CURR] += 3841 entry->bitmap_extents; 3842 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3843 } 3844 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3845 } 3846 3847 spin_unlock(&ctl->tree_lock); 3848 } 3849 3850 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3851 struct btrfs_free_space *entry) 3852 { 3853 if (btrfs_free_space_trimming_bitmap(entry)) { 3854 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3855 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3856 entry->bitmap_extents; 3857 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3858 } 3859 } 3860 3861 /* 3862 * If @async is set, then we will trim 1 region and return. 
3863 */ 3864 static int trim_bitmaps(struct btrfs_block_group *block_group, 3865 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3866 u64 maxlen, bool async) 3867 { 3868 struct btrfs_discard_ctl *discard_ctl = 3869 &block_group->fs_info->discard_ctl; 3870 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3871 struct btrfs_free_space *entry; 3872 int ret = 0; 3873 int ret2; 3874 u64 bytes; 3875 u64 offset = offset_to_bitmap(ctl, start); 3876 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3877 3878 while (offset < end) { 3879 bool next_bitmap = false; 3880 struct btrfs_trim_range trim_entry; 3881 3882 mutex_lock(&ctl->cache_writeout_mutex); 3883 spin_lock(&ctl->tree_lock); 3884 3885 if (ctl->free_space < minlen) { 3886 block_group->discard_cursor = 3887 btrfs_block_group_end(block_group); 3888 spin_unlock(&ctl->tree_lock); 3889 mutex_unlock(&ctl->cache_writeout_mutex); 3890 break; 3891 } 3892 3893 entry = tree_search_offset(ctl, offset, 1, 0); 3894 /* 3895 * Bitmaps are marked trimmed lossily now to prevent constant 3896 * discarding of the same bitmap (the reason why we are bound 3897 * by the filters). So, retrim the block group bitmaps when we 3898 * are preparing to punt to the unused_bgs list. This uses 3899 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3900 * which is the only discard index which sets minlen to 0. 3901 */ 3902 if (!entry || (async && minlen && start == offset && 3903 btrfs_free_space_trimmed(entry))) { 3904 spin_unlock(&ctl->tree_lock); 3905 mutex_unlock(&ctl->cache_writeout_mutex); 3906 next_bitmap = true; 3907 goto next; 3908 } 3909 3910 /* 3911 * Async discard bitmap trimming begins at by setting the start 3912 * to be key.objectid and the offset_to_bitmap() aligns to the 3913 * start of the bitmap. This lets us know we are fully 3914 * scanning the bitmap rather than only some portion of it. 3915 */ 3916 if (start == offset) 3917 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3918 3919 bytes = minlen; 3920 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3921 if (ret2 || start >= end) { 3922 /* 3923 * We lossily consider a bitmap trimmed if we only skip 3924 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3925 */ 3926 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3927 end_trimming_bitmap(ctl, entry); 3928 else 3929 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3930 spin_unlock(&ctl->tree_lock); 3931 mutex_unlock(&ctl->cache_writeout_mutex); 3932 next_bitmap = true; 3933 goto next; 3934 } 3935 3936 /* 3937 * We already trimmed a region, but are using the locking above 3938 * to reset the trim_state. 3939 */ 3940 if (async && *total_trimmed) { 3941 spin_unlock(&ctl->tree_lock); 3942 mutex_unlock(&ctl->cache_writeout_mutex); 3943 goto out; 3944 } 3945 3946 bytes = min(bytes, end - start); 3947 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3948 spin_unlock(&ctl->tree_lock); 3949 mutex_unlock(&ctl->cache_writeout_mutex); 3950 goto next; 3951 } 3952 3953 /* 3954 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3955 * If X < @minlen, we won't trim X when we come back around. 3956 * So trim it now. We differ here from trimming extents as we 3957 * don't keep individual state per bit. 
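 *
 * (Unlike the extent case above there is nothing to relink: only the
 * clamped max_discard_size worth of bits is cleared below, the rest of
 * the run keeps its bits set in the bitmap and is found again on a
 * later pass.)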
		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = max_discard_size + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

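/*
 * Trim only the regular extent entries of a block group; the bitmap variant
 * below handles only the bitmap entries.  With @async set, at most one region
 * is trimmed per call (see the comment above trim_bitmaps()).
 */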
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

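/*
 * Create the two slab caches backing the in-memory free space cache: one for
 * the struct btrfs_free_space entries and a PAGE_SIZE aligned one for the
 * bitmaps they may point at.
 */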
int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does, and it acts a lot
 * like how the free space cache loading works, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

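/*
 * Illustrative usage from a sanity test (values are hypothetical):
 *
 *	test_add_free_space_entry(cache, 0, SZ_4M, false);
 *	test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
 *
 * The first call creates a plain extent entry, the second forces a bitmap
 * entry.  test_check_exists(cache, 0, SZ_4M) below would then return 1 as the
 * range has free space recorded.
 */
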
/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */