// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from the higher layer (inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree.  The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from
		 * memory, and then re-read this inode, the new in-memory
		 * inode doesn't have the IN_DEFRAG flag. In that case we may
		 * find an existing defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

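/*
 * Defrag one inode picked from the defrag tree.
 *
 * Calls btrfs_defrag_file() in batches of BTRFS_DEFRAG_BATCH sectors,
 * starting from the last defragged offset, until the end of the file is
 * reached, the fs is remounting/closing, or an error occurs. The defrag
 * record is freed in all cases.
 */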
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

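/*
 * Note on the overall autodefrag flow: with the "autodefrag" mount option
 * set, btrfs_add_inode_defrag() queues a record at write time, and the
 * cleaner thread (callers live outside this file) later calls
 * btrfs_run_defrag_inodes() to walk the rb-tree and defrag each recorded
 * inode via __btrfs_run_defrag_inode().
 */
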
/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to better reflect disk order.
 */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

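/*
 * Note: btrfs_defrag_leaves() returns 0 once the whole tree has been
 * processed, -EAGAIN when there is more work to do (the position is saved in
 * root->defrag_progress so the caller can continue in a later transaction),
 * or a negative errno on error.
 */
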
/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to an em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

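/*
 * Look up the extent map covering the sector at @start.
 *
 * First try the in-memory extent map tree without taking the extent lock. A
 * cached map that was merged is discarded when @newer_than is set and the
 * merged generation would wrongly pass the filter; in that case (or on a
 * cache miss) fall back to defrag_get_extent(), which reads the file extent
 * items from the subvolume tree, locking the range unless @locked says the
 * caller already holds the extent lock.
 */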
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

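/*
 * Check whether it is worth defragging @em based on the extent that follows
 * it. Returns true only when the next extent is also a reasonable defrag
 * target (small enough, new enough, not preallocated and not already at max
 * capacity), so that defragging the current extent can actually reduce the
 * total number of extents.
 */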
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag, but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

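/*
 * Illustrative example (assuming a 16K extent_thresh and that every extent
 * passes the generation checks): three adjacent 4K extents at file offsets
 * 0, 4K and 8K are each below the threshold, so defrag_collect_targets()
 * below merges them into a single defrag_target_range with start = 0 and
 * len = 12K.
 */
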
/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock of the range is already held
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular
		 * extent. This is for users who want to convert inline
		 * extents to regular ones through the max_inline= mount
		 * option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged. So if an inline extent passed all the
		 * above checks, just add it for defrag, so it can be converted
		 * to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate a new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

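/*
 * Note: defrag processes the file in clusters of CLUSTER_SIZE (256K).
 * defrag_one_cluster() collects the target ranges inside one cluster, and
 * defrag_one_range() then handles each target with its pages and extent
 * range locked.
 */
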
/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

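/*
 * Defrag one sector-aligned range of up to CLUSTER_SIZE bytes.
 *
 * Prepare and lock every page in the range, lock the extent range, collect
 * the targets again under the lock (the extent maps may have changed since
 * the unlocked scan), and pass each target to defrag_one_locked_target() so
 * its pages are marked dirty and flagged EXTENT_DEFRAG for writeback.
 */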
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent maps, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

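/*
 * Call chain overview: btrfs_defrag_file() walks the requested range in
 * CLUSTER_SIZE steps and calls defrag_one_cluster() for each cluster, which
 * calls defrag_one_range() for each collected target, which in turn calls
 * defrag_one_locked_target() once the pages and extent bits are locked.
 */
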
/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start at.
 * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early
 * without defragging the whole range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(inode, 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(inode, 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(inode, 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(inode, 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}