// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The fs_info's
 * defrag_inodes_lock must be held already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

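/*
 * Check if the filesystem is eligible for auto defrag: the autodefrag mount
 * option must be set and the filesystem must not be in the process of
 * closing.
 */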
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then the inode is evicted
		 * from memory and re-read, the new in-memory inode won't have
		 * the flag set. In that case we may find an existing record
		 * in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, return the
 * next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

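/*
 * Defrag a single inode from the defrag tree, in batches of up to
 * BTRFS_DEFRAG_BATCH sectors. After each btrfs_defrag_file() call we resume
 * from where the previous batch stopped, until the whole file has been
 * processed or auto defrag is no longer needed. The defrag record is freed
 * on all return paths.
 */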
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
 *
 * Read all the leaves and try to get key order to better reflect disk order.
 */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we can not find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely not the extent we want */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 * |	|<- File extent ->|
		 * \- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 * |<- file extent ->|	|
		 *		      \- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag, but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

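/*
 * A single contiguous, sector-aligned byte range that has been chosen as a
 * defrag target.
 */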
struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular
		 * extent. This is for users who want to convert inline
		 * extents to regular ones through the max_inline= mount
		 * option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the extent will
		 * not be defragged. So if an inline extent passed all the
		 * above checks, just add it for defrag, so it can be
		 * converted to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

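/*
 * Defrag works on clusters of 256K, which must be an exact multiple of the
 * page size, as asserted below.
 */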
#define CLUSTER_SIZE	(SZ_256K)
static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

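/*
 * Defrag one [start, start + len) byte range: prepare and lock the pages
 * covering it, re-collect the targets while holding the extent lock, then
 * mark each remaining target delalloc and dirty so a later writeback
 * rewrites it contiguously.
 */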
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

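/*
 * Defrag one cluster: collect the targets in [start, start + len),
 * optionally kick off readahead for each, and defrag them range by range
 * through defrag_one_range().
 *
 * Return <0 on error, 1 once @max_sectors has been reached, 0 otherwise.
 * @sectors_defragged and @last_scanned_ret are updated along the way.
 */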
static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag, so we may exit early
 * without defragging the whole range.)
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(inode, 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(inode, 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for compression case they
		 * need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(inode, 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(inode, 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}