// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from the higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and the inode is then evicted
		 * from memory and re-read, the new in-memory inode won't have
		 * the IN_DEFRAG flag set. In that case we may still find an
		 * existing defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defrag record for the wanted inode. If it doesn't exist, get the
 * next one (in root/ino order).
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to better reflect disk order.
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf.
	 * btrfs_realloc_node() will lock all leaves from path->nodes[1], so
	 * set lowest_level to 1 to avoid a later deadlock (attempting to
	 * write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 * |	|<- File extent ->|
		 * \- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 * |<- file extent ->|	|
		 *			\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging current extent
	 * makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't do a writeback wait for each page here.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already got its extent locked
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular extent.
		 * This is for users who want to convert inline extents to
		 * regular ones through the max_inline= mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the extent will
		 * not be defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag, so it can be converted to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view about the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:    minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (This is mostly for autodefrag, which sets @max_to_defrag, so we may exit
 * early without defragging the whole range.)
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(inode, 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(inode, 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(inode, 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(inode, 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}
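
/*
 * For reference, a minimal userspace sketch (not part of this file's build)
 * showing how a manual defrag request reaches btrfs_defrag_file() via the
 * BTRFS_IOC_DEFRAG_RANGE ioctl. The values chosen here (whole-file range,
 * 256K extent threshold, START_IO flag) are illustrative assumptions only,
 * not recommendations.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	static int defrag_whole_file(const char *path)
 *	{
 *		struct btrfs_ioctl_defrag_range_args args = { 0 };
 *		int fd = open(path, O_RDWR);
 *		int ret;
 *
 *		if (fd < 0)
 *			return -1;
 *		args.start = 0;
 *		args.len = (__u64)-1;			// defrag until file end
 *		args.extent_thresh = 256 * 1024;	// skip extents >= 256K
 *		args.flags = BTRFS_DEFRAG_RANGE_START_IO;
 *		ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
 *		close(fd);
 *		return ret;
 *	}
 */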