// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (see inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}
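
/*
 * Illustrative ordering example (an addition for clarity, using hypothetical
 * values): records sort by root first, then by inode number.  Given
 * a = { .root = 5, .ino = 260 } and b = { .root = 5, .ino = 258 },
 * __compare_inode_defrag(&a, &b) returns 1, so a sorts to the right of b in
 * fs_info->defrag_inodes.
 */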

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag, then evict the inode from
		 * memory and re-read it, the new in-memory inode won't have
		 * the flag set.  In that case we may find an existing defrag
		 * record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}
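
/*
 * Sketch of a typical caller (illustrative only, based on the extent_thresh
 * comment in struct inode_defrag, not code from this file): the buffered
 * write path queues an inode for autodefrag when it creates an extent
 * smaller than the threshold, roughly:
 *
 *	if (btrfs_test_opt(fs_info, AUTO_DEFRAG) && small_write)
 *		btrfs_add_inode_defrag(NULL, inode, small_write);
 *
 * See inode_should_defrag() for the authoritative call site; the exact
 * condition shown here is an assumption.
 */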

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.  Read all the leaves and try to get
 * key order to better reflect disk order.
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely not the extent we want */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging current extent
	 * makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}
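
/*
 * Worked example with hypothetical sizes (not from the original source): for
 * extent_thresh = SZ_256K and a 64K current extent, a 32K next extent that is
 * newer than @newer_than, not preallocated and not at EOF makes this return
 * true, since rewriting both reduces the extent count.  It returns false once
 * the next extent reaches extent_thresh or the max extent capacity, because
 * defragging would no longer reduce the number of extents.
 */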

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more.  Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of targets file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if rewriting it will result in a
		 * regular extent.  This is for users who want to convert
		 * inline extents to regular ones through the max_inline=
		 * mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag, so that it gets converted to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}
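
/*
 * Merging example for the add: path above (hypothetical offsets): if the
 * ranges 0..64K and 64K..96K both qualify, the second one is not allocated as
 * a new entry; because last->start + last->len == cur, the existing target
 * simply grows to cover 0..96K.  A qualifying range starting at 128K would
 * allocate a new entry instead.
 */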

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
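
/*
 * With the common 4K page size this means 64 pages (SZ_256K / SZ_4K) per
 * cluster, which bounds the pages[] array allocated in defrag_one_range()
 * (see the ASSERT on nr_pages <= CLUSTER_SIZE / PAGE_SIZE there).
 */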

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent maps, re-check which
	 * range really needs to be defragged.
	 *
	 * This time we have the extent range locked already, so pass
	 * @locked = true so that we won't relock the extent range and cause
	 * a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}
1134a6a01ca6SJosef Bacik
1135a6a01ca6SJosef Bacik static int defrag_one_cluster(struct btrfs_inode *inode,
1136a6a01ca6SJosef Bacik struct file_ra_state *ra,
1137a6a01ca6SJosef Bacik u64 start, u32 len, u32 extent_thresh,
1138a6a01ca6SJosef Bacik u64 newer_than, bool do_compress,
1139a6a01ca6SJosef Bacik unsigned long *sectors_defragged,
1140a6a01ca6SJosef Bacik unsigned long max_sectors,
1141a6a01ca6SJosef Bacik u64 *last_scanned_ret)
1142a6a01ca6SJosef Bacik {
1143a6a01ca6SJosef Bacik const u32 sectorsize = inode->root->fs_info->sectorsize;
1144a6a01ca6SJosef Bacik struct defrag_target_range *entry;
1145a6a01ca6SJosef Bacik struct defrag_target_range *tmp;
1146a6a01ca6SJosef Bacik LIST_HEAD(target_list);
1147a6a01ca6SJosef Bacik int ret;
1148a6a01ca6SJosef Bacik
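/* Collect target ranges without the extent lock as a quick pre-check; defrag_one_range() re-collects them under the lock. */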
1149a6a01ca6SJosef Bacik ret = defrag_collect_targets(inode, start, len, extent_thresh,
1150a6a01ca6SJosef Bacik newer_than, do_compress, false,
1151a6a01ca6SJosef Bacik &target_list, NULL);
1152a6a01ca6SJosef Bacik if (ret < 0)
1153a6a01ca6SJosef Bacik goto out;
1154a6a01ca6SJosef Bacik
1155a6a01ca6SJosef Bacik list_for_each_entry(entry, &target_list, list) {
1156a6a01ca6SJosef Bacik u32 range_len = entry->len;
1157a6a01ca6SJosef Bacik
1158a6a01ca6SJosef Bacik /* Reached or beyond the limit */
1159a6a01ca6SJosef Bacik if (max_sectors && *sectors_defragged >= max_sectors) {
1160a6a01ca6SJosef Bacik ret = 1;
1161a6a01ca6SJosef Bacik break;
1162a6a01ca6SJosef Bacik }
1163a6a01ca6SJosef Bacik
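/* Clamp the range so we do not exceed the remaining sector budget. */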
1164a6a01ca6SJosef Bacik if (max_sectors)
1165a6a01ca6SJosef Bacik range_len = min_t(u32, range_len,
1166a6a01ca6SJosef Bacik (max_sectors - *sectors_defragged) * sectorsize);
1167a6a01ca6SJosef Bacik
1168a6a01ca6SJosef Bacik /*
1169a6a01ca6SJosef Bacik * If defrag_one_range() has updated last_scanned_ret,
1170a6a01ca6SJosef Bacik * our range may already be invalid (e.g. hole punched).
1171a6a01ca6SJosef Bacik * Skip if our range is before last_scanned_ret, as there is
1172a6a01ca6SJosef Bacik * no need to defrag the range anymore.
1173a6a01ca6SJosef Bacik */
1174a6a01ca6SJosef Bacik if (entry->start + range_len <= *last_scanned_ret)
1175a6a01ca6SJosef Bacik continue;
1176a6a01ca6SJosef Bacik
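/* Start readahead for the target range so the pages are more likely cached when we lock them below. */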
1177a6a01ca6SJosef Bacik if (ra)
1178a6a01ca6SJosef Bacik page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1179a6a01ca6SJosef Bacik ra, NULL, entry->start >> PAGE_SHIFT,
1180a6a01ca6SJosef Bacik ((entry->start + range_len - 1) >> PAGE_SHIFT) -
1181a6a01ca6SJosef Bacik (entry->start >> PAGE_SHIFT) + 1);
1182a6a01ca6SJosef Bacik /*
1183a6a01ca6SJosef Bacik * We may end up not defragging any range here if holes were punched
1184a6a01ca6SJosef Bacik * before we locked the pages.
1185a6a01ca6SJosef Bacik * That's fine, it only affects the @sectors_defragged
1186a6a01ca6SJosef Bacik * accounting.
1187a6a01ca6SJosef Bacik */
1188a6a01ca6SJosef Bacik ret = defrag_one_range(inode, entry->start, range_len,
1189a6a01ca6SJosef Bacik extent_thresh, newer_than, do_compress,
1190a6a01ca6SJosef Bacik last_scanned_ret);
1191a6a01ca6SJosef Bacik if (ret < 0)
1192a6a01ca6SJosef Bacik break;
1193a6a01ca6SJosef Bacik *sectors_defragged += range_len >>
1194a6a01ca6SJosef Bacik inode->root->fs_info->sectorsize_bits;
1195a6a01ca6SJosef Bacik }
1196a6a01ca6SJosef Bacik out:
1197a6a01ca6SJosef Bacik list_for_each_entry_safe(entry, tmp, &target_list, list) {
1198a6a01ca6SJosef Bacik list_del_init(&entry->list);
1199a6a01ca6SJosef Bacik kfree(entry);
1200a6a01ca6SJosef Bacik }
1201a6a01ca6SJosef Bacik if (ret >= 0)
1202a6a01ca6SJosef Bacik *last_scanned_ret = max(*last_scanned_ret, start + len);
1203a6a01ca6SJosef Bacik return ret;
1204a6a01ca6SJosef Bacik }
1205a6a01ca6SJosef Bacik
1206a6a01ca6SJosef Bacik /*
1207a6a01ca6SJosef Bacik * Entry point to file defragmentation.
1208a6a01ca6SJosef Bacik *
1209a6a01ca6SJosef Bacik * @inode: inode to be defragged
1210a6a01ca6SJosef Bacik * @ra: readahead state (can be NULL)
1211a6a01ca6SJosef Bacik * @range: defrag options including range and flags
1212a6a01ca6SJosef Bacik * @newer_than: minimum transid to defrag
1213a6a01ca6SJosef Bacik * @max_to_defrag: max number of sectors to be defragged; if 0, the whole inode
1214a6a01ca6SJosef Bacik * is defragged.
1215a6a01ca6SJosef Bacik *
1216a6a01ca6SJosef Bacik * Return <0 for error.
1217a6a01ca6SJosef Bacik * Return >=0 for the number of sectors defragged, and range->start will be updated
1218a6a01ca6SJosef Bacik * to indicate the file offset where the next defrag should start.
1219a6a01ca6SJosef Bacik * (Mostly for autodefrag, which sets @max_to_defrag and thus may exit early without
1220a6a01ca6SJosef Bacik * defragging the whole range).
1221a6a01ca6SJosef Bacik */
1222a6a01ca6SJosef Bacik int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
1223a6a01ca6SJosef Bacik struct btrfs_ioctl_defrag_range_args *range,
1224a6a01ca6SJosef Bacik u64 newer_than, unsigned long max_to_defrag)
1225a6a01ca6SJosef Bacik {
1226a6a01ca6SJosef Bacik struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1227a6a01ca6SJosef Bacik unsigned long sectors_defragged = 0;
1228a6a01ca6SJosef Bacik u64 isize = i_size_read(inode);
1229a6a01ca6SJosef Bacik u64 cur;
1230a6a01ca6SJosef Bacik u64 last_byte;
1231a6a01ca6SJosef Bacik bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
1232a6a01ca6SJosef Bacik bool ra_allocated = false;
1233a6a01ca6SJosef Bacik int compress_type = BTRFS_COMPRESS_ZLIB;
1234a6a01ca6SJosef Bacik int ret = 0;
1235a6a01ca6SJosef Bacik u32 extent_thresh = range->extent_thresh;
1236a6a01ca6SJosef Bacik pgoff_t start_index;
1237a6a01ca6SJosef Bacik
1238a6a01ca6SJosef Bacik if (isize == 0)
1239a6a01ca6SJosef Bacik return 0;
1240a6a01ca6SJosef Bacik
1241a6a01ca6SJosef Bacik if (range->start >= isize)
1242a6a01ca6SJosef Bacik return -EINVAL;
1243a6a01ca6SJosef Bacik
1244a6a01ca6SJosef Bacik if (do_compress) {
1245a6a01ca6SJosef Bacik if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1246a6a01ca6SJosef Bacik return -EINVAL;
1247a6a01ca6SJosef Bacik if (range->compress_type)
1248a6a01ca6SJosef Bacik compress_type = range->compress_type;
1249a6a01ca6SJosef Bacik }
1250a6a01ca6SJosef Bacik
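/* A zero threshold means the caller did not specify one; fall back to the 256K default. */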
1251a6a01ca6SJosef Bacik if (extent_thresh == 0)
1252a6a01ca6SJosef Bacik extent_thresh = SZ_256K;
1253a6a01ca6SJosef Bacik
1254a6a01ca6SJosef Bacik if (range->start + range->len > range->start) {
1255a6a01ca6SJosef Bacik /* Got a specific range */
1256a6a01ca6SJosef Bacik last_byte = min(isize, range->start + range->len);
1257a6a01ca6SJosef Bacik } else {
1258a6a01ca6SJosef Bacik /* Defrag until file end */
1259a6a01ca6SJosef Bacik last_byte = isize;
1260a6a01ca6SJosef Bacik }
1261a6a01ca6SJosef Bacik
1262a6a01ca6SJosef Bacik /* Align the range */
1263a6a01ca6SJosef Bacik cur = round_down(range->start, fs_info->sectorsize);
1264a6a01ca6SJosef Bacik last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1265a6a01ca6SJosef Bacik
1266a6a01ca6SJosef Bacik /*
1267a6a01ca6SJosef Bacik * If we were not given a readahead context, allocate one. As
1268a6a01ca6SJosef Bacik * readahead is only an optimization, defrag still works without it,
1269a6a01ca6SJosef Bacik * so don't error out if the allocation fails.
1270a6a01ca6SJosef Bacik */
1271a6a01ca6SJosef Bacik if (!ra) {
1272a6a01ca6SJosef Bacik ra_allocated = true;
1273a6a01ca6SJosef Bacik ra = kzalloc(sizeof(*ra), GFP_KERNEL);
1274a6a01ca6SJosef Bacik if (ra)
1275a6a01ca6SJosef Bacik file_ra_state_init(ra, inode->i_mapping);
1276a6a01ca6SJosef Bacik }
1277a6a01ca6SJosef Bacik
1278a6a01ca6SJosef Bacik /*
1279a6a01ca6SJosef Bacik * Make writeback start from the beginning of the range, so that the
1280a6a01ca6SJosef Bacik * defrag range can be written sequentially.
1281a6a01ca6SJosef Bacik */
1282a6a01ca6SJosef Bacik start_index = cur >> PAGE_SHIFT;
1283a6a01ca6SJosef Bacik if (start_index < inode->i_mapping->writeback_index)
1284a6a01ca6SJosef Bacik inode->i_mapping->writeback_index = start_index;
1285a6a01ca6SJosef Bacik
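/* Walk the requested range one cluster (up to SZ_256K) at a time. */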
1286a6a01ca6SJosef Bacik while (cur < last_byte) {
1287a6a01ca6SJosef Bacik const unsigned long prev_sectors_defragged = sectors_defragged;
1288a6a01ca6SJosef Bacik u64 last_scanned = cur;
1289a6a01ca6SJosef Bacik u64 cluster_end;
1290a6a01ca6SJosef Bacik
1291a6a01ca6SJosef Bacik if (btrfs_defrag_cancelled(fs_info)) {
1292a6a01ca6SJosef Bacik ret = -EAGAIN;
1293a6a01ca6SJosef Bacik break;
1294a6a01ca6SJosef Bacik }
1295a6a01ca6SJosef Bacik
1296a6a01ca6SJosef Bacik /* We want the cluster end at a page boundary when possible */
1297a6a01ca6SJosef Bacik cluster_end = (((cur >> PAGE_SHIFT) +
1298a6a01ca6SJosef Bacik (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1299a6a01ca6SJosef Bacik cluster_end = min(cluster_end, last_byte);
1300a6a01ca6SJosef Bacik
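/* Take the inode lock; an active swapfile cannot be defragged, and stop if the filesystem is being unmounted. */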
130129b6352bSDavid Sterba btrfs_inode_lock(BTRFS_I(inode), 0);
1302a6a01ca6SJosef Bacik if (IS_SWAPFILE(inode)) {
1303a6a01ca6SJosef Bacik ret = -ETXTBSY;
1304e5d4d75bSDavid Sterba btrfs_inode_unlock(BTRFS_I(inode), 0);
1305a6a01ca6SJosef Bacik break;
1306a6a01ca6SJosef Bacik }
1307a6a01ca6SJosef Bacik if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
1308e5d4d75bSDavid Sterba btrfs_inode_unlock(BTRFS_I(inode), 0);
1309a6a01ca6SJosef Bacik break;
1310a6a01ca6SJosef Bacik }
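/* Tell the writeback path which compression type to use for the defragged extents. */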
1311a6a01ca6SJosef Bacik if (do_compress)
1312a6a01ca6SJosef Bacik BTRFS_I(inode)->defrag_compress = compress_type;
1313a6a01ca6SJosef Bacik ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
1314a6a01ca6SJosef Bacik cluster_end + 1 - cur, extent_thresh,
1315a6a01ca6SJosef Bacik newer_than, do_compress, &sectors_defragged,
1316a6a01ca6SJosef Bacik max_to_defrag, &last_scanned);
1317a6a01ca6SJosef Bacik
1318a6a01ca6SJosef Bacik if (sectors_defragged > prev_sectors_defragged)
1319a6a01ca6SJosef Bacik balance_dirty_pages_ratelimited(inode->i_mapping);
1320a6a01ca6SJosef Bacik
1321e5d4d75bSDavid Sterba btrfs_inode_unlock(BTRFS_I(inode), 0);
1322a6a01ca6SJosef Bacik if (ret < 0)
1323a6a01ca6SJosef Bacik break;
1324a6a01ca6SJosef Bacik cur = max(cluster_end + 1, last_scanned);
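/* A positive return means the max_to_defrag limit was hit; stop without treating it as an error. */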
1325a6a01ca6SJosef Bacik if (ret > 0) {
1326a6a01ca6SJosef Bacik ret = 0;
1327a6a01ca6SJosef Bacik break;
1328a6a01ca6SJosef Bacik }
1329a6a01ca6SJosef Bacik cond_resched();
1330a6a01ca6SJosef Bacik }
1331a6a01ca6SJosef Bacik
1332a6a01ca6SJosef Bacik if (ra_allocated)
1333a6a01ca6SJosef Bacik kfree(ra);
1334a6a01ca6SJosef Bacik /*
1335a6a01ca6SJosef Bacik * Update range->start for autodefrag; it indicates where the next
1336a6a01ca6SJosef Bacik * run should start.
1337a6a01ca6SJosef Bacik */
1338a6a01ca6SJosef Bacik range->start = cur;
1339a6a01ca6SJosef Bacik if (sectors_defragged) {
1340a6a01ca6SJosef Bacik /*
1341a6a01ca6SJosef Bacik * We have defragged some sectors; in the compression case they
1342a6a01ca6SJosef Bacik * need to be written back immediately.
1343a6a01ca6SJosef Bacik */
1344a6a01ca6SJosef Bacik if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1345a6a01ca6SJosef Bacik filemap_flush(inode->i_mapping);
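/* The first flush may have queued async (compressed) extents; flush again so they are submitted as well. */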
1346a6a01ca6SJosef Bacik if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1347a6a01ca6SJosef Bacik &BTRFS_I(inode)->runtime_flags))
1348a6a01ca6SJosef Bacik filemap_flush(inode->i_mapping);
1349a6a01ca6SJosef Bacik }
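/* LZO and ZSTD compression are incompat features; make sure the corresponding flag is recorded. */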
1350a6a01ca6SJosef Bacik if (range->compress_type == BTRFS_COMPRESS_LZO)
1351a6a01ca6SJosef Bacik btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1352a6a01ca6SJosef Bacik else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1353a6a01ca6SJosef Bacik btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1354a6a01ca6SJosef Bacik ret = sectors_defragged;
1355a6a01ca6SJosef Bacik }
1356a6a01ca6SJosef Bacik if (do_compress) {
135729b6352bSDavid Sterba btrfs_inode_lock(BTRFS_I(inode), 0);
1358a6a01ca6SJosef Bacik BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
1359e5d4d75bSDavid Sterba btrfs_inode_unlock(BTRFS_I(inode), 0);
1360a6a01ca6SJosef Bacik }
1361a6a01ca6SJosef Bacik return ret;
1362a6a01ca6SJosef Bacik }
1363a6a01ca6SJosef Bacik
13646e3df18bSJosef Bacik void __cold btrfs_auto_defrag_exit(void)
13656e3df18bSJosef Bacik {
13666e3df18bSJosef Bacik kmem_cache_destroy(btrfs_inode_defrag_cachep);
13676e3df18bSJosef Bacik }
13686e3df18bSJosef Bacik
13696e3df18bSJosef Bacik int __init btrfs_auto_defrag_init(void)
13706e3df18bSJosef Bacik {
13716e3df18bSJosef Bacik btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
13726e3df18bSJosef Bacik sizeof(struct inode_defrag), 0,
13736e3df18bSJosef Bacik SLAB_MEM_SPREAD,
13746e3df18bSJosef Bacik NULL);
13756e3df18bSJosef Bacik if (!btrfs_inode_defrag_cachep)
13766e3df18bSJosef Bacik return -ENOMEM;
13776e3df18bSJosef Bacik
13786e3df18bSJosef Bacik return 0;
13796e3df18bSJosef Bacik }
1380