/* xref: /openbmc/linux/fs/btrfs/defrag.c (revision 26d0dfbb16fcb17d128a79dc70f3020ea6992af0) */
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (i.e. from inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}
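
/*
 * For example, under this ordering a record for (root 5, ino 257) sorts
 * before (root 5, ino 260), which in turn sorts before (root 256, ino 1):
 * records are grouped by root objectid first and by inode number second.
 */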

/*
 * Insert a record for an inode into the defrag tree.  The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag, evict the inode from memory
		 * and then re-read it, the new in-memory inode won't have the
		 * IN_DEFRAG flag set.  In that case we may find an existing
		 * defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}
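
/*
 * Illustrative sketch (not part of the original source): a write path that
 * just created a small extent could queue the inode for a later autodefrag
 * pass as below.  The helper name and the threshold are hypothetical; in
 * btrfs this decision is made by inode_should_defrag().
 */
#if 0	/* example only, not compiled */
static void example_queue_small_extent(struct btrfs_inode *inode,
				       u64 start, u64 end)
{
	const u32 small_thresh = SZ_64K;	/* assumed threshold */

	/* A NULL transaction handle makes it use root->last_trans. */
	if (end + 1 - start < small_thresh)
		btrfs_add_inode_defrag(NULL, inode, small_thresh);
}
#endif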

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

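/*
 * Cap on how many sectors btrfs_defrag_file() may defrag per pass of
 * __btrfs_run_defrag_inode(), so that the remount and fs-closing checks at
 * the top of its loop get a chance to run between passes.
 */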
#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

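	/*
	 * btrfs_defrag_file() advances range.start to the last scanned
	 * position, so resume from there while still guaranteeing forward
	 * progress of at least one sector.
	 */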
	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag. */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order.
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf. */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}
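
/*
 * Illustrative sketch (not part of the original source): a caller such as
 * btrfs_defrag_root() drives btrfs_defrag_leaves() one transaction at a time
 * and retries while it returns -EAGAIN, roughly as below.  Error handling and
 * the BTRFS_ROOT_DEFRAG_RUNNING bookkeeping are omitted, so treat this as a
 * sketch rather than the exact upstream implementation.
 */
#if 0	/* example only, not compiled */
static void example_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	do {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			break;

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();
	} while (ret == -EAGAIN);
}
#endif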

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we can not find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then we
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to an em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This would just cause extra IO without really reducing the number
	 * of fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check the extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't wait for writeback on each individual page here.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more.  Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};
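
/*
 * Each entry describes one contiguous byte range [start, start + len) of the
 * file that was selected for defrag.  Adjacent qualifying ranges are merged
 * into a single entry by defrag_collect_targets() below.
 */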

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock for the range is already held
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fall through) if defragging will produce a
		 * regular extent.  This is for users who want to convert
		 * inline extents to regular ones through the max_inline=
		 * mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, so we skip the @extent_thresh and mergeable checks.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity; this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the extent would
		 * not be defragged.  So if an inline extent passed all the
		 * above checks, just add it for defrag, so it can be converted
		 * to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate a new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}
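
/*
 * Note the two-pass usage below: defrag_one_cluster() first calls the above
 * without the extent range locked to build a preliminary target list, then
 * defrag_one_range() repeats the collection with @locked = true while holding
 * the extent lock, to get a consistent view before dirtying pages.
 */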

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
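
/*
 * btrfs_defrag_file() walks the requested range in steps of up to
 * CLUSTER_SIZE and hands each step to defrag_one_cluster(), so at most
 * CLUSTER_SIZE / PAGE_SIZE pages are prepared and dirtied at a time.
 */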
1003a6a01ca6SJosef Bacik 
1004a6a01ca6SJosef Bacik /*
1005a6a01ca6SJosef Bacik  * Defrag one contiguous target range.
1006a6a01ca6SJosef Bacik  *
1007a6a01ca6SJosef Bacik  * @inode:	target inode
1008a6a01ca6SJosef Bacik  * @target:	target range to defrag
1009a6a01ca6SJosef Bacik  * @pages:	locked pages covering the defrag range
1010a6a01ca6SJosef Bacik  * @nr_pages:	number of locked pages
1011a6a01ca6SJosef Bacik  *
1012a6a01ca6SJosef Bacik  * Caller should ensure:
1013a6a01ca6SJosef Bacik  *
1014a6a01ca6SJosef Bacik  * - Pages are prepared
1015a6a01ca6SJosef Bacik  *   Pages should be locked, no ordered extent in the pages range,
1016a6a01ca6SJosef Bacik  *   no writeback.
1017a6a01ca6SJosef Bacik  *
1018a6a01ca6SJosef Bacik  * - Extent bits are locked
1019a6a01ca6SJosef Bacik  */
defrag_one_locked_target(struct btrfs_inode * inode,struct defrag_target_range * target,struct page ** pages,int nr_pages,struct extent_state ** cached_state)1020a6a01ca6SJosef Bacik static int defrag_one_locked_target(struct btrfs_inode *inode,
1021a6a01ca6SJosef Bacik 				    struct defrag_target_range *target,
1022a6a01ca6SJosef Bacik 				    struct page **pages, int nr_pages,
1023a6a01ca6SJosef Bacik 				    struct extent_state **cached_state)
1024a6a01ca6SJosef Bacik {
1025a6a01ca6SJosef Bacik 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1026a6a01ca6SJosef Bacik 	struct extent_changeset *data_reserved = NULL;
1027a6a01ca6SJosef Bacik 	const u64 start = target->start;
1028a6a01ca6SJosef Bacik 	const u64 len = target->len;
1029a6a01ca6SJosef Bacik 	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
1030a6a01ca6SJosef Bacik 	unsigned long start_index = start >> PAGE_SHIFT;
1031a6a01ca6SJosef Bacik 	unsigned long first_index = page_index(pages[0]);
1032a6a01ca6SJosef Bacik 	int ret = 0;
1033a6a01ca6SJosef Bacik 	int i;
1034a6a01ca6SJosef Bacik 
1035a6a01ca6SJosef Bacik 	ASSERT(last_index - first_index + 1 <= nr_pages);
1036a6a01ca6SJosef Bacik 
1037a6a01ca6SJosef Bacik 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
1038a6a01ca6SJosef Bacik 	if (ret < 0)
1039a6a01ca6SJosef Bacik 		return ret;
1040a6a01ca6SJosef Bacik 	clear_extent_bit(&inode->io_tree, start, start + len - 1,
1041a6a01ca6SJosef Bacik 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1042a6a01ca6SJosef Bacik 			 EXTENT_DEFRAG, cached_state);
1043dc5646c1SDavid Sterba 	set_extent_bit(&inode->io_tree, start, start + len - 1,
10441d126800SDavid Sterba 		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
1045a6a01ca6SJosef Bacik 
1046a6a01ca6SJosef Bacik 	/* Update the page status */
1047a6a01ca6SJosef Bacik 	for (i = start_index - first_index; i <= last_index - first_index; i++) {
1048a6a01ca6SJosef Bacik 		ClearPageChecked(pages[i]);
1049a6a01ca6SJosef Bacik 		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
1050a6a01ca6SJosef Bacik 	}
1051a6a01ca6SJosef Bacik 	btrfs_delalloc_release_extents(inode, len);
1052a6a01ca6SJosef Bacik 	extent_changeset_free(data_reserved);
1053a6a01ca6SJosef Bacik 
1054a6a01ca6SJosef Bacik 	return ret;
1055a6a01ca6SJosef Bacik }
1056a6a01ca6SJosef Bacik 
defrag_one_range(struct btrfs_inode * inode,u64 start,u32 len,u32 extent_thresh,u64 newer_than,bool do_compress,u64 * last_scanned_ret)1057a6a01ca6SJosef Bacik static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1058a6a01ca6SJosef Bacik 			    u32 extent_thresh, u64 newer_than, bool do_compress,
1059a6a01ca6SJosef Bacik 			    u64 *last_scanned_ret)
1060a6a01ca6SJosef Bacik {
1061a6a01ca6SJosef Bacik 	struct extent_state *cached_state = NULL;
1062a6a01ca6SJosef Bacik 	struct defrag_target_range *entry;
1063a6a01ca6SJosef Bacik 	struct defrag_target_range *tmp;
1064a6a01ca6SJosef Bacik 	LIST_HEAD(target_list);
1065a6a01ca6SJosef Bacik 	struct page **pages;
1066a6a01ca6SJosef Bacik 	const u32 sectorsize = inode->root->fs_info->sectorsize;
1067a6a01ca6SJosef Bacik 	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
1068a6a01ca6SJosef Bacik 	u64 start_index = start >> PAGE_SHIFT;
1069a6a01ca6SJosef Bacik 	unsigned int nr_pages = last_index - start_index + 1;
1070a6a01ca6SJosef Bacik 	int ret = 0;
1071a6a01ca6SJosef Bacik 	int i;
1072a6a01ca6SJosef Bacik 
1073a6a01ca6SJosef Bacik 	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1074a6a01ca6SJosef Bacik 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1075a6a01ca6SJosef Bacik 
1076a6a01ca6SJosef Bacik 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
1077a6a01ca6SJosef Bacik 	if (!pages)
1078a6a01ca6SJosef Bacik 		return -ENOMEM;
1079a6a01ca6SJosef Bacik 
1080a6a01ca6SJosef Bacik 	/* Prepare all pages */
1081a6a01ca6SJosef Bacik 	for (i = 0; i < nr_pages; i++) {
1082a6a01ca6SJosef Bacik 		pages[i] = defrag_prepare_one_page(inode, start_index + i);
1083a6a01ca6SJosef Bacik 		if (IS_ERR(pages[i])) {
1084a6a01ca6SJosef Bacik 			ret = PTR_ERR(pages[i]);
1085a6a01ca6SJosef Bacik 			pages[i] = NULL;
1086a6a01ca6SJosef Bacik 			goto free_pages;
1087a6a01ca6SJosef Bacik 		}
1088a6a01ca6SJosef Bacik 	}
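	/* Make sure no writeback is still running on any of the pages. */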
1089a6a01ca6SJosef Bacik 	for (i = 0; i < nr_pages; i++)
1090a6a01ca6SJosef Bacik 		wait_on_page_writeback(pages[i]);
1091a6a01ca6SJosef Bacik 
1092a6a01ca6SJosef Bacik 	/* Lock the extent range covering the pages */
1093a6a01ca6SJosef Bacik 	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
1094a6a01ca6SJosef Bacik 		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1095a6a01ca6SJosef Bacik 		    &cached_state);
1096a6a01ca6SJosef Bacik 	/*
1097a6a01ca6SJosef Bacik 	 * Now that we have a consistent view of the extent map, re-check
1098a6a01ca6SJosef Bacik 	 * which ranges really need to be defragged.
1099a6a01ca6SJosef Bacik 	 *
1100a6a01ca6SJosef Bacik 	 * This time the extent range is already locked, so pass @locked = true
1101a6a01ca6SJosef Bacik 	 * to avoid relocking it and deadlocking.
1102a6a01ca6SJosef Bacik 	 */
1103a6a01ca6SJosef Bacik 	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1104a6a01ca6SJosef Bacik 				     newer_than, do_compress, true,
1105a6a01ca6SJosef Bacik 				     &target_list, last_scanned_ret);
1106a6a01ca6SJosef Bacik 	if (ret < 0)
1107a6a01ca6SJosef Bacik 		goto unlock_extent;
1108a6a01ca6SJosef Bacik 
1109a6a01ca6SJosef Bacik 	list_for_each_entry(entry, &target_list, list) {
1110a6a01ca6SJosef Bacik 		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
1111a6a01ca6SJosef Bacik 					       &cached_state);
1112a6a01ca6SJosef Bacik 		if (ret < 0)
1113a6a01ca6SJosef Bacik 			break;
1114a6a01ca6SJosef Bacik 	}
1115a6a01ca6SJosef Bacik 
1116a6a01ca6SJosef Bacik 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1117a6a01ca6SJosef Bacik 		list_del_init(&entry->list);
1118a6a01ca6SJosef Bacik 		kfree(entry);
1119a6a01ca6SJosef Bacik 	}
1120a6a01ca6SJosef Bacik unlock_extent:
1121a6a01ca6SJosef Bacik 	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
1122a6a01ca6SJosef Bacik 		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1123a6a01ca6SJosef Bacik 		      &cached_state);
1124a6a01ca6SJosef Bacik free_pages:
1125a6a01ca6SJosef Bacik 	for (i = 0; i < nr_pages; i++) {
1126a6a01ca6SJosef Bacik 		if (pages[i]) {
1127a6a01ca6SJosef Bacik 			unlock_page(pages[i]);
1128a6a01ca6SJosef Bacik 			put_page(pages[i]);
1129a6a01ca6SJosef Bacik 		}
1130a6a01ca6SJosef Bacik 	}
1131a6a01ca6SJosef Bacik 	kfree(pages);
1132a6a01ca6SJosef Bacik 	return ret;
1133a6a01ca6SJosef Bacik }
1134a6a01ca6SJosef Bacik 
1135a6a01ca6SJosef Bacik static int defrag_one_cluster(struct btrfs_inode *inode,
1136a6a01ca6SJosef Bacik 			      struct file_ra_state *ra,
1137a6a01ca6SJosef Bacik 			      u64 start, u32 len, u32 extent_thresh,
1138a6a01ca6SJosef Bacik 			      u64 newer_than, bool do_compress,
1139a6a01ca6SJosef Bacik 			      unsigned long *sectors_defragged,
1140a6a01ca6SJosef Bacik 			      unsigned long max_sectors,
1141a6a01ca6SJosef Bacik 			      u64 *last_scanned_ret)
1142a6a01ca6SJosef Bacik {
1143a6a01ca6SJosef Bacik 	const u32 sectorsize = inode->root->fs_info->sectorsize;
1144a6a01ca6SJosef Bacik 	struct defrag_target_range *entry;
1145a6a01ca6SJosef Bacik 	struct defrag_target_range *tmp;
1146a6a01ca6SJosef Bacik 	LIST_HEAD(target_list);
1147a6a01ca6SJosef Bacik 	int ret;
1148a6a01ca6SJosef Bacik 
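	/*
	 * First pass without the extent range locked: just collect the
	 * candidate ranges.  defrag_one_range() re-checks each of them under
	 * the extent lock before doing any work.
	 */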
1149a6a01ca6SJosef Bacik 	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1150a6a01ca6SJosef Bacik 				     newer_than, do_compress, false,
1151a6a01ca6SJosef Bacik 				     &target_list, NULL);
1152a6a01ca6SJosef Bacik 	if (ret < 0)
1153a6a01ca6SJosef Bacik 		goto out;
1154a6a01ca6SJosef Bacik 
1155a6a01ca6SJosef Bacik 	list_for_each_entry(entry, &target_list, list) {
1156a6a01ca6SJosef Bacik 		u32 range_len = entry->len;
1157a6a01ca6SJosef Bacik 
1158a6a01ca6SJosef Bacik 		/* Reached or beyond the limit */
1159a6a01ca6SJosef Bacik 		if (max_sectors && *sectors_defragged >= max_sectors) {
1160a6a01ca6SJosef Bacik 			ret = 1;
1161a6a01ca6SJosef Bacik 			break;
1162a6a01ca6SJosef Bacik 		}
1163a6a01ca6SJosef Bacik 
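		/*
		 * If a limit was given, clamp this target to the sectors still
		 * allowed before @max_sectors is reached.
		 */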
1164a6a01ca6SJosef Bacik 		if (max_sectors)
1165a6a01ca6SJosef Bacik 			range_len = min_t(u32, range_len,
1166a6a01ca6SJosef Bacik 				(max_sectors - *sectors_defragged) * sectorsize);
1167a6a01ca6SJosef Bacik 
1168a6a01ca6SJosef Bacik 		/*
1169a6a01ca6SJosef Bacik 		 * If defrag_one_range() has updated last_scanned_ret,
1170a6a01ca6SJosef Bacik 		 * our range may already be invalid (e.g. hole punched).
1171a6a01ca6SJosef Bacik 		 * Skip if our range is before last_scanned_ret, as there is
1172a6a01ca6SJosef Bacik 		 * no need to defrag the range anymore.
1173a6a01ca6SJosef Bacik 		 */
1174a6a01ca6SJosef Bacik 		if (entry->start + range_len <= *last_scanned_ret)
1175a6a01ca6SJosef Bacik 			continue;
1176a6a01ca6SJosef Bacik 
1177a6a01ca6SJosef Bacik 		if (ra)
1178a6a01ca6SJosef Bacik 			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1179a6a01ca6SJosef Bacik 				ra, NULL, entry->start >> PAGE_SHIFT,
1180a6a01ca6SJosef Bacik 				((entry->start + range_len - 1) >> PAGE_SHIFT) -
1181a6a01ca6SJosef Bacik 				(entry->start >> PAGE_SHIFT) + 1);
1182a6a01ca6SJosef Bacik 		/*
1183a6a01ca6SJosef Bacik 		 * Here we may not defrag any range if holes are punched before
1184a6a01ca6SJosef Bacik 		 * we locked the pages.
1185a6a01ca6SJosef Bacik 		 * But that's fine, it only affects the @sectors_defragged
1186a6a01ca6SJosef Bacik 		 * accounting.
1187a6a01ca6SJosef Bacik 		 */
1188a6a01ca6SJosef Bacik 		ret = defrag_one_range(inode, entry->start, range_len,
1189a6a01ca6SJosef Bacik 				       extent_thresh, newer_than, do_compress,
1190a6a01ca6SJosef Bacik 				       last_scanned_ret);
1191a6a01ca6SJosef Bacik 		if (ret < 0)
1192a6a01ca6SJosef Bacik 			break;
1193a6a01ca6SJosef Bacik 		*sectors_defragged += range_len >>
1194a6a01ca6SJosef Bacik 				      inode->root->fs_info->sectorsize_bits;
1195a6a01ca6SJosef Bacik 	}
1196a6a01ca6SJosef Bacik out:
1197a6a01ca6SJosef Bacik 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1198a6a01ca6SJosef Bacik 		list_del_init(&entry->list);
1199a6a01ca6SJosef Bacik 		kfree(entry);
1200a6a01ca6SJosef Bacik 	}
1201a6a01ca6SJosef Bacik 	if (ret >= 0)
1202a6a01ca6SJosef Bacik 		*last_scanned_ret = max(*last_scanned_ret, start + len);
1203a6a01ca6SJosef Bacik 	return ret;
1204a6a01ca6SJosef Bacik }
1205a6a01ca6SJosef Bacik 
1206a6a01ca6SJosef Bacik /*
1207a6a01ca6SJosef Bacik  * Entry point to file defragmentation.
1208a6a01ca6SJosef Bacik  *
1209a6a01ca6SJosef Bacik  * @inode:	   inode to be defragged
1210a6a01ca6SJosef Bacik  * @ra:		   readahead state (can be NULL)
1211a6a01ca6SJosef Bacik  * @range:	   defrag options including range and flags
1212a6a01ca6SJosef Bacik  * @newer_than:	   minimum transid to defrag
1213a6a01ca6SJosef Bacik  * @max_to_defrag: max number of sectors to be defragged; if 0, the whole inode
1214a6a01ca6SJosef Bacik  *		   will be defragged.
1215a6a01ca6SJosef Bacik  *
1216a6a01ca6SJosef Bacik  * Return <0 for error.
1217a6a01ca6SJosef Bacik  * Return >=0 for the number of sectors defragged, in which case range->start is
1218a6a01ca6SJosef Bacik  * updated to indicate the file offset where the next defrag should start.
1219a6a01ca6SJosef Bacik  * (Mostly useful for autodefrag, which sets @max_to_defrag, so we may exit early
1220a6a01ca6SJosef Bacik  *  without defragging the whole range.)
1221a6a01ca6SJosef Bacik  */
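/*
 * A minimal usage sketch (an assumed caller, not code from this file):
 * defrag an entire file with default thresholds and kick off writeback on
 * whatever was defragged:
 *
 *	struct btrfs_ioctl_defrag_range_args range = { 0 };
 *	int ret;
 *
 *	range.len = (u64)-1;
 *	range.flags = BTRFS_DEFRAG_RANGE_START_IO;
 *	ret = btrfs_defrag_file(inode, NULL, &range, 0, 0);
 *	if (ret < 0)
 *		return ret;
 *
 *	Here ret is the number of sectors defragged and range.start points at
 *	the next offset to continue from.
 */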
1222a6a01ca6SJosef Bacik int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
1223a6a01ca6SJosef Bacik 		      struct btrfs_ioctl_defrag_range_args *range,
1224a6a01ca6SJosef Bacik 		      u64 newer_than, unsigned long max_to_defrag)
1225a6a01ca6SJosef Bacik {
1226a6a01ca6SJosef Bacik 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1227a6a01ca6SJosef Bacik 	unsigned long sectors_defragged = 0;
1228a6a01ca6SJosef Bacik 	u64 isize = i_size_read(inode);
1229a6a01ca6SJosef Bacik 	u64 cur;
1230a6a01ca6SJosef Bacik 	u64 last_byte;
1231a6a01ca6SJosef Bacik 	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
1232a6a01ca6SJosef Bacik 	bool ra_allocated = false;
1233a6a01ca6SJosef Bacik 	int compress_type = BTRFS_COMPRESS_ZLIB;
1234a6a01ca6SJosef Bacik 	int ret = 0;
1235a6a01ca6SJosef Bacik 	u32 extent_thresh = range->extent_thresh;
1236a6a01ca6SJosef Bacik 	pgoff_t start_index;
1237a6a01ca6SJosef Bacik 
1238a6a01ca6SJosef Bacik 	if (isize == 0)
1239a6a01ca6SJosef Bacik 		return 0;
1240a6a01ca6SJosef Bacik 
1241a6a01ca6SJosef Bacik 	if (range->start >= isize)
1242a6a01ca6SJosef Bacik 		return -EINVAL;
1243a6a01ca6SJosef Bacik 
1244a6a01ca6SJosef Bacik 	if (do_compress) {
1245a6a01ca6SJosef Bacik 		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1246a6a01ca6SJosef Bacik 			return -EINVAL;
1247a6a01ca6SJosef Bacik 		if (range->compress_type)
1248a6a01ca6SJosef Bacik 			compress_type = range->compress_type;
1249a6a01ca6SJosef Bacik 	}
1250a6a01ca6SJosef Bacik 
1251a6a01ca6SJosef Bacik 	if (extent_thresh == 0)
1252a6a01ca6SJosef Bacik 		extent_thresh = SZ_256K;
1253a6a01ca6SJosef Bacik 
1254a6a01ca6SJosef Bacik 	if (range->start + range->len > range->start) {
1255a6a01ca6SJosef Bacik 		/* Got a specific range */
1256a6a01ca6SJosef Bacik 		last_byte = min(isize, range->start + range->len);
1257a6a01ca6SJosef Bacik 	} else {
1258a6a01ca6SJosef Bacik 		/* Defrag until file end */
1259a6a01ca6SJosef Bacik 		last_byte = isize;
1260a6a01ca6SJosef Bacik 	}
1261a6a01ca6SJosef Bacik 
1262a6a01ca6SJosef Bacik 	/* Align the range */
1263a6a01ca6SJosef Bacik 	cur = round_down(range->start, fs_info->sectorsize);
1264a6a01ca6SJosef Bacik 	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1265a6a01ca6SJosef Bacik 
1266a6a01ca6SJosef Bacik 	/*
1267a6a01ca6SJosef Bacik 	 * If we were not given a ra, allocate a readahead context. As
1268a6a01ca6SJosef Bacik 	 * readahead is just an optimization, defrag will work without it, so
1269a6a01ca6SJosef Bacik 	 * we don't error out if the allocation fails.
1270a6a01ca6SJosef Bacik 	 */
1271a6a01ca6SJosef Bacik 	if (!ra) {
1272a6a01ca6SJosef Bacik 		ra_allocated = true;
1273a6a01ca6SJosef Bacik 		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
1274a6a01ca6SJosef Bacik 		if (ra)
1275a6a01ca6SJosef Bacik 			file_ra_state_init(ra, inode->i_mapping);
1276a6a01ca6SJosef Bacik 	}
1277a6a01ca6SJosef Bacik 
1278a6a01ca6SJosef Bacik 	/*
1279a6a01ca6SJosef Bacik 	 * Make writeback start from the beginning of the range, so that the
1280a6a01ca6SJosef Bacik 	 * defrag range can be written sequentially.
1281a6a01ca6SJosef Bacik 	 */
1282a6a01ca6SJosef Bacik 	start_index = cur >> PAGE_SHIFT;
1283a6a01ca6SJosef Bacik 	if (start_index < inode->i_mapping->writeback_index)
1284a6a01ca6SJosef Bacik 		inode->i_mapping->writeback_index = start_index;
1285a6a01ca6SJosef Bacik 
1286a6a01ca6SJosef Bacik 	while (cur < last_byte) {
1287a6a01ca6SJosef Bacik 		const unsigned long prev_sectors_defragged = sectors_defragged;
1288a6a01ca6SJosef Bacik 		u64 last_scanned = cur;
1289a6a01ca6SJosef Bacik 		u64 cluster_end;
1290a6a01ca6SJosef Bacik 
1291a6a01ca6SJosef Bacik 		if (btrfs_defrag_cancelled(fs_info)) {
1292a6a01ca6SJosef Bacik 			ret = -EAGAIN;
1293a6a01ca6SJosef Bacik 			break;
1294a6a01ca6SJosef Bacik 		}
1295a6a01ca6SJosef Bacik 
1296a6a01ca6SJosef Bacik 		/* We want the cluster end to be at a page boundary when possible */
1297a6a01ca6SJosef Bacik 		cluster_end = (((cur >> PAGE_SHIFT) +
1298a6a01ca6SJosef Bacik 			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1299a6a01ca6SJosef Bacik 		cluster_end = min(cluster_end, last_byte);
1300a6a01ca6SJosef Bacik 
130129b6352bSDavid Sterba 		btrfs_inode_lock(BTRFS_I(inode), 0);
1302a6a01ca6SJosef Bacik 		if (IS_SWAPFILE(inode)) {
1303a6a01ca6SJosef Bacik 			ret = -ETXTBSY;
1304e5d4d75bSDavid Sterba 			btrfs_inode_unlock(BTRFS_I(inode), 0);
1305a6a01ca6SJosef Bacik 			break;
1306a6a01ca6SJosef Bacik 		}
1307a6a01ca6SJosef Bacik 		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
1308e5d4d75bSDavid Sterba 			btrfs_inode_unlock(BTRFS_I(inode), 0);
1309a6a01ca6SJosef Bacik 			break;
1310a6a01ca6SJosef Bacik 		}
1311a6a01ca6SJosef Bacik 		if (do_compress)
1312a6a01ca6SJosef Bacik 			BTRFS_I(inode)->defrag_compress = compress_type;
1313a6a01ca6SJosef Bacik 		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
1314a6a01ca6SJosef Bacik 				cluster_end + 1 - cur, extent_thresh,
1315a6a01ca6SJosef Bacik 				newer_than, do_compress, &sectors_defragged,
1316a6a01ca6SJosef Bacik 				max_to_defrag, &last_scanned);
1317a6a01ca6SJosef Bacik 
1318a6a01ca6SJosef Bacik 		if (sectors_defragged > prev_sectors_defragged)
1319a6a01ca6SJosef Bacik 			balance_dirty_pages_ratelimited(inode->i_mapping);
1320a6a01ca6SJosef Bacik 
1321e5d4d75bSDavid Sterba 		btrfs_inode_unlock(BTRFS_I(inode), 0);
1322a6a01ca6SJosef Bacik 		if (ret < 0)
1323a6a01ca6SJosef Bacik 			break;
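		/*
		 * The last collected target may extend past this cluster, in
		 * which case last_scanned is beyond cluster_end; continue from
		 * whichever is further so the same range is not scanned twice.
		 */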
1324a6a01ca6SJosef Bacik 		cur = max(cluster_end + 1, last_scanned);
1325a6a01ca6SJosef Bacik 		if (ret > 0) {
1326a6a01ca6SJosef Bacik 			ret = 0;
1327a6a01ca6SJosef Bacik 			break;
1328a6a01ca6SJosef Bacik 		}
1329a6a01ca6SJosef Bacik 		cond_resched();
1330a6a01ca6SJosef Bacik 	}
1331a6a01ca6SJosef Bacik 
1332a6a01ca6SJosef Bacik 	if (ra_allocated)
1333a6a01ca6SJosef Bacik 		kfree(ra);
1334a6a01ca6SJosef Bacik 	/*
1335a6a01ca6SJosef Bacik 	 * Update range->start for autodefrag, so it indicates where the next
1336a6a01ca6SJosef Bacik 	 * run should start.
1337a6a01ca6SJosef Bacik 	 */
1338a6a01ca6SJosef Bacik 	range->start = cur;
1339a6a01ca6SJosef Bacik 	if (sectors_defragged) {
1340a6a01ca6SJosef Bacik 		/*
1341a6a01ca6SJosef Bacik 		 * We have defragged some sectors; in the compression case they
1342a6a01ca6SJosef Bacik 		 * need to be written back immediately.
1343a6a01ca6SJosef Bacik 		 */
1344a6a01ca6SJosef Bacik 		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1345a6a01ca6SJosef Bacik 			filemap_flush(inode->i_mapping);
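			/*
			 * The second flush is presumably here because async
			 * (compressed) extents may still be getting built when
			 * the first flush returns; flushing again makes sure
			 * their IO is started too.
			 */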
1346a6a01ca6SJosef Bacik 			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1347a6a01ca6SJosef Bacik 				     &BTRFS_I(inode)->runtime_flags))
1348a6a01ca6SJosef Bacik 				filemap_flush(inode->i_mapping);
1349a6a01ca6SJosef Bacik 		}
1350a6a01ca6SJosef Bacik 		if (range->compress_type == BTRFS_COMPRESS_LZO)
1351a6a01ca6SJosef Bacik 			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1352a6a01ca6SJosef Bacik 		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1353a6a01ca6SJosef Bacik 			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1354a6a01ca6SJosef Bacik 		ret = sectors_defragged;
1355a6a01ca6SJosef Bacik 	}
1356a6a01ca6SJosef Bacik 	if (do_compress) {
135729b6352bSDavid Sterba 		btrfs_inode_lock(BTRFS_I(inode), 0);
1358a6a01ca6SJosef Bacik 		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
1359e5d4d75bSDavid Sterba 		btrfs_inode_unlock(BTRFS_I(inode), 0);
1360a6a01ca6SJosef Bacik 	}
1361a6a01ca6SJosef Bacik 	return ret;
1362a6a01ca6SJosef Bacik }
1363a6a01ca6SJosef Bacik 
13646e3df18bSJosef Bacik void __cold btrfs_auto_defrag_exit(void)
13656e3df18bSJosef Bacik {
13666e3df18bSJosef Bacik 	kmem_cache_destroy(btrfs_inode_defrag_cachep);
13676e3df18bSJosef Bacik }
13686e3df18bSJosef Bacik 
13696e3df18bSJosef Bacik int __init btrfs_auto_defrag_init(void)
13706e3df18bSJosef Bacik {
13716e3df18bSJosef Bacik 	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
13726e3df18bSJosef Bacik 					sizeof(struct inode_defrag), 0,
13736e3df18bSJosef Bacik 					SLAB_MEM_SPREAD,
13746e3df18bSJosef Bacik 					NULL);
13756e3df18bSJosef Bacik 	if (!btrfs_inode_defrag_cachep)
13766e3df18bSJosef Bacik 		return -ENOMEM;
13776e3df18bSJosef Bacik 
13786e3df18bSJosef Bacik 	return 0;
13796e3df18bSJosef Bacik }
1380