xref: /openbmc/linux/fs/btrfs/btrfs_inode.h (revision d3c6be6fdab48dd26af3d3e01c5972ffe20985b9)
19888c340SDavid Sterba /* SPDX-License-Identifier: GPL-2.0 */
26cbd5570SChris Mason /*
36cbd5570SChris Mason  * Copyright (C) 2007 Oracle.  All rights reserved.
46cbd5570SChris Mason  */
56cbd5570SChris Mason 
69888c340SDavid Sterba #ifndef BTRFS_INODE_H
79888c340SDavid Sterba #define BTRFS_INODE_H
82c90e5d6SChris Mason 
9778ba82bSFilipe David Borba Manana #include <linux/hash.h>
10a52d9a80SChris Mason #include "extent_map.h"
11d1310b2eSChris Mason #include "extent_io.h"
12e6dcd2dcSChris Mason #include "ordered-data.h"
1316cdcec7SMiao Xie #include "delayed-inode.h"
14a52d9a80SChris Mason 
/*
 * Per-inode runtime flags.  The values are used as bit numbers, e.g.
 * BTRFS_INODE_READDIO_NEED_LOCK is set and cleared with set_bit() and
 * clear_bit() on btrfs_inode::runtime_flags.
 *
 * BTRFS_INODE_ORDERED_DATA_CLOSE is set by truncate when a file that used
 * to hold good data has been truncated to zero.  While it is set, the btrfs
 * file release call adds the inode to the ordered operations list, so that
 * any new data the application may have written is flushed out before
 * commit.
 */
enum {
	BTRFS_INODE_ORDERED_DATA_CLOSE	= 0,
	BTRFS_INODE_DUMMY		= 1,
	BTRFS_INODE_IN_DEFRAG		= 2,
	BTRFS_INODE_HAS_ASYNC_EXTENT	= 3,
	BTRFS_INODE_NEEDS_FULL_SYNC	= 4,
	BTRFS_INODE_COPY_EVERYTHING	= 5,
	BTRFS_INODE_IN_DELALLOC_LIST	= 6,
	BTRFS_INODE_READDIO_NEED_LOCK	= 7,
	BTRFS_INODE_HAS_PROPS		= 8,
};
3372ac3c0dSJosef Bacik 
34f1ace244SAneesh /* in memory btrfs inode */
352c90e5d6SChris Mason struct btrfs_inode {
36d352ac68SChris Mason 	/* which subvolume this inode belongs to */
37d6e4a428SChris Mason 	struct btrfs_root *root;
38d352ac68SChris Mason 
39d352ac68SChris Mason 	/* key used to find this inode on disk.  This is used by the code
40d352ac68SChris Mason 	 * to read in roots of subvolumes
41d352ac68SChris Mason 	 */
42d6e4a428SChris Mason 	struct btrfs_key location;
43d352ac68SChris Mason 
442f2ff0eeSFilipe Manana 	/*
452f2ff0eeSFilipe Manana 	 * Lock for counters and all fields used to determine if the inode is in
462f2ff0eeSFilipe Manana 	 * the log or not (last_trans, last_sub_trans, last_log_commit,
472f2ff0eeSFilipe Manana 	 * logged_trans).
482f2ff0eeSFilipe Manana 	 */
499e0baf60SJosef Bacik 	spinlock_t lock;
509e0baf60SJosef Bacik 
51d352ac68SChris Mason 	/* the extent_tree has caches of all the extent mappings to disk */
52a52d9a80SChris Mason 	struct extent_map_tree extent_tree;
53d352ac68SChris Mason 
54d352ac68SChris Mason 	/* the io_tree does range state (DIRTY, LOCKED etc) */
55d1310b2eSChris Mason 	struct extent_io_tree io_tree;
56d352ac68SChris Mason 
57d352ac68SChris Mason 	/* special utility tree used to record which mirrors have already been
58d352ac68SChris Mason 	 * tried when checksums fail for a given block
59d352ac68SChris Mason 	 */
607e38326fSChris Mason 	struct extent_io_tree io_failure_tree;
61d352ac68SChris Mason 
62d352ac68SChris Mason 	/* held while logging the inode in tree-log.c */
63e02119d5SChris Mason 	struct mutex log_mutex;
64d352ac68SChris Mason 
65f248679eSJosef Bacik 	/* held while doing delalloc reservations */
66f248679eSJosef Bacik 	struct mutex delalloc_mutex;
67f248679eSJosef Bacik 
68d352ac68SChris Mason 	/* used to order data wrt metadata */
69e6dcd2dcSChris Mason 	struct btrfs_ordered_inode_tree ordered_tree;
7015ee9bc7SJosef Bacik 
71d352ac68SChris Mason 	/* list of all the delalloc inodes in the FS.  There are times we need
72d352ac68SChris Mason 	 * to write all the delalloc pages to disk, and this list is used
73d352ac68SChris Mason 	 * to walk them all.
74d352ac68SChris Mason 	 */
75ea8c2819SChris Mason 	struct list_head delalloc_inodes;
76ea8c2819SChris Mason 
775d4f98a2SYan Zheng 	/* node for the red-black tree that links inodes in subvolume root */
785d4f98a2SYan Zheng 	struct rb_node rb_node;
795d4f98a2SYan Zheng 
8072ac3c0dSJosef Bacik 	unsigned long runtime_flags;
8172ac3c0dSJosef Bacik 
829c931c5aSNathaniel Yazdani 	/* Keep track of who's O_SYNC/fsyncing currently */
83b812ce28SJosef Bacik 	atomic_t sync_writers;
84b812ce28SJosef Bacik 
85d352ac68SChris Mason 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
86d352ac68SChris Mason 	 * enough field for this.
87d352ac68SChris Mason 	 */
88e02119d5SChris Mason 	u64 generation;
89e02119d5SChris Mason 
9015ee9bc7SJosef Bacik 	/*
9115ee9bc7SJosef Bacik 	 * transid of the trans_handle that last modified this inode
9215ee9bc7SJosef Bacik 	 */
9315ee9bc7SJosef Bacik 	u64 last_trans;
94257c62e1SChris Mason 
95257c62e1SChris Mason 	/*
96e02119d5SChris Mason 	 * transid that last logged this inode
97e02119d5SChris Mason 	 */
98e02119d5SChris Mason 	u64 logged_trans;
9949eb7e46SChris Mason 
100bb14a59bSMiao Xie 	/*
101bb14a59bSMiao Xie 	 * log transid when this inode was last modified
102bb14a59bSMiao Xie 	 */
103bb14a59bSMiao Xie 	int last_sub_trans;
104bb14a59bSMiao Xie 
105bb14a59bSMiao Xie 	/* a local copy of root's last_log_commit */
106bb14a59bSMiao Xie 	int last_log_commit;
107bb14a59bSMiao Xie 
108d352ac68SChris Mason 	/* total number of bytes pending delalloc, used by stat to calc the
109d352ac68SChris Mason 	 * real block usage of the file
110d352ac68SChris Mason 	 */
1119069218dSChris Mason 	u64 delalloc_bytes;
112d352ac68SChris Mason 
113d352ac68SChris Mason 	/*
114a7e3b975SFilipe Manana 	 * Total number of bytes pending delalloc that fall within a file
115a7e3b975SFilipe Manana 	 * range that is either a hole or beyond EOF (and no prealloc extent
116a7e3b975SFilipe Manana 	 * exists in the range). This is always <= delalloc_bytes.
117a7e3b975SFilipe Manana 	 */
118a7e3b975SFilipe Manana 	u64 new_delalloc_bytes;
119a7e3b975SFilipe Manana 
120a7e3b975SFilipe Manana 	/*
12147059d93SWang Shilong 	 * total number of bytes pending defrag, used by stat to check whether
12247059d93SWang Shilong 	 * it needs COW.
12347059d93SWang Shilong 	 */
12447059d93SWang Shilong 	u64 defrag_bytes;
12547059d93SWang Shilong 
12647059d93SWang Shilong 	/*
127d352ac68SChris Mason 	 * the size of the file stored in the metadata on disk.  data=ordered
128d352ac68SChris Mason 	 * means the in-memory i_size might be larger than the size on disk
129d352ac68SChris Mason 	 * because not all the blocks are written yet.
130d352ac68SChris Mason 	 */
131dbe674a9SChris Mason 	u64 disk_i_size;
132d352ac68SChris Mason 
133aec7477bSJosef Bacik 	/*
134aec7477bSJosef Bacik 	 * if this is a directory then index_cnt is the counter for the index
135aec7477bSJosef Bacik 	 * number for new files that are created
136aec7477bSJosef Bacik 	 */
137aec7477bSJosef Bacik 	u64 index_cnt;
138d352ac68SChris Mason 
13967de1176SMiao Xie 	/* Cache the directory index number to speed the dir/file remove */
14067de1176SMiao Xie 	u64 dir_index;
14167de1176SMiao Xie 
14212fcfd22SChris Mason 	/* the fsync log has some corner cases that mean we have to check
14312fcfd22SChris Mason 	 * directories to see if any unlinks have been done before
14412fcfd22SChris Mason 	 * the directory was logged.  See tree-log.c for all the
14512fcfd22SChris Mason 	 * details
14612fcfd22SChris Mason 	 */
14712fcfd22SChris Mason 	u64 last_unlink_trans;
14812fcfd22SChris Mason 
1497709cde3SJosef Bacik 	/*
1507709cde3SJosef Bacik 	 * Number of bytes outstanding that are going to need csums.  This is
1517709cde3SJosef Bacik 	 * used in ENOSPC accounting.
1527709cde3SJosef Bacik 	 */
1537709cde3SJosef Bacik 	u64 csum_bytes;
1547709cde3SJosef Bacik 
155f1bdcc0aSJosef Bacik 	/* flags field from the on disk inode */
156f1bdcc0aSJosef Bacik 	u32 flags;
157f1bdcc0aSJosef Bacik 
1585a3f23d5SChris Mason 	/*
15932c00affSJosef Bacik 	 * Counters to keep track of the number of extent item's we may use due
16032c00affSJosef Bacik 	 * to delalloc and such.  outstanding_extents is the number of extent
16132c00affSJosef Bacik 	 * items we think we'll end up using, and reserved_extents is the number
16232c00affSJosef Bacik 	 * of extent items we've reserved metadata for.
1639ed74f2dSJosef Bacik 	 */
1649e0baf60SJosef Bacik 	unsigned outstanding_extents;
16569fe2d75SJosef Bacik 
16669fe2d75SJosef Bacik 	struct btrfs_block_rsv block_rsv;
1679ed74f2dSJosef Bacik 
1689ed74f2dSJosef Bacik 	/*
169b52aa8c9SDavid Sterba 	 * Cached values of inode properties
1701e701a32SChris Mason 	 */
171b52aa8c9SDavid Sterba 	unsigned prop_compress;		/* per-file compression algorithm */
172eec63c65SDavid Sterba 	/*
173eec63c65SDavid Sterba 	 * Force compression on the file using the defrag ioctl, could be
174eec63c65SDavid Sterba 	 * different from prop_compress and takes precedence if set
175eec63c65SDavid Sterba 	 */
176eec63c65SDavid Sterba 	unsigned defrag_compress;
1771e701a32SChris Mason 
17816cdcec7SMiao Xie 	struct btrfs_delayed_node *delayed_node;
17916cdcec7SMiao Xie 
1809cc97d64Schandan r 	/* File creation time. */
181*d3c6be6fSArnd Bergmann 	struct timespec64 i_otime;
1829cc97d64Schandan r 
1838089fe62SDavid Sterba 	/* Hook into fs_info->delayed_iputs */
1848089fe62SDavid Sterba 	struct list_head delayed_iput;
1858089fe62SDavid Sterba 
1865f9a8a51SFilipe Manana 	/*
1875f9a8a51SFilipe Manana 	 * To avoid races between lockless (i_mutex not held) direct IO writes
1885f9a8a51SFilipe Manana 	 * and concurrent fsync requests. Direct IO writes must acquire read
1895f9a8a51SFilipe Manana 	 * access on this semaphore for creating an extent map and its
1905f9a8a51SFilipe Manana 	 * corresponding ordered extent. The fast fsync path must acquire write
1915f9a8a51SFilipe Manana 	 * access on this semaphore before it collects ordered extents and
1925f9a8a51SFilipe Manana 	 * extent maps.
1935f9a8a51SFilipe Manana 	 */
1945f9a8a51SFilipe Manana 	struct rw_semaphore dio_sem;
1955f9a8a51SFilipe Manana 
196d352ac68SChris Mason 	struct inode vfs_inode;
1972c90e5d6SChris Mason };
198dbe674a9SChris Mason 
19916cdcec7SMiao Xie extern unsigned char btrfs_filetype_table[];
20016cdcec7SMiao Xie 
2019a35b637SJeff Mahoney static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
2022c90e5d6SChris Mason {
2032c90e5d6SChris Mason 	return container_of(inode, struct btrfs_inode, vfs_inode);
2042c90e5d6SChris Mason }
2052c90e5d6SChris Mason 
206778ba82bSFilipe David Borba Manana static inline unsigned long btrfs_inode_hash(u64 objectid,
207778ba82bSFilipe David Borba Manana 					     const struct btrfs_root *root)
208778ba82bSFilipe David Borba Manana {
209778ba82bSFilipe David Borba Manana 	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
210778ba82bSFilipe David Borba Manana 
211778ba82bSFilipe David Borba Manana #if BITS_PER_LONG == 32
212778ba82bSFilipe David Borba Manana 	h = (h >> 32) ^ (h & 0xffffffff);
213778ba82bSFilipe David Borba Manana #endif
214778ba82bSFilipe David Borba Manana 
215778ba82bSFilipe David Borba Manana 	return (unsigned long)h;
216778ba82bSFilipe David Borba Manana }
217778ba82bSFilipe David Borba Manana 
218778ba82bSFilipe David Borba Manana static inline void btrfs_insert_inode_hash(struct inode *inode)
219778ba82bSFilipe David Borba Manana {
220778ba82bSFilipe David Borba Manana 	unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
221778ba82bSFilipe David Borba Manana 
222778ba82bSFilipe David Borba Manana 	__insert_inode_hash(inode, h);
223778ba82bSFilipe David Borba Manana }
224778ba82bSFilipe David Borba Manana 
2259a35b637SJeff Mahoney static inline u64 btrfs_ino(const struct btrfs_inode *inode)
22633345d01SLi Zefan {
2274a0cc7caSNikolay Borisov 	u64 ino = inode->location.objectid;
22833345d01SLi Zefan 
22914c7cca7SLiu Bo 	/*
23014c7cca7SLiu Bo 	 * !ino: btree_inode
23114c7cca7SLiu Bo 	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
23214c7cca7SLiu Bo 	 */
2334a0cc7caSNikolay Borisov 	if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
2344a0cc7caSNikolay Borisov 		ino = inode->vfs_inode.i_ino;
23533345d01SLi Zefan 	return ino;
23633345d01SLi Zefan }
23733345d01SLi Zefan 
2386ef06d27SNikolay Borisov static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
239dbe674a9SChris Mason {
2406ef06d27SNikolay Borisov 	i_size_write(&inode->vfs_inode, size);
2416ef06d27SNikolay Borisov 	inode->disk_i_size = size;
242dbe674a9SChris Mason }
243dbe674a9SChris Mason 
24470ddc553SNikolay Borisov static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
2452cf8572dSChris Mason {
24670ddc553SNikolay Borisov 	struct btrfs_root *root = inode->root;
24783eea1f1SLiu Bo 
24851a8cf9dSLiu Bo 	if (root == root->fs_info->tree_root &&
24970ddc553SNikolay Borisov 	    btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
25051a8cf9dSLiu Bo 		return true;
25170ddc553SNikolay Borisov 	if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
2522cf8572dSChris Mason 		return true;
2532cf8572dSChris Mason 	return false;
2542cf8572dSChris Mason }
2552cf8572dSChris Mason 
2568b62f87bSJosef Bacik static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
2578b62f87bSJosef Bacik 						 int mod)
2588b62f87bSJosef Bacik {
2598b62f87bSJosef Bacik 	lockdep_assert_held(&inode->lock);
2608b62f87bSJosef Bacik 	inode->outstanding_extents += mod;
2618b62f87bSJosef Bacik 	if (btrfs_is_free_space_inode(inode))
2628b62f87bSJosef Bacik 		return;
263dd48d407SJosef Bacik 	trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
264dd48d407SJosef Bacik 						  mod);
2658b62f87bSJosef Bacik }
2668b62f87bSJosef Bacik 
2670f8939b8SNikolay Borisov static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
26822ee6985SJosef Bacik {
2692f2ff0eeSFilipe Manana 	int ret = 0;
2702f2ff0eeSFilipe Manana 
2710f8939b8SNikolay Borisov 	spin_lock(&inode->lock);
2720f8939b8SNikolay Borisov 	if (inode->logged_trans == generation &&
2730f8939b8SNikolay Borisov 	    inode->last_sub_trans <= inode->last_log_commit &&
2740f8939b8SNikolay Borisov 	    inode->last_sub_trans <= inode->root->last_log_commit) {
275125c4cf9SFilipe Manana 		/*
276125c4cf9SFilipe Manana 		 * After a ranged fsync we might have left some extent maps
277125c4cf9SFilipe Manana 		 * (that fall outside the fsync's range). So return false
278125c4cf9SFilipe Manana 		 * here if the list isn't empty, to make sure btrfs_log_inode()
279125c4cf9SFilipe Manana 		 * will be called and process those extent maps.
280125c4cf9SFilipe Manana 		 */
281125c4cf9SFilipe Manana 		smp_mb();
2820f8939b8SNikolay Borisov 		if (list_empty(&inode->extent_tree.modified_extents))
2832f2ff0eeSFilipe Manana 			ret = 1;
284125c4cf9SFilipe Manana 	}
2850f8939b8SNikolay Borisov 	spin_unlock(&inode->lock);
2862f2ff0eeSFilipe Manana 	return ret;
28722ee6985SJosef Bacik }
28822ee6985SJosef Bacik 
289c1dc0896SMiao Xie #define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
290c1dc0896SMiao Xie 
291facc8a22SMiao Xie struct btrfs_dio_private {
292facc8a22SMiao Xie 	struct inode *inode;
293c1dc0896SMiao Xie 	unsigned long flags;
294facc8a22SMiao Xie 	u64 logical_offset;
295facc8a22SMiao Xie 	u64 disk_bytenr;
296facc8a22SMiao Xie 	u64 bytes;
297facc8a22SMiao Xie 	void *private;
298facc8a22SMiao Xie 
299facc8a22SMiao Xie 	/* number of bios pending for this dio */
300facc8a22SMiao Xie 	atomic_t pending_bios;
301facc8a22SMiao Xie 
302facc8a22SMiao Xie 	/* IO errors */
303facc8a22SMiao Xie 	int errors;
304facc8a22SMiao Xie 
305facc8a22SMiao Xie 	/* orig_bio is our btrfs_io_bio */
306facc8a22SMiao Xie 	struct bio *orig_bio;
307facc8a22SMiao Xie 
308facc8a22SMiao Xie 	/* dio_bio came from fs/direct-io.c */
309facc8a22SMiao Xie 	struct bio *dio_bio;
310c1dc0896SMiao Xie 
311c1dc0896SMiao Xie 	/*
31201327610SNicholas D Steeves 	 * The original bio may be split to several sub-bios, this is
313c1dc0896SMiao Xie 	 * done during endio of sub-bios
314c1dc0896SMiao Xie 	 */
3154e4cbee9SChristoph Hellwig 	blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
3164e4cbee9SChristoph Hellwig 			blk_status_t);
317facc8a22SMiao Xie };
318facc8a22SMiao Xie 
3192e60a51eSMiao Xie /*
3202e60a51eSMiao Xie  * Disable DIO read nolock optimization, so new dio readers will be forced
3212e60a51eSMiao Xie  * to grab i_mutex. It is used to avoid the endless truncate due to
3222e60a51eSMiao Xie  * nonlocked dio read.
3232e60a51eSMiao Xie  */
324abcefb1eSNikolay Borisov static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
3252e60a51eSMiao Xie {
326abcefb1eSNikolay Borisov 	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
3272e60a51eSMiao Xie 	smp_mb();
3282e60a51eSMiao Xie }
3292e60a51eSMiao Xie 
3300b581701SNikolay Borisov static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
3312e60a51eSMiao Xie {
3324e857c58SPeter Zijlstra 	smp_mb__before_atomic();
3330b581701SNikolay Borisov 	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
3342e60a51eSMiao Xie }
3352e60a51eSMiao Xie 
3360970a22eSNikolay Borisov static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
3376f6b643eSQu Wenruo 		u64 logical_start, u32 csum, u32 csum_expected, int mirror_num)
3386f6b643eSQu Wenruo {
3390970a22eSNikolay Borisov 	struct btrfs_root *root = inode->root;
3406f6b643eSQu Wenruo 
3416f6b643eSQu Wenruo 	/* Output minus objectid, which is more meaningful */
3426f6b643eSQu Wenruo 	if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
3436f6b643eSQu Wenruo 		btrfs_warn_rl(root->fs_info,
3446f6b643eSQu Wenruo 	"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
3450970a22eSNikolay Borisov 			root->objectid, btrfs_ino(inode),
3466f6b643eSQu Wenruo 			logical_start, csum, csum_expected, mirror_num);
3476f6b643eSQu Wenruo 	else
3486f6b643eSQu Wenruo 		btrfs_warn_rl(root->fs_info,
3496f6b643eSQu Wenruo 	"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
3500970a22eSNikolay Borisov 			root->objectid, btrfs_ino(inode),
3516f6b643eSQu Wenruo 			logical_start, csum, csum_expected, mirror_num);
3526f6b643eSQu Wenruo }
3536f6b643eSQu Wenruo 
3542c90e5d6SChris Mason #endif
355