xref: /openbmc/linux/fs/ext4/mballoc.h (revision 6ba495e9)
18f6e39a7SMingming Cao /*
28f6e39a7SMingming Cao  *  fs/ext4/mballoc.h
38f6e39a7SMingming Cao  *
48f6e39a7SMingming Cao  *  Written by: Alex Tomas <alex@clusterfs.com>
58f6e39a7SMingming Cao  *
68f6e39a7SMingming Cao  */
78f6e39a7SMingming Cao #ifndef _EXT4_MBALLOC_H
88f6e39a7SMingming Cao #define _EXT4_MBALLOC_H
98f6e39a7SMingming Cao 
108f6e39a7SMingming Cao #include <linux/time.h>
118f6e39a7SMingming Cao #include <linux/fs.h>
128f6e39a7SMingming Cao #include <linux/namei.h>
138f6e39a7SMingming Cao #include <linux/quotaops.h>
148f6e39a7SMingming Cao #include <linux/buffer_head.h>
158f6e39a7SMingming Cao #include <linux/module.h>
168f6e39a7SMingming Cao #include <linux/swap.h>
178f6e39a7SMingming Cao #include <linux/proc_fs.h>
188f6e39a7SMingming Cao #include <linux/pagemap.h>
198f6e39a7SMingming Cao #include <linux/seq_file.h>
208f6e39a7SMingming Cao #include <linux/version.h>
218a0aba73STheodore Ts'o #include <linux/blkdev.h>
22920313a7SAneesh Kumar K.V #include <linux/mutex.h>
238f6e39a7SMingming Cao #include "ext4_jbd2.h"
248f6e39a7SMingming Cao #include "ext4.h"
258f6e39a7SMingming Cao 
268f6e39a7SMingming Cao /*
278f6e39a7SMingming Cao  * with AGGRESSIVE_CHECK allocator runs consistency checks over
288f6e39a7SMingming Cao  * structures. these checks slow things down a lot
298f6e39a7SMingming Cao  */
308f6e39a7SMingming Cao #define AGGRESSIVE_CHECK__
318f6e39a7SMingming Cao 
328f6e39a7SMingming Cao /*
338f6e39a7SMingming Cao  * with DOUBLE_CHECK defined mballoc creates persistent in-core
348f6e39a7SMingming Cao  * bitmaps, maintains and uses them to check for double allocations
358f6e39a7SMingming Cao  */
368f6e39a7SMingming Cao #define DOUBLE_CHECK__
378f6e39a7SMingming Cao 
388f6e39a7SMingming Cao /*
398f6e39a7SMingming Cao  */
406ba495e9STheodore Ts'o #ifdef CONFIG_EXT4_DEBUG
416ba495e9STheodore Ts'o extern u8 mb_enable_debug;
426ba495e9STheodore Ts'o 
436ba495e9STheodore Ts'o #define mb_debug(n, fmt, a...)	                                        \
446ba495e9STheodore Ts'o 	do {								\
456ba495e9STheodore Ts'o 		if ((n) <= mb_enable_debug) {		        	\
466ba495e9STheodore Ts'o 			printk(KERN_DEBUG "(%s, %d): %s: ",		\
476ba495e9STheodore Ts'o 			       __FILE__, __LINE__, __func__);		\
486ba495e9STheodore Ts'o 			printk(fmt, ## a);				\
496ba495e9STheodore Ts'o 		}							\
506ba495e9STheodore Ts'o 	} while (0)
518f6e39a7SMingming Cao #else
526ba495e9STheodore Ts'o #define mb_debug(n, fmt, a...)
538f6e39a7SMingming Cao #endif
548f6e39a7SMingming Cao 
558f6e39a7SMingming Cao /*
568f6e39a7SMingming Cao  * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
578f6e39a7SMingming Cao  * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
588f6e39a7SMingming Cao  */
598f6e39a7SMingming Cao #define EXT4_MB_HISTORY
608f6e39a7SMingming Cao #define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
618f6e39a7SMingming Cao #define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
628f6e39a7SMingming Cao #define EXT4_MB_HISTORY_DISCARD		4	/* preallocation discarded */
638f6e39a7SMingming Cao #define EXT4_MB_HISTORY_FREE		8	/* free */
648f6e39a7SMingming Cao 
658f6e39a7SMingming Cao #define EXT4_MB_HISTORY_DEFAULT		(EXT4_MB_HISTORY_ALLOC | \
668f6e39a7SMingming Cao 					 EXT4_MB_HISTORY_PREALLOC)
678f6e39a7SMingming Cao 
688f6e39a7SMingming Cao /*
698f6e39a7SMingming Cao  * How long mballoc can look for a best extent (in found extents)
708f6e39a7SMingming Cao  */
718f6e39a7SMingming Cao #define MB_DEFAULT_MAX_TO_SCAN		200
728f6e39a7SMingming Cao 
738f6e39a7SMingming Cao /*
748f6e39a7SMingming Cao  * How long mballoc must look for a best extent
758f6e39a7SMingming Cao  */
768f6e39a7SMingming Cao #define MB_DEFAULT_MIN_TO_SCAN		10
778f6e39a7SMingming Cao 
788f6e39a7SMingming Cao /*
798f6e39a7SMingming Cao  * How many groups mballoc will scan looking for the best chunk
808f6e39a7SMingming Cao  */
818f6e39a7SMingming Cao #define MB_DEFAULT_MAX_GROUPS_TO_SCAN	5
828f6e39a7SMingming Cao 
838f6e39a7SMingming Cao /*
848f6e39a7SMingming Cao  * with 'ext4_mb_stats' allocator will collect stats that will be
858f6e39a7SMingming Cao  * shown at umount. The collecting costs though!
868f6e39a7SMingming Cao  */
878f6e39a7SMingming Cao #define MB_DEFAULT_STATS		1
888f6e39a7SMingming Cao 
898f6e39a7SMingming Cao /*
908f6e39a7SMingming Cao  * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
918f6e39a7SMingming Cao  * by the stream allocator, which purpose is to pack requests
928f6e39a7SMingming Cao  * as close each to other as possible to produce smooth I/O traffic
938f6e39a7SMingming Cao  * We use locality group prealloc space for stream request.
948f6e39a7SMingming Cao  * We can tune the same via /proc/fs/ext4/<parition>/stream_req
958f6e39a7SMingming Cao  */
968f6e39a7SMingming Cao #define MB_DEFAULT_STREAM_THRESHOLD	16	/* 64K */
978f6e39a7SMingming Cao 
988f6e39a7SMingming Cao /*
998f6e39a7SMingming Cao  * for which requests use 2^N search using buddies
1008f6e39a7SMingming Cao  */
1018f6e39a7SMingming Cao #define MB_DEFAULT_ORDER2_REQS		2
1028f6e39a7SMingming Cao 
1038f6e39a7SMingming Cao /*
1048f6e39a7SMingming Cao  * default group prealloc size 512 blocks
1058f6e39a7SMingming Cao  */
1068f6e39a7SMingming Cao #define MB_DEFAULT_GROUP_PREALLOC	512
1078f6e39a7SMingming Cao 
1088f6e39a7SMingming Cao 
109c894058dSAneesh Kumar K.V struct ext4_free_data {
110c894058dSAneesh Kumar K.V 	/* this links the free block information from group_info */
111c894058dSAneesh Kumar K.V 	struct rb_node node;
1128f6e39a7SMingming Cao 
113c894058dSAneesh Kumar K.V 	/* this links the free block information from ext4_sb_info */
1148f6e39a7SMingming Cao 	struct list_head list;
115c894058dSAneesh Kumar K.V 
116c894058dSAneesh Kumar K.V 	/* group which free block extent belongs */
117c894058dSAneesh Kumar K.V 	ext4_group_t group;
118c894058dSAneesh Kumar K.V 
119c894058dSAneesh Kumar K.V 	/* free block extent */
120c894058dSAneesh Kumar K.V 	ext4_grpblk_t start_blk;
121c894058dSAneesh Kumar K.V 	ext4_grpblk_t count;
122c894058dSAneesh Kumar K.V 
123c894058dSAneesh Kumar K.V 	/* transaction which freed this extent */
124c894058dSAneesh Kumar K.V 	tid_t	t_tid;
1258f6e39a7SMingming Cao };
1268f6e39a7SMingming Cao 
1278f6e39a7SMingming Cao struct ext4_prealloc_space {
1288f6e39a7SMingming Cao 	struct list_head	pa_inode_list;
1298f6e39a7SMingming Cao 	struct list_head	pa_group_list;
1308f6e39a7SMingming Cao 	union {
1318f6e39a7SMingming Cao 		struct list_head pa_tmp_list;
1328f6e39a7SMingming Cao 		struct rcu_head	pa_rcu;
1338f6e39a7SMingming Cao 	} u;
1348f6e39a7SMingming Cao 	spinlock_t		pa_lock;
1358f6e39a7SMingming Cao 	atomic_t		pa_count;
1368f6e39a7SMingming Cao 	unsigned		pa_deleted;
1378f6e39a7SMingming Cao 	ext4_fsblk_t		pa_pstart;	/* phys. block */
1388f6e39a7SMingming Cao 	ext4_lblk_t		pa_lstart;	/* log. block */
1398f6e39a7SMingming Cao 	unsigned short		pa_len;		/* len of preallocated chunk */
1408f6e39a7SMingming Cao 	unsigned short		pa_free;	/* how many blocks are free */
141cc0fb9adSAneesh Kumar K.V 	unsigned short		pa_type;	/* pa type. inode or group */
1428f6e39a7SMingming Cao 	spinlock_t		*pa_obj_lock;
1438f6e39a7SMingming Cao 	struct inode		*pa_inode;	/* hack, for history only */
1448f6e39a7SMingming Cao };
1458f6e39a7SMingming Cao 
146cc0fb9adSAneesh Kumar K.V enum {
147cc0fb9adSAneesh Kumar K.V 	MB_INODE_PA = 0,
148cc0fb9adSAneesh Kumar K.V 	MB_GROUP_PA = 1
149cc0fb9adSAneesh Kumar K.V };
1508f6e39a7SMingming Cao 
1518f6e39a7SMingming Cao struct ext4_free_extent {
1528f6e39a7SMingming Cao 	ext4_lblk_t fe_logical;
1538f6e39a7SMingming Cao 	ext4_grpblk_t fe_start;
1548f6e39a7SMingming Cao 	ext4_group_t fe_group;
1558f6e39a7SMingming Cao 	int fe_len;
1568f6e39a7SMingming Cao };
1578f6e39a7SMingming Cao 
1588f6e39a7SMingming Cao /*
1598f6e39a7SMingming Cao  * Locality group:
1608f6e39a7SMingming Cao  *   we try to group all related changes together
1618f6e39a7SMingming Cao  *   so that writeback can flush/allocate them together as well
1626be2ded1SAneesh Kumar K.V  *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
1636be2ded1SAneesh Kumar K.V  *   (512). We store prealloc space into the hash based on the pa_free blocks
1646be2ded1SAneesh Kumar K.V  *   order value.ie, fls(pa_free)-1;
1658f6e39a7SMingming Cao  */
1666be2ded1SAneesh Kumar K.V #define PREALLOC_TB_SIZE 10
1678f6e39a7SMingming Cao struct ext4_locality_group {
1688f6e39a7SMingming Cao 	/* for allocator */
1696be2ded1SAneesh Kumar K.V 	/* to serialize allocates */
1706be2ded1SAneesh Kumar K.V 	struct mutex		lg_mutex;
1716be2ded1SAneesh Kumar K.V 	/* list of preallocations */
1726be2ded1SAneesh Kumar K.V 	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
1738f6e39a7SMingming Cao 	spinlock_t		lg_prealloc_lock;
1748f6e39a7SMingming Cao };
1758f6e39a7SMingming Cao 
1768f6e39a7SMingming Cao struct ext4_allocation_context {
1778f6e39a7SMingming Cao 	struct inode *ac_inode;
1788f6e39a7SMingming Cao 	struct super_block *ac_sb;
1798f6e39a7SMingming Cao 
1808f6e39a7SMingming Cao 	/* original request */
1818f6e39a7SMingming Cao 	struct ext4_free_extent ac_o_ex;
1828f6e39a7SMingming Cao 
1838f6e39a7SMingming Cao 	/* goal request (after normalization) */
1848f6e39a7SMingming Cao 	struct ext4_free_extent ac_g_ex;
1858f6e39a7SMingming Cao 
1868f6e39a7SMingming Cao 	/* the best found extent */
1878f6e39a7SMingming Cao 	struct ext4_free_extent ac_b_ex;
1888f6e39a7SMingming Cao 
1898f6e39a7SMingming Cao 	/* copy of the bext found extent taken before preallocation efforts */
1908f6e39a7SMingming Cao 	struct ext4_free_extent ac_f_ex;
1918f6e39a7SMingming Cao 
1928f6e39a7SMingming Cao 	/* number of iterations done. we have to track to limit searching */
1938f6e39a7SMingming Cao 	unsigned long ac_ex_scanned;
1948f6e39a7SMingming Cao 	__u16 ac_groups_scanned;
1958f6e39a7SMingming Cao 	__u16 ac_found;
1968f6e39a7SMingming Cao 	__u16 ac_tail;
1978f6e39a7SMingming Cao 	__u16 ac_buddy;
1988f6e39a7SMingming Cao 	__u16 ac_flags;		/* allocation hints */
1998f6e39a7SMingming Cao 	__u8 ac_status;
2008f6e39a7SMingming Cao 	__u8 ac_criteria;
2018f6e39a7SMingming Cao 	__u8 ac_repeats;
2028f6e39a7SMingming Cao 	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
2038f6e39a7SMingming Cao 				 * N > 0, the field stores N, otherwise 0 */
2048f6e39a7SMingming Cao 	__u8 ac_op;		/* operation, for history only */
2058f6e39a7SMingming Cao 	struct page *ac_bitmap_page;
2068f6e39a7SMingming Cao 	struct page *ac_buddy_page;
2078556e8f3SAneesh Kumar K.V 	/*
2088556e8f3SAneesh Kumar K.V 	 * pointer to the held semaphore upon successful
2098556e8f3SAneesh Kumar K.V 	 * block allocation
2108556e8f3SAneesh Kumar K.V 	 */
2118556e8f3SAneesh Kumar K.V 	struct rw_semaphore *alloc_semp;
2128f6e39a7SMingming Cao 	struct ext4_prealloc_space *ac_pa;
2138f6e39a7SMingming Cao 	struct ext4_locality_group *ac_lg;
2148f6e39a7SMingming Cao };
2158f6e39a7SMingming Cao 
2168f6e39a7SMingming Cao #define AC_STATUS_CONTINUE	1
2178f6e39a7SMingming Cao #define AC_STATUS_FOUND		2
2188f6e39a7SMingming Cao #define AC_STATUS_BREAK		3
2198f6e39a7SMingming Cao 
2208f6e39a7SMingming Cao struct ext4_mb_history {
2218f6e39a7SMingming Cao 	struct ext4_free_extent orig;	/* orig allocation */
2228f6e39a7SMingming Cao 	struct ext4_free_extent goal;	/* goal allocation */
2238f6e39a7SMingming Cao 	struct ext4_free_extent result;	/* result allocation */
2248f6e39a7SMingming Cao 	unsigned pid;
2258f6e39a7SMingming Cao 	unsigned ino;
2268f6e39a7SMingming Cao 	__u16 found;	/* how many extents have been found */
2278f6e39a7SMingming Cao 	__u16 groups;	/* how many groups have been scanned */
2288f6e39a7SMingming Cao 	__u16 tail;	/* what tail broke some buddy */
2298f6e39a7SMingming Cao 	__u16 buddy;	/* buddy the tail ^^^ broke */
2308f6e39a7SMingming Cao 	__u16 flags;
2318f6e39a7SMingming Cao 	__u8 cr:3;	/* which phase the result extent was found at */
2328f6e39a7SMingming Cao 	__u8 op:4;
2338f6e39a7SMingming Cao 	__u8 merged:1;
2348f6e39a7SMingming Cao };
2358f6e39a7SMingming Cao 
2368f6e39a7SMingming Cao struct ext4_buddy {
2378f6e39a7SMingming Cao 	struct page *bd_buddy_page;
2388f6e39a7SMingming Cao 	void *bd_buddy;
2398f6e39a7SMingming Cao 	struct page *bd_bitmap_page;
2408f6e39a7SMingming Cao 	void *bd_bitmap;
2418f6e39a7SMingming Cao 	struct ext4_group_info *bd_info;
2428f6e39a7SMingming Cao 	struct super_block *bd_sb;
2438f6e39a7SMingming Cao 	__u16 bd_blkbits;
2448f6e39a7SMingming Cao 	ext4_group_t bd_group;
245920313a7SAneesh Kumar K.V 	struct rw_semaphore *alloc_semp;
2468f6e39a7SMingming Cao };
2478f6e39a7SMingming Cao #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
2488f6e39a7SMingming Cao #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
2498f6e39a7SMingming Cao 
2508f6e39a7SMingming Cao #ifndef EXT4_MB_HISTORY
2518f6e39a7SMingming Cao static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
2528f6e39a7SMingming Cao {
2538f6e39a7SMingming Cao 	return;
2548f6e39a7SMingming Cao }
2558f6e39a7SMingming Cao #endif
2568f6e39a7SMingming Cao 
2578f6e39a7SMingming Cao #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
2588f6e39a7SMingming Cao 
259c3a326a6SAneesh Kumar K.V static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2608f6e39a7SMingming Cao 					struct ext4_free_extent *fex)
2618f6e39a7SMingming Cao {
2628f6e39a7SMingming Cao 	ext4_fsblk_t block;
2638f6e39a7SMingming Cao 
2648f6e39a7SMingming Cao 	block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
2658f6e39a7SMingming Cao 			+ fex->fe_start
2668f6e39a7SMingming Cao 			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2678f6e39a7SMingming Cao 	return block;
2688f6e39a7SMingming Cao }
2698f6e39a7SMingming Cao #endif
270