18f6e39a7SMingming Cao /* 28f6e39a7SMingming Cao * fs/ext4/mballoc.h 38f6e39a7SMingming Cao * 48f6e39a7SMingming Cao * Written by: Alex Tomas <alex@clusterfs.com> 58f6e39a7SMingming Cao * 68f6e39a7SMingming Cao */ 78f6e39a7SMingming Cao #ifndef _EXT4_MBALLOC_H 88f6e39a7SMingming Cao #define _EXT4_MBALLOC_H 98f6e39a7SMingming Cao 108f6e39a7SMingming Cao #include <linux/time.h> 118f6e39a7SMingming Cao #include <linux/fs.h> 128f6e39a7SMingming Cao #include <linux/namei.h> 138f6e39a7SMingming Cao #include <linux/quotaops.h> 148f6e39a7SMingming Cao #include <linux/buffer_head.h> 158f6e39a7SMingming Cao #include <linux/module.h> 168f6e39a7SMingming Cao #include <linux/swap.h> 178f6e39a7SMingming Cao #include <linux/proc_fs.h> 188f6e39a7SMingming Cao #include <linux/pagemap.h> 198f6e39a7SMingming Cao #include <linux/seq_file.h> 208f6e39a7SMingming Cao #include <linux/version.h> 218a0aba73STheodore Ts'o #include <linux/blkdev.h> 22920313a7SAneesh Kumar K.V #include <linux/mutex.h> 238f6e39a7SMingming Cao #include "ext4_jbd2.h" 248f6e39a7SMingming Cao #include "ext4.h" 258f6e39a7SMingming Cao 268f6e39a7SMingming Cao /* 278f6e39a7SMingming Cao * with AGGRESSIVE_CHECK allocator runs consistency checks over 288f6e39a7SMingming Cao * structures. these checks slow things down a lot 298f6e39a7SMingming Cao */ 308f6e39a7SMingming Cao #define AGGRESSIVE_CHECK__ 318f6e39a7SMingming Cao 328f6e39a7SMingming Cao /* 338f6e39a7SMingming Cao * with DOUBLE_CHECK defined mballoc creates persistent in-core 348f6e39a7SMingming Cao * bitmaps, maintains and uses them to check for double allocations 358f6e39a7SMingming Cao */ 368f6e39a7SMingming Cao #define DOUBLE_CHECK__ 378f6e39a7SMingming Cao 388f6e39a7SMingming Cao /* 398f6e39a7SMingming Cao */ 406ba495e9STheodore Ts'o #ifdef CONFIG_EXT4_DEBUG 416ba495e9STheodore Ts'o extern u8 mb_enable_debug; 426ba495e9STheodore Ts'o 436ba495e9STheodore Ts'o #define mb_debug(n, fmt, a...) \ 446ba495e9STheodore Ts'o do { \ 456ba495e9STheodore Ts'o if ((n) <= mb_enable_debug) { \ 466ba495e9STheodore Ts'o printk(KERN_DEBUG "(%s, %d): %s: ", \ 476ba495e9STheodore Ts'o __FILE__, __LINE__, __func__); \ 486ba495e9STheodore Ts'o printk(fmt, ## a); \ 496ba495e9STheodore Ts'o } \ 506ba495e9STheodore Ts'o } while (0) 518f6e39a7SMingming Cao #else 526ba495e9STheodore Ts'o #define mb_debug(n, fmt, a...) 538f6e39a7SMingming Cao #endif 548f6e39a7SMingming Cao 558f6e39a7SMingming Cao /* 568f6e39a7SMingming Cao * with EXT4_MB_HISTORY mballoc stores last N allocations in memory 578f6e39a7SMingming Cao * and you can monitor it in /proc/fs/ext4/<dev>/mb_history 588f6e39a7SMingming Cao */ 598f6e39a7SMingming Cao #define EXT4_MB_HISTORY 608f6e39a7SMingming Cao #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 618f6e39a7SMingming Cao #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 628f6e39a7SMingming Cao #define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ 638f6e39a7SMingming Cao #define EXT4_MB_HISTORY_FREE 8 /* free */ 648f6e39a7SMingming Cao 658f6e39a7SMingming Cao #define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ 668f6e39a7SMingming Cao EXT4_MB_HISTORY_PREALLOC) 678f6e39a7SMingming Cao 688f6e39a7SMingming Cao /* 698f6e39a7SMingming Cao * How long mballoc can look for a best extent (in found extents) 708f6e39a7SMingming Cao */ 718f6e39a7SMingming Cao #define MB_DEFAULT_MAX_TO_SCAN 200 728f6e39a7SMingming Cao 738f6e39a7SMingming Cao /* 748f6e39a7SMingming Cao * How long mballoc must look for a best extent 758f6e39a7SMingming Cao */ 768f6e39a7SMingming Cao #define MB_DEFAULT_MIN_TO_SCAN 10 778f6e39a7SMingming Cao 788f6e39a7SMingming Cao /* 798f6e39a7SMingming Cao * How many groups mballoc will scan looking for the best chunk 808f6e39a7SMingming Cao */ 818f6e39a7SMingming Cao #define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 828f6e39a7SMingming Cao 838f6e39a7SMingming Cao /* 848f6e39a7SMingming Cao * with 'ext4_mb_stats' allocator will collect stats that will be 858f6e39a7SMingming Cao * shown at umount. The collecting costs though! 868f6e39a7SMingming Cao */ 878f6e39a7SMingming Cao #define MB_DEFAULT_STATS 1 888f6e39a7SMingming Cao 898f6e39a7SMingming Cao /* 908f6e39a7SMingming Cao * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 918f6e39a7SMingming Cao * by the stream allocator, which purpose is to pack requests 928f6e39a7SMingming Cao * as close each to other as possible to produce smooth I/O traffic 938f6e39a7SMingming Cao * We use locality group prealloc space for stream request. 948f6e39a7SMingming Cao * We can tune the same via /proc/fs/ext4/<parition>/stream_req 958f6e39a7SMingming Cao */ 968f6e39a7SMingming Cao #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ 978f6e39a7SMingming Cao 988f6e39a7SMingming Cao /* 998f6e39a7SMingming Cao * for which requests use 2^N search using buddies 1008f6e39a7SMingming Cao */ 1018f6e39a7SMingming Cao #define MB_DEFAULT_ORDER2_REQS 2 1028f6e39a7SMingming Cao 1038f6e39a7SMingming Cao /* 1048f6e39a7SMingming Cao * default group prealloc size 512 blocks 1058f6e39a7SMingming Cao */ 1068f6e39a7SMingming Cao #define MB_DEFAULT_GROUP_PREALLOC 512 1078f6e39a7SMingming Cao 1088f6e39a7SMingming Cao 109c894058dSAneesh Kumar K.V struct ext4_free_data { 110c894058dSAneesh Kumar K.V /* this links the free block information from group_info */ 111c894058dSAneesh Kumar K.V struct rb_node node; 1128f6e39a7SMingming Cao 113c894058dSAneesh Kumar K.V /* this links the free block information from ext4_sb_info */ 1148f6e39a7SMingming Cao struct list_head list; 115c894058dSAneesh Kumar K.V 116c894058dSAneesh Kumar K.V /* group which free block extent belongs */ 117c894058dSAneesh Kumar K.V ext4_group_t group; 118c894058dSAneesh Kumar K.V 119c894058dSAneesh Kumar K.V /* free block extent */ 120c894058dSAneesh Kumar K.V ext4_grpblk_t start_blk; 121c894058dSAneesh Kumar K.V ext4_grpblk_t count; 122c894058dSAneesh Kumar K.V 123c894058dSAneesh Kumar K.V /* transaction which freed this extent */ 124c894058dSAneesh Kumar K.V tid_t t_tid; 1258f6e39a7SMingming Cao }; 1268f6e39a7SMingming Cao 1278f6e39a7SMingming Cao struct ext4_prealloc_space { 1288f6e39a7SMingming Cao struct list_head pa_inode_list; 1298f6e39a7SMingming Cao struct list_head pa_group_list; 1308f6e39a7SMingming Cao union { 1318f6e39a7SMingming Cao struct list_head pa_tmp_list; 1328f6e39a7SMingming Cao struct rcu_head pa_rcu; 1338f6e39a7SMingming Cao } u; 1348f6e39a7SMingming Cao spinlock_t pa_lock; 1358f6e39a7SMingming Cao atomic_t pa_count; 1368f6e39a7SMingming Cao unsigned pa_deleted; 1378f6e39a7SMingming Cao ext4_fsblk_t pa_pstart; /* phys. block */ 1388f6e39a7SMingming Cao ext4_lblk_t pa_lstart; /* log. block */ 1398f6e39a7SMingming Cao unsigned short pa_len; /* len of preallocated chunk */ 1408f6e39a7SMingming Cao unsigned short pa_free; /* how many blocks are free */ 141cc0fb9adSAneesh Kumar K.V unsigned short pa_type; /* pa type. inode or group */ 1428f6e39a7SMingming Cao spinlock_t *pa_obj_lock; 1438f6e39a7SMingming Cao struct inode *pa_inode; /* hack, for history only */ 1448f6e39a7SMingming Cao }; 1458f6e39a7SMingming Cao 146cc0fb9adSAneesh Kumar K.V enum { 147cc0fb9adSAneesh Kumar K.V MB_INODE_PA = 0, 148cc0fb9adSAneesh Kumar K.V MB_GROUP_PA = 1 149cc0fb9adSAneesh Kumar K.V }; 1508f6e39a7SMingming Cao 1518f6e39a7SMingming Cao struct ext4_free_extent { 1528f6e39a7SMingming Cao ext4_lblk_t fe_logical; 1538f6e39a7SMingming Cao ext4_grpblk_t fe_start; 1548f6e39a7SMingming Cao ext4_group_t fe_group; 1558f6e39a7SMingming Cao int fe_len; 1568f6e39a7SMingming Cao }; 1578f6e39a7SMingming Cao 1588f6e39a7SMingming Cao /* 1598f6e39a7SMingming Cao * Locality group: 1608f6e39a7SMingming Cao * we try to group all related changes together 1618f6e39a7SMingming Cao * so that writeback can flush/allocate them together as well 1626be2ded1SAneesh Kumar K.V * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC 1636be2ded1SAneesh Kumar K.V * (512). We store prealloc space into the hash based on the pa_free blocks 1646be2ded1SAneesh Kumar K.V * order value.ie, fls(pa_free)-1; 1658f6e39a7SMingming Cao */ 1666be2ded1SAneesh Kumar K.V #define PREALLOC_TB_SIZE 10 1678f6e39a7SMingming Cao struct ext4_locality_group { 1688f6e39a7SMingming Cao /* for allocator */ 1696be2ded1SAneesh Kumar K.V /* to serialize allocates */ 1706be2ded1SAneesh Kumar K.V struct mutex lg_mutex; 1716be2ded1SAneesh Kumar K.V /* list of preallocations */ 1726be2ded1SAneesh Kumar K.V struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; 1738f6e39a7SMingming Cao spinlock_t lg_prealloc_lock; 1748f6e39a7SMingming Cao }; 1758f6e39a7SMingming Cao 1768f6e39a7SMingming Cao struct ext4_allocation_context { 1778f6e39a7SMingming Cao struct inode *ac_inode; 1788f6e39a7SMingming Cao struct super_block *ac_sb; 1798f6e39a7SMingming Cao 1808f6e39a7SMingming Cao /* original request */ 1818f6e39a7SMingming Cao struct ext4_free_extent ac_o_ex; 1828f6e39a7SMingming Cao 1838f6e39a7SMingming Cao /* goal request (after normalization) */ 1848f6e39a7SMingming Cao struct ext4_free_extent ac_g_ex; 1858f6e39a7SMingming Cao 1868f6e39a7SMingming Cao /* the best found extent */ 1878f6e39a7SMingming Cao struct ext4_free_extent ac_b_ex; 1888f6e39a7SMingming Cao 1898f6e39a7SMingming Cao /* copy of the bext found extent taken before preallocation efforts */ 1908f6e39a7SMingming Cao struct ext4_free_extent ac_f_ex; 1918f6e39a7SMingming Cao 1928f6e39a7SMingming Cao /* number of iterations done. we have to track to limit searching */ 1938f6e39a7SMingming Cao unsigned long ac_ex_scanned; 1948f6e39a7SMingming Cao __u16 ac_groups_scanned; 1958f6e39a7SMingming Cao __u16 ac_found; 1968f6e39a7SMingming Cao __u16 ac_tail; 1978f6e39a7SMingming Cao __u16 ac_buddy; 1988f6e39a7SMingming Cao __u16 ac_flags; /* allocation hints */ 1998f6e39a7SMingming Cao __u8 ac_status; 2008f6e39a7SMingming Cao __u8 ac_criteria; 2018f6e39a7SMingming Cao __u8 ac_repeats; 2028f6e39a7SMingming Cao __u8 ac_2order; /* if request is to allocate 2^N blocks and 2038f6e39a7SMingming Cao * N > 0, the field stores N, otherwise 0 */ 2048f6e39a7SMingming Cao __u8 ac_op; /* operation, for history only */ 2058f6e39a7SMingming Cao struct page *ac_bitmap_page; 2068f6e39a7SMingming Cao struct page *ac_buddy_page; 2078556e8f3SAneesh Kumar K.V /* 2088556e8f3SAneesh Kumar K.V * pointer to the held semaphore upon successful 2098556e8f3SAneesh Kumar K.V * block allocation 2108556e8f3SAneesh Kumar K.V */ 2118556e8f3SAneesh Kumar K.V struct rw_semaphore *alloc_semp; 2128f6e39a7SMingming Cao struct ext4_prealloc_space *ac_pa; 2138f6e39a7SMingming Cao struct ext4_locality_group *ac_lg; 2148f6e39a7SMingming Cao }; 2158f6e39a7SMingming Cao 2168f6e39a7SMingming Cao #define AC_STATUS_CONTINUE 1 2178f6e39a7SMingming Cao #define AC_STATUS_FOUND 2 2188f6e39a7SMingming Cao #define AC_STATUS_BREAK 3 2198f6e39a7SMingming Cao 2208f6e39a7SMingming Cao struct ext4_mb_history { 2218f6e39a7SMingming Cao struct ext4_free_extent orig; /* orig allocation */ 2228f6e39a7SMingming Cao struct ext4_free_extent goal; /* goal allocation */ 2238f6e39a7SMingming Cao struct ext4_free_extent result; /* result allocation */ 2248f6e39a7SMingming Cao unsigned pid; 2258f6e39a7SMingming Cao unsigned ino; 2268f6e39a7SMingming Cao __u16 found; /* how many extents have been found */ 2278f6e39a7SMingming Cao __u16 groups; /* how many groups have been scanned */ 2288f6e39a7SMingming Cao __u16 tail; /* what tail broke some buddy */ 2298f6e39a7SMingming Cao __u16 buddy; /* buddy the tail ^^^ broke */ 2308f6e39a7SMingming Cao __u16 flags; 2318f6e39a7SMingming Cao __u8 cr:3; /* which phase the result extent was found at */ 2328f6e39a7SMingming Cao __u8 op:4; 2338f6e39a7SMingming Cao __u8 merged:1; 2348f6e39a7SMingming Cao }; 2358f6e39a7SMingming Cao 2368f6e39a7SMingming Cao struct ext4_buddy { 2378f6e39a7SMingming Cao struct page *bd_buddy_page; 2388f6e39a7SMingming Cao void *bd_buddy; 2398f6e39a7SMingming Cao struct page *bd_bitmap_page; 2408f6e39a7SMingming Cao void *bd_bitmap; 2418f6e39a7SMingming Cao struct ext4_group_info *bd_info; 2428f6e39a7SMingming Cao struct super_block *bd_sb; 2438f6e39a7SMingming Cao __u16 bd_blkbits; 2448f6e39a7SMingming Cao ext4_group_t bd_group; 245920313a7SAneesh Kumar K.V struct rw_semaphore *alloc_semp; 2468f6e39a7SMingming Cao }; 2478f6e39a7SMingming Cao #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 2488f6e39a7SMingming Cao #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 2498f6e39a7SMingming Cao 2508f6e39a7SMingming Cao #ifndef EXT4_MB_HISTORY 2518f6e39a7SMingming Cao static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) 2528f6e39a7SMingming Cao { 2538f6e39a7SMingming Cao return; 2548f6e39a7SMingming Cao } 2558f6e39a7SMingming Cao #endif 2568f6e39a7SMingming Cao 2578f6e39a7SMingming Cao #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2588f6e39a7SMingming Cao 259c3a326a6SAneesh Kumar K.V static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 2608f6e39a7SMingming Cao struct ext4_free_extent *fex) 2618f6e39a7SMingming Cao { 2628f6e39a7SMingming Cao ext4_fsblk_t block; 2638f6e39a7SMingming Cao 2648f6e39a7SMingming Cao block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2658f6e39a7SMingming Cao + fex->fe_start 2668f6e39a7SMingming Cao + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2678f6e39a7SMingming Cao return block; 2688f6e39a7SMingming Cao } 2698f6e39a7SMingming Cao #endif 270