// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * An allocation request involves a request for multiple blocks near to the
 * specified goal block.
 *
 * During initialization phase of the allocator we decide to use the
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever
 * is larger. If the size is less than sbi->s_mb_stream_request we
 * select to use the group preallocation. The default value of
 * s_mb_stream_request is 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is to
 * ensure that we have small files closer together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains a list of prealloc
 * spaces for this particular inode. The inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   -> free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is used looking at the _logical_ start
 * block. Only if the logical file block falls within the range of the
 * prealloc space do we consume that prealloc space. This makes sure that
 * we have contiguous physical blocks representing the file blocks
 *
 * The important thing to be noted in case of inode prealloc space is that
 * we don't modify the values associated with inode prealloc space except
 * pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. This is a per-CPU prealloc list, represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the
 * contention between CPUs. It is possible to get scheduled at this point.
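 *
 * As an illustrative example of this size cutoff: with the default
 * s_mb_stream_request of 16 blocks, an allocation that leaves the file at
 * 12 blocks stays below the threshold and draws from the per-CPU locality
 * group preallocation, while one that grows the file to 64 blocks switches
 * to inode preallocation.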
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 * mapped to the buddy and bitmap information regarding different
 * groups. The buddy information is attached to buddy cache inode so that
 * we can access them through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy. The information involves
 * block bitmap and buddy information, stored in the inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information. So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 * blocksize) blocks.  So it can have information regarding groups_per_page
 * which is blocks_per_page/2
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available
 *
 * Before allocating blocks via buddy cache we normalize the request
 * blocks. This ensures we ask for more blocks than we need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with -o
 * stripe=<value> option the group prealloc request is normalized to the
 * smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
 * structures in two data structures:
 *
 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
 *
 *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
 *
 *    This is an array of lists where the index in the array represents the
 *    largest free order in the buddy bitmap of the participating group infos of
 *    that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
 *    number of buddy bitmap orders possible) number of lists. Group-infos are
 *    placed in appropriate lists.
 *
 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
 *
 *    Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
 *
 *    This is an array of lists where in the i-th list there are groups with
 *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
 *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
 *    Note that we don't bother with a special list for completely empty groups
 *    so we only have MB_NUM_ORDERS(sb) lists.
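 *
 *    For example (illustrative): a group with bb_free = 300 clusters spread
 *    across bb_fragments = 10 free extents has an average fragment size of
 *    30 clusters, and is therefore filed on the list covering average
 *    fragment sizes in [16, 32).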
 *
 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
 * structures to decide the order in which groups are to be traversed for
 * fulfilling an allocation request.
 *
 * At CR_POWER2_ALIGNED, we look for groups which have the largest_free_order
 * >= the order of the request. We directly look at the largest free order list
 * in the data structure (1) above where largest_free_order = order of the
 * request. If that list is empty, we look at the remaining lists in the
 * increasing order of largest_free_order. This allows us to perform the
 * CR_POWER2_ALIGNED lookup in O(1) time.
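 *
 * As an illustrative example of the CR_POWER2_ALIGNED lookup: a request for
 * 16 clusters is order 4, so the scan starts at
 * sbi->s_mb_largest_free_orders[4] and, only if that list is empty, moves
 * on to lists 5, 6, ... up to MB_NUM_ORDERS(sb) - 1.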
 *
 * At CR_GOAL_LEN_FAST, we only consider groups where
 * average fragment size > request size. So, we lookup a group which has average
 * fragment size just above or equal to request size using our average fragment
 * size group lists (data structure 2) in O(1) time.
 *
 * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
 * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
 * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
 * fragment size > goal length. So before falling back to the slower
 * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
 * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
 * enough average fragment size. This increases the chances of finding a
 * suitable block group in O(1) time and results in faster allocation at the
 * cost of reduced size of allocation.
 *
 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
 * CR_GOAL_LEN_FAST phase.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 * /sys/fs/ext4/<partition>/mb_linear_limit
 *
 * The regular allocator uses buddy scan only if the request len is a power of
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunables min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicates how long mballoc __must__ look for a best
 * extent and max_to_scan indicates how long mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
 * get traversed linearly. That may result in subsequent allocations being not
 * close to each other. And so, the underlying device may get filled up in a
 * non-linear fashion. While that may not matter on non-rotational devices, for
 * rotational devices that may result in higher seek times. "mb_linear_limit"
 * tells mballoc how many groups mballoc should search linearly before
 * consulting the above data structures for more efficient lookups. For
 * non-rotational devices, this value defaults to 0 and for rotational devices
 * this is set to MB_DEFAULT_LINEAR_LIMIT.
 *
 * Both types of prealloc space are populated as described above. So for the
 * first request we will hit the buddy cache which will result in this prealloc
 * space getting filled. The prealloc space is then later used for the
 * subsequent request.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to specific inode and can be used for this inode only.
 *    it describes part of inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks number of blocks left
 *    unused. so, before taking some block from descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to specific locality group which does not translate to
 *    permanent set of inodes: inode can join and leave group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count number of
 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA		on-disk += N; PA -= N
 *  - discard locality group PA		buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 *        is used in real operation because we can't know actual used
 *        bits from PA, only from on-disk bitmap
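 *
 *  worked example (illustrative numbers): an inode PA of N = 8 starts as
 *  buddy += 8, PA = 8. after 3 of its blocks are consumed: on-disk += 3,
 *  PA = 5. discarding it then releases the 5 blocks that were never
 *  written back to the buddy, with the on-disk bitmap consulted to learn
 *  which bits are actually in use.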
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if buddy is referenced, it's already initialized
 *  2) while block is used in buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 *     on-disk bitmap if buddy has same bit set and/or PA covers the
 *     corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA is referenced until its blocks are marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness. no need to modify
 * the buddy in this case, but we should care about concurrent init.
 *
 */

/*
 * Logic in a few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group		(group)
 *  - object (inode/locality)		(object)
 *  - per-pa lock			(pa)
 *  - cr_power2_aligned lists lock	(cr_power2_aligned)
 *  - cr_goal_len_fast lists lock	(cr_goal_len_fast)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *    pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *    pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *    pa
 *    group
 *    object
 *
 *  - allocation path (ext4_mb_regular_allocator)
 *    group
 *    cr_power2_aligned/cr_goal_len_fast
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
						ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
			       ext4_group_t group, enum criteria cr);

static int ext4_try_to_trim_range(struct super_block *sb,
		struct ext4_buddy *e4b, ext4_grpblk_t start,
		ext4_grpblk_t max, ext4_grpblk_t minblocks);

/*
 * The algorithm using this percpu seq counter goes below:
 * 1. We sample the percpu discard_pa_seq counter before trying for block
 *    allocation in ext4_mb_new_blocks().
 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 *    or free these blocks i.e. while marking those blocks as used/free in
 *    mb_mark_used()/mb_free_blocks().
 * 3. We also increment this percpu seq counter when we successfully identify
 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 *    of those PAs inside ext4_mb_discard_group_preallocations().
 *
 * Now to make sure that the regular fast path of block allocation is not
 * affected, as a small optimization we only sample the percpu seq counter
 * on that cpu. Only when the block allocation fails and when freed blocks
 * found were 0, that is when we sample percpu seq counter for all cpus using
 * below function ext4_get_discard_pa_seq_sum(). This happens after making
 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
 */
static DEFINE_PER_CPU(u64, discard_pa_seq);
static inline u64 ext4_get_discard_pa_seq_sum(void)
{
	int __cpu;
	u64 __seq = 0;

	for_each_possible_cpu(__cpu)
		__seq += per_cpu(discard_pa_seq, __cpu);
	return __seq;
}

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
	*bit += ((unsigned long) addr & 7UL) << 3;
	addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
	*bit += ((unsigned long) addr & 3UL) << 3;
	addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
	return addr;
}

static inline int mb_test_bit(int bit, void *addr)
{
	/*
	 * ext4_test_bit on architecture like powerpc
	 * needs unsigned long aligned address
	 */
	addr = mb_correct_addr_and_bit(&bit, addr);
	return ext4_test_bit(bit, addr);
}

static inline void mb_set_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_set_bit(bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_clear_bit(bit, addr);
}

static inline int mb_test_and_clear_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	return ext4_test_and_clear_bit(bit, addr);
}

static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static inline int mb_find_next_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
	char *bb;

	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
	BUG_ON(max == NULL);

	if (order > e4b->bd_blkbits + 1) {
		*max = 0;
		return NULL;
	}

	/* at order 0 we see each particular block */
	if (order == 0) {
		*max = 1 << (e4b->bd_blkbits + 3);
		return e4b->bd_bitmap;
	}

	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];

	return bb;
}
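
/*
 * A worked example of the sizing above (illustrative): with 4k blocks,
 * bd_blkbits = 12, so the order-0 "buddy" is the bitmap itself with
 * *max = 1 << (12 + 3) = 32768 bits, one per block in the group. Higher
 * orders live inside bd_buddy at the precomputed s_mb_offsets[order], each
 * level covering half as many bits as the one below it.
 */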

#ifdef DOUBLE_CHECK
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
				  int first, int count)
{
	int i;
	struct super_block *sb = e4b->bd_sb;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
			ext4_fsblk_t blocknr;

			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
			ext4_grp_locked_error(sb, e4b->bd_group,
					      inode ? inode->i_ino : 0,
					      blocknr,
					      "freeing block already freed "
					      "(bit %u)",
					      first + i);
			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
		}
		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
	int i;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
		unsigned char *b1, *b2;
		int i;
		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
		b2 = (unsigned char *) bitmap;
		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
			if (b1[i] != b2[i]) {
				ext4_msg(e4b->bd_sb, KERN_ERR,
					 "corruption in group %u "
					 "at byte %u(%u): %x in copy != %x "
					 "on disk/prealloc",
					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
				BUG();
			}
		}
	}
}

static void mb_group_bb_bitmap_alloc(struct super_block *sb,
			struct ext4_group_info *grp, ext4_group_t group)
{
	struct buffer_head *bh;

	grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
	if (!grp->bb_bitmap)
		return;

	bh = ext4_read_block_bitmap(sb, group);
	if (IS_ERR_OR_NULL(bh)) {
		kfree(grp->bb_bitmap);
		grp->bb_bitmap = NULL;
		return;
	}

	memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
	put_bh(bh);
}

static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
	kfree(grp->bb_bitmap);
}

#else
static inline void mb_free_blocks_double(struct inode *inode,
				struct ext4_buddy *e4b, int first, int count)
{
	return;
}
static inline void mb_mark_used_double(struct ext4_buddy *e4b,
						int first, int count)
{
	return;
}
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	return;
}

static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
			struct ext4_group_info *grp, ext4_group_t group)
{
	return;
}

static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
	return;
}
#endif

#ifdef AGGRESSIVE_CHECK

#define MB_CHECK_ASSERT(assert)						\
do {									\
	if (!(assert)) {						\
		printk(KERN_EMERG					\
			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
			function, file, line, # assert);		\
		BUG();							\
	}								\
} while (0)

static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
				const char *function, int line)
{
	struct super_block *sb = e4b->bd_sb;
	int order = e4b->bd_blkbits + 1;
	int max;
	int max2;
	int i;
	int j;
	int k;
	int count;
	struct ext4_group_info *grp;
	int fragments = 0;
	int fstart;
	struct list_head *cur;
	void *buddy;
	void *buddy2;

	if (e4b->bd_info->bb_check_counter++ % 10)
		return 0;

	while (order > 1) {
		buddy = mb_find_buddy(e4b, order, &max);
		MB_CHECK_ASSERT(buddy);
		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
		MB_CHECK_ASSERT(buddy2);
		MB_CHECK_ASSERT(buddy != buddy2);
		MB_CHECK_ASSERT(max * 2 == max2);

		count = 0;
		for (i = 0; i < max; i++) {

			if (mb_test_bit(i, buddy)) {
				/* only single bit in buddy2 may be 0 */
				if (!mb_test_bit(i << 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit((i<<1)+1, buddy2));
				}
				continue;
			}

			/* both bits in buddy2 must be 1 */
			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

			for (j = 0; j < (1 << order); j++) {
				k = (i * (1 << order)) + j;
				MB_CHECK_ASSERT(
					!mb_test_bit(k, e4b->bd_bitmap));
			}
			count++;
		}
		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
		order--;
	}

	fstart = -1;
	buddy = mb_find_buddy(e4b, 0, &max);
	for (i = 0; i < max; i++) {
		if (!mb_test_bit(i, buddy)) {
			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
			if (fstart == -1) {
				fragments++;
				fstart = i;
			}
			continue;
		}
		fstart = -1;
		/* check used bits only */
		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
			buddy2 = mb_find_buddy(e4b, j, &max2);
			k = i >> j;
			MB_CHECK_ASSERT(k < max2);
			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
		}
	}
	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

	grp = ext4_get_group_info(sb, e4b->bd_group);
	if (!grp)
		return 0;
	list_for_each(cur, &grp->bb_prealloc_list) {
		ext4_group_t groupnr;
		struct ext4_prealloc_space *pa;
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
		for (i = 0; i < pa->pa_len; i++)
			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
	}
	return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
					__FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif

/*
 * Divide blocks started from @first with length @len into
 * smaller chunks with power of 2 blocks.
 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
 * then increase bb_counters[] for the corresponding chunk size.
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
					struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_grpblk_t min;
	ext4_grpblk_t max;
	ext4_grpblk_t chunk;
	unsigned int border;

	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));

	border = 2 << sb->s_blocksize_bits;

	while (len > 0) {
		/* find how many blocks can be covered since this position */
		max = ffs(first | border) - 1;

		/* find how many blocks of power 2 we need to mark */
		min = fls(len) - 1;

		if (max < min)
			min = max;
		chunk = 1 << min;

		/* mark multiblock chunks only */
		grp->bb_counters[min]++;
		if (min > 0)
			mb_clear_bit(first >> min,
				     buddy + sbi->s_mb_offsets[min]);

		len -= chunk;
		first += chunk;
	}
}
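
/*
 * Worked example (illustrative): a free extent at first = 6, len = 6.
 * Pass 1: max = ffs(6 | border) - 1 = 1 and min = fls(6) - 1 = 2, so min is
 * capped to 1 and a 2-cluster chunk is recorded at order 1 (bit 6 >> 1 = 3),
 * leaving first = 8, len = 4. Pass 2: max = 3, min = 2, recording a
 * 4-cluster chunk at order 2 (bit 8 >> 2 = 2). Net result: clusters 6-7
 * form one order-1 buddy and clusters 8-11 one order-2 buddy.
 */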

static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{
	int order;

	/*
	 * We don't bother with special lists for groups whose average
	 * fragment size is a single block, nor for completely empty groups.
	 */
	order = fls(len) - 2;
	if (order < 0)
		return 0;
	if (order == MB_NUM_ORDERS(sb))
		order--;
	return order;
}

/* Move group to appropriate avg_fragment_size list */
static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int new_order;

	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
		return;

	new_order = mb_avg_fragment_size_order(sb,
					grp->bb_free / grp->bb_fragments);
	if (new_order == grp->bb_avg_fragment_size_order)
		return;

	if (grp->bb_avg_fragment_size_order != -1) {
		write_lock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
		list_del(&grp->bb_avg_fragment_size_node);
		write_unlock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
	}
	grp->bb_avg_fragment_size_order = new_order;
	write_lock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
	list_add_tail(&grp->bb_avg_fragment_size_node,
		&sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
	write_unlock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
}
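
/*
 * For example (illustrative): a group with bb_free = 300 and
 * bb_fragments = 10 has an average fragment size of 30, and
 * fls(30) - 2 = 3, so the group is placed on list 3. Note that the
 * per-list rwlock is taken in write mode only when the group actually
 * changes lists.
 */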

/*
 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
 * cr level needs an update.
 */
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
			enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *iter;
	int i;

	if (ac->ac_status == AC_STATUS_FOUND)
		return;

	if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
		atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);

	for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
		if (list_empty(&sbi->s_mb_largest_free_orders[i]))
			continue;
		read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
		if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
			read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
			continue;
		}
		list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
				    bb_largest_free_order_node) {
			if (sbi->s_mb_stats)
				atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
			if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
				*group = iter->bb_group;
				ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
				read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
				return;
			}
		}
		read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
	}

	/* Increment cr and search again if no group is found */
	*new_cr = CR_GOAL_LEN_FAST;
}

/*
 * Find a suitable group of given order from the average fragments list.
 */
static struct ext4_group_info *
ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
	rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
	struct ext4_group_info *grp = NULL, *iter;
	enum criteria cr = ac->ac_criteria;

	if (list_empty(frag_list))
		return NULL;
	read_lock(frag_list_lock);
	if (list_empty(frag_list)) {
		read_unlock(frag_list_lock);
		return NULL;
	}
	list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
		if (sbi->s_mb_stats)
			atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
		if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
			grp = iter;
			break;
		}
	}
	read_unlock(frag_list_lock);
	return grp;
}

/*
 * Choose next group by traversing average fragment size list of suitable
 * order. Updates *new_cr if cr level needs an update.
 */
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *grp = NULL;
	int i;

	if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
		if (sbi->s_mb_stats)
			atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
	}

	for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
	     i < MB_NUM_ORDERS(ac->ac_sb); i++) {
		grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
		if (grp) {
			*group = grp->bb_group;
			ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
			return;
		}
	}

	*new_cr = CR_BEST_AVAIL_LEN;
}

/*
 * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
 * order we have and proactively trim the goal request length to that order to
 * find a suitable group faster.
 *
 * This optimizes allocation speed at the cost of slightly reduced
 * preallocations. However, we make sure that we don't trim the request too
 * much and fall to CR_GOAL_LEN_SLOW in that case.
 */
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *grp = NULL;
	int i, order, min_order;
	unsigned long num_stripe_clusters = 0;

	if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
		if (sbi->s_mb_stats)
			atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
	}

	/*
	 * mb_avg_fragment_size_order() returns order in a way that makes
	 * retrieving back the length using (1 << order) inaccurate. Hence, use
	 * fls() instead since we need to know the actual length while modifying
	 * goal length.
	 */
	order = fls(ac->ac_g_ex.fe_len) - 1;
	min_order = order - sbi->s_mb_best_avail_max_trim_order;
	if (min_order < 0)
		min_order = 0;

	if (sbi->s_stripe > 0) {
		/*
		 * We are assuming that stripe size is always a multiple of
		 * cluster ratio otherwise __ext4_fill_super exits early.
		 */
		num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
		if (1 << min_order < num_stripe_clusters)
			/*
			 * We consider 1 order less because later we round
			 * up the goal len to num_stripe_clusters
			 */
			min_order = fls(num_stripe_clusters) - 1;
	}

	if (1 << min_order < ac->ac_o_ex.fe_len)
		min_order = fls(ac->ac_o_ex.fe_len);

	for (i = order; i >= min_order; i--) {
		int frag_order;
		/*
		 * Scale down goal len to make sure we find something
		 * in the free fragments list. Basically, reduce
		 * preallocations.
		 */
		ac->ac_g_ex.fe_len = 1 << i;

		if (num_stripe_clusters > 0) {
			/*
			 * Try to round up the adjusted goal length to
			 * stripe size (in cluster units) multiple for
			 * efficiency.
			 */
			ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
						     num_stripe_clusters);
		}

		frag_order = mb_avg_fragment_size_order(ac->ac_sb,
							ac->ac_g_ex.fe_len);

		grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
		if (grp)
			break;
	}

	if (grp) {
		*group = grp->bb_group;
		ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
	} else {
		/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
		ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
		*new_cr = CR_GOAL_LEN_SLOW;
	}
}
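
/*
 * Trimming example for ext4_mb_choose_next_group_best_avail() above
 * (illustrative numbers, assuming no stripe and an original request small
 * enough not to raise min_order): with a goal of 1000 clusters and
 * s_mb_best_avail_max_trim_order = 3, order = fls(1000) - 1 = 9 and
 * min_order = 6, so the loop retries goal lengths 512, 256, 128 and 64
 * before restoring the original goal and falling back to CR_GOAL_LEN_SLOW.
 */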

static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{
	if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
		return 0;
	if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
		return 0;
	if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
		return 0;
	return 1;
}

/*
 * Return next linear group for allocation. If linear traversal should not be
 * performed, this function just returns the same group
 */
static ext4_group_t
next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
		  ext4_group_t ngroups)
{
	if (!should_optimize_scan(ac))
		goto inc_and_return;

	if (ac->ac_groups_linear_remaining) {
		ac->ac_groups_linear_remaining--;
		goto inc_and_return;
	}

	return group;
inc_and_return:
	/*
	 * Artificially restricted ngroups for non-extent
	 * files makes group > ngroups possible on first loop.
	 */
	return group + 1 >= ngroups ? 0 : group + 1;
}

/*
 * ext4_mb_choose_next_group: choose next group for allocation.
 *
 * @ac        Allocation Context
 * @new_cr    This is an output parameter. If there is no good group
 *            available at current CR level, this field is updated to indicate
 *            the new cr level that should be used.
 * @group     This is an input / output parameter. As an input it indicates the
 *            next group that the allocator intends to use for allocation. As
 *            output, this field indicates the next group that should be used as
 *            determined by the optimization functions.
 * @ngroups   Total number of groups
 */
static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
		enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
	*new_cr = ac->ac_criteria;

	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
		*group = next_linear_group(ac, *group, ngroups);
		return;
	}

	if (*new_cr == CR_POWER2_ALIGNED) {
		ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
	} else if (*new_cr == CR_GOAL_LEN_FAST) {
		ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
	} else if (*new_cr == CR_BEST_AVAIL_LEN) {
		ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
	} else {
		/*
		 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
		 * bb_free. But until that happens, we should never come here.
		 */
		WARN_ON(1);
	}
}

/*
 * Cache the order of the largest free extent we have available in this block
 * group.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int i;

	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
		if (grp->bb_counters[i] > 0)
			break;
	/* No need to move between order lists? */
	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
	    i == grp->bb_largest_free_order) {
		grp->bb_largest_free_order = i;
		return;
	}

	if (grp->bb_largest_free_order >= 0) {
		write_lock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
		list_del_init(&grp->bb_largest_free_order_node);
		write_unlock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
	}
	grp->bb_largest_free_order = i;
	if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
		write_lock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
		list_add_tail(&grp->bb_largest_free_order_node,
		      &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
		write_unlock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
	}
}
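
/*
 * Example (illustrative, assuming suitably aligned extents): a group whose
 * free space consists of extents of 3, 8 and 24 clusters holds buddies of
 * orders {1, 0}, {3} and {4, 3} respectively, so the scan above stops at
 * bb_counters[4] > 0 and the group is filed on largest-free-order list 4.
 */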

static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
			    void *buddy, void *bitmap, ext4_group_t group,
			    struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
	ext4_grpblk_t i = 0;
	ext4_grpblk_t first;
	ext4_grpblk_t len;
	unsigned free = 0;
	unsigned fragments = 0;
	unsigned long long period = get_cycles();

	/* initialize buddy from bitmap which is aggregation
	 * of on-disk bitmap and preallocations */
	i = mb_find_next_zero_bit(bitmap, max, 0);
	grp->bb_first_free = i;
	while (i < max) {
		fragments++;
		first = i;
		i = mb_find_next_bit(bitmap, max, i);
		len = i - first;
		free += len;
		if (len > 1)
			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
		else
			grp->bb_counters[0]++;
		if (i < max)
			i = mb_find_next_zero_bit(bitmap, max, i);
	}
	grp->bb_fragments = fragments;

	if (free != grp->bb_free) {
		ext4_grp_locked_error(sb, group, 0, 0,
				      "block bitmap and bg descriptor "
				      "inconsistent: %u vs %u free clusters",
				      free, grp->bb_free);
		/*
		 * If we intend to continue, we consider group descriptor
		 * corrupt and update bb_free using bitmap value
		 */
		grp->bb_free = free;
		ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
	}
	mb_set_largest_free_order(sb, grp);
	mb_update_avg_fragment_size(sb, grp);

	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

	period = get_cycles() - period;
	atomic_inc(&sbi->s_mb_buddies_generated);
	atomic64_add(period, &sbi->s_mb_generation_time);
}

/* The buddy information is attached to the buddy cache inode
 * for convenience. The information regarding each group
 * is loaded via ext4_mb_load_buddy. The information involves
 * block bitmap and buddy information, stored in the inode as
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
 * So it can have information regarding groups_per_page which
 * is blocks_per_page/2
 *
 * Locking note: This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */
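
/*
 * Layout example (illustrative): with 4k pages and 1k blocks,
 * blocks_per_page = 4 and groups_per_page = 2, so page 0 holds
 * [g0 bitmap][g0 buddy][g1 bitmap][g1 buddy]. With 4k pages and 4k blocks,
 * blocks_per_page = 1 and the bitmap and buddy of group 0 land on pages 0
 * and 1 respectively.
 */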

static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
	ext4_group_t ngroups;
	int blocksize;
	int blocks_per_page;
	int groups_per_page;
	int err = 0;
	int i;
	ext4_group_t first_group, group;
	int first_block;
	struct super_block *sb;
	struct buffer_head *bhs;
	struct buffer_head **bh = NULL;
	struct inode *inode;
	char *data;
	char *bitmap;
	struct ext4_group_info *grinfo;

	inode = page->mapping->host;
	sb = inode->i_sb;
	ngroups = ext4_get_groups_count(sb);
	blocksize = i_blocksize(inode);
	blocks_per_page = PAGE_SIZE / blocksize;

	mb_debug(sb, "init page %lu\n", page->index);

	groups_per_page = blocks_per_page >> 1;
	if (groups_per_page == 0)
		groups_per_page = 1;

	/* allocate buffer_heads to read bitmaps */
	if (groups_per_page > 1) {
		i = sizeof(struct buffer_head *) * groups_per_page;
		bh = kzalloc(i, gfp);
		if (bh == NULL)
			return -ENOMEM;
	} else
		bh = &bhs;

	first_group = page->index * blocks_per_page / 2;

	/* read all groups the page covers into the cache */
	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
		if (group >= ngroups)
			break;

		grinfo = ext4_get_group_info(sb, group);
		if (!grinfo)
			continue;
		/*
		 * If page is uptodate then we came here after online resize
		 * which added some new uninitialized group info structs, so
		 * we must skip all initialized uptodate buddies on the page,
		 * which may be currently in use by an allocating task.
		 */
		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
			bh[i] = NULL;
			continue;
		}
		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
		if (IS_ERR(bh[i])) {
			err = PTR_ERR(bh[i]);
			bh[i] = NULL;
			goto out;
		}
		mb_debug(sb, "read bitmap for group %u\n", group);
	}

	/* wait for I/O completion */
	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
		int err2;

		if (!bh[i])
			continue;
		err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
		if (!err)
			err = err2;
	}

	first_block = page->index * blocks_per_page;
	for (i = 0; i < blocks_per_page; i++) {
		group = (first_block + i) >> 1;
		if (group >= ngroups)
			break;

		if (!bh[group - first_group])
			/* skip initialized uptodate buddy */
			continue;

		if (!buffer_verified(bh[group - first_group]))
			/* Skip faulty bitmaps */
			continue;
		err = 0;

		/*
		 * data carries information regarding this
		 * particular group in the format specified
		 * above
		 *
		 */
		data = page_address(page) + (i * blocksize);
		bitmap = bh[group - first_group]->b_data;

		/*
		 * We place the buddy block and bitmap block
		 * close together
		 */
		if ((first_block + i) & 1) {
			/* this is block of buddy */
			BUG_ON(incore == NULL);
			mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_buddy_bitmap_load(sb, group);
			grinfo = ext4_get_group_info(sb, group);
			if (!grinfo) {
				err = -EFSCORRUPTED;
				goto out;
			}
			grinfo->bb_fragments = 0;
			memset(grinfo->bb_counters, 0,
			       sizeof(*grinfo->bb_counters) *
			       (MB_NUM_ORDERS(sb)));
			/*
			 * incore got set to the group block bitmap below
			 */
			ext4_lock_group(sb, group);
			/* init the buddy */
			memset(data, 0xff, blocksize);
			ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
			ext4_unlock_group(sb, group);
			incore = NULL;
		} else {
			/* this is block of bitmap */
			BUG_ON(incore != NULL);
			mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_bitmap_load(sb, group);

			/* see comments in ext4_mb_put_pa() */
			ext4_lock_group(sb, group);
			memcpy(data, bitmap, blocksize);

			/* mark all preallocated blks used in in-core bitmap */
			ext4_mb_generate_from_pa(sb, data, group);
			ext4_mb_generate_from_freelist(sb, data, group);
			ext4_unlock_group(sb, group);

			/* set incore so that the buddy information can be
			 * generated using this
			 */
			incore = data;
		}
	}
	SetPageUptodate(page);

out:
	if (bh) {
		for (i = 0; i < groups_per_page; i++)
			brelse(bh[i]);
		if (bh != &bhs)
			kfree(bh);
	}
	return err;
}

/*
 * Lock the buddy and bitmap pages. This makes sure another parallel
 * init_group on the same buddy page doesn't happen while we hold the buddy
 * page lock.
 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
 */
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
	int block, pnum, poff;
	int blocks_per_page;
	struct page *page;

	e4b->bd_buddy_page = NULL;
	e4b->bd_bitmap_page = NULL;

	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, gfp);
	if (!page)
		return -ENOMEM;
	BUG_ON(page->mapping != inode->i_mapping);
	e4b->bd_bitmap_page = page;
	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);

	if (blocks_per_page >= 2) {
		/* buddy and bitmap are on the same page */
		return 0;
	}

	block++;
	pnum = block / blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, gfp);
	if (!page)
		return -ENOMEM;
	BUG_ON(page->mapping != inode->i_mapping);
	e4b->bd_buddy_page = page;
	return 0;
}

static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
	if (e4b->bd_bitmap_page) {
		unlock_page(e4b->bd_bitmap_page);
		put_page(e4b->bd_bitmap_page);
	}
	if (e4b->bd_buddy_page) {
		unlock_page(e4b->bd_buddy_page);
		put_page(e4b->bd_buddy_page);
	}
}
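
/*
 * Index arithmetic example (illustrative): group g uses buddy-cache blocks
 * 2g (bitmap) and 2g + 1 (buddy). With blocks_per_page = 4, group 5 maps to
 * block 10, i.e. page pnum = 10 / 4 = 2 at offset poff = 10 % 4 = 2, and
 * its buddy sits right next to it on the same page.
 */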

/*
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{

	struct ext4_group_info *this_grp;
	struct ext4_buddy e4b;
	struct page *page;
	int ret = 0;

	might_sleep();
	mb_debug(sb, "init group %u\n", group);
	this_grp = ext4_get_group_info(sb, group);
	if (!this_grp)
		return -EFSCORRUPTED;

	/*
	 * This ensures that we don't reinit the buddy cache
	 * pages which map to the group from which we are already
	 * allocating. If we are looking at the buddy cache we would
	 * have taken a reference using ext4_mb_load_buddy and that
	 * would have pinned the buddy page to the page cache.
	 * The call to ext4_mb_get_buddy_page_lock will mark the
	 * page accessed.
	 */
	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
		/*
		 * somebody initialized the group
		 * return without doing anything
		 */
		goto err;
	}

	page = e4b.bd_bitmap_page;
	ret = ext4_mb_init_cache(page, NULL, gfp);
	if (ret)
		goto err;
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}

	if (e4b.bd_buddy_page == NULL) {
		/*
		 * If both the bitmap and buddy are in
		 * the same page we don't need to force
		 * init the buddy
		 */
		ret = 0;
		goto err;
	}
	/* init buddy cache */
	page = e4b.bd_buddy_page;
	ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
	if (ret)
		goto err;
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
err:
	ext4_mb_put_buddy_page_lock(&e4b);
	return ret;
}

/*
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
		       struct ext4_buddy *e4b, gfp_t gfp)
{
	int blocks_per_page;
	int block;
	int pnum;
	int poff;
	struct page *page;
	int ret;
	struct ext4_group_info *grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct inode *inode = sbi->s_buddy_cache;

	might_sleep();
	mb_debug(sb, "load group %u\n", group);

	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
	grp = ext4_get_group_info(sb, group);
	if (!grp)
		return -EFSCORRUPTED;

	e4b->bd_blkbits = sb->s_blocksize_bits;
	e4b->bd_info = grp;
	e4b->bd_sb = sb;
	e4b->bd_group = group;
	e4b->bd_buddy_page = NULL;
	e4b->bd_bitmap_page = NULL;

	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		/*
		 * we need full data about the group
		 * to make a good selection
		 */
		ret = ext4_mb_init_group(sb, group, gfp);
		if (ret)
			return ret;
	}

	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	/* we could use find_or_create_page(), but it locks page
	 * what we'd like to avoid in fast path ... */
	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			/*
			 * drop the page reference and try
			 * to get the page with lock. If we
			 * are not uptodate that implies
			 * somebody just created the page but
			 * is yet to initialize the same. So
			 * wait for it to initialize.
			 */
			put_page(page);
		page = find_or_create_page(inode->i_mapping, pnum, gfp);
		if (page) {
			if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
	"ext4: bitmap's page->mapping != inode->i_mapping\n")) {
				/* should never happen */
				unlock_page(page);
				ret = -EINVAL;
				goto err;
			}
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, NULL, gfp);
				if (ret) {
					unlock_page(page);
					goto err;
				}
				mb_cmp_bitmaps(e4b, page_address(page) +
					       (poff * sb->s_blocksize));
			}
			unlock_page(page);
		}
	}
	if (page == NULL) {
		ret = -ENOMEM;
		goto err;
	}
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}

	/* Pages marked accessed already */
	e4b->bd_bitmap_page = page;
	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);

	block++;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			put_page(page);
		page = find_or_create_page(inode->i_mapping, pnum, gfp);
		if (page) {
			if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
	"ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
				/* should never happen */
				unlock_page(page);
				ret = -EINVAL;
				goto err;
			}
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
							 gfp);
				if (ret) {
					unlock_page(page);
					goto err;
				}
			}
			unlock_page(page);
		}
	}
	if (page == NULL) {
		ret = -ENOMEM;
		goto err;
	}
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}

	/* Pages marked accessed already */
	e4b->bd_buddy_page = page;
	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);

	return 0;

err:
	if (page)
		put_page(page);
	if (e4b->bd_bitmap_page)
		put_page(e4b->bd_bitmap_page);

	e4b->bd_buddy = NULL;
	e4b->bd_bitmap = NULL;
	return ret;
}

static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
			      struct ext4_buddy *e4b)
{
	return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
}

static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
	if (e4b->bd_bitmap_page)
		put_page(e4b->bd_bitmap_page);
	if (e4b->bd_buddy_page)
		put_page(e4b->bd_buddy_page);
}


static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
	int order = 1, max;
	void *bb;

	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

	while (order <= e4b->bd_blkbits + 1) {
		bb = mb_find_buddy(e4b, order, &max);
		if (!mb_test_bit(block >> order, bb)) {
			/* this block is part of buddy of order 'order' */
			return order;
		}
		order++;
	}
	return 0;
}

static void mb_clear_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: clear whole word at once */
			addr = bm + (cur >> 3);
			*addr = 0;
			cur += 32;
			continue;
		}
		mb_clear_bit(cur, bm);
		cur++;
	}
}
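
/*
 * Fast-path example (illustrative): mb_clear_bits(bm, 32, 64) starts
 * word-aligned, so the loop clears the two 32-bit words at byte offsets
 * 32 >> 3 = 4 and 64 >> 3 = 8 directly instead of clearing 64 individual
 * bits. Unaligned head and tail bits fall back to mb_clear_bit().
 */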
1749 /* clear bits in given range
1750 * will return the first found zero bit if any, -1 otherwise
1751 */
1752 static int mb_test_and_clear_bits(void *bm, int cur, int len)
1753 {
1754 __u32 *addr;
1755 int zero_bit = -1;
1756
1757 len = cur + len;
1758 while (cur < len) {
1759 if ((cur & 31) == 0 && (len - cur) >= 32) {
1760 /* fast path: clear whole word at once */
1761 addr = bm + (cur >> 3);
1762 if (*addr != (__u32)(-1) && zero_bit == -1)
1763 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1764 *addr = 0;
1765 cur += 32;
1766 continue;
1767 }
1768 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1769 zero_bit = cur;
1770 cur++;
1771 }
1772
1773 return zero_bit;
1774 }
1775
1776 void mb_set_bits(void *bm, int cur, int len)
1777 {
1778 __u32 *addr;
1779
1780 len = cur + len;
1781 while (cur < len) {
1782 if ((cur & 31) == 0 && (len - cur) >= 32) {
1783 /* fast path: set whole word at once */
1784 addr = bm + (cur >> 3);
1785 *addr = 0xffffffff;
1786 cur += 32;
1787 continue;
1788 }
1789 mb_set_bit(cur, bm);
1790 cur++;
1791 }
1792 }
1793
1794 static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
1795 {
1796 if (mb_test_bit(*bit + side, bitmap)) {
1797 mb_clear_bit(*bit, bitmap);
1798 (*bit) -= side;
1799 return 1;
1800 }
1801 else {
1802 (*bit) += side;
1803 mb_set_bit(*bit, bitmap);
1804 return -1;
1805 }
1806 }
1807
1808 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1809 {
1810 int max;
1811 int order = 1;
1812 void *buddy = mb_find_buddy(e4b, order, &max);
1813
1814 while (buddy) {
1815 void *buddy2;
1816
1817 /* Bits in range [first; last] are known to be set since
1818 * corresponding blocks were allocated. Bits in range
1819 * (first; last) will stay set because they form buddies on
1820 * the upper layer. We just deal with borders if they don't
1821 * align with the upper layer and then go up.
1822 * Releasing the entire group is all about clearing
1823 * a single bit of the highest order buddy.
1824 */
1825
1826 /* Example:
1827 * ---------------------------------
1828 * |      1      |      1      |      1      |      1      |
1829 * ---------------------------------
1830 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1831 * ---------------------------------
1832 *   0   1   2   3   4   5   6   7
1833 *      \_____________________/
1834 *
1835 * Neither [1] nor [6] is aligned to the above layer.
1836 * Left neighbour [0] is free, so mark it busy,
1837 * decrease bb_counters and extend range to
1838 * [0; 6]
1839 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1840 * mark [6] free, increase bb_counters and shrink range to
1841 * [0; 5].
1842 * Then shift range to [0; 2], go up and do the same.
1843 */
1844
1845
1846 if (first & 1)
1847 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1848 if (!(last & 1))
1849 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1850 if (first > last)
1851 break;
1852 order++;
1853
1854 buddy2 = mb_find_buddy(e4b, order, &max);
1855 if (!buddy2) {
1856 mb_clear_bits(buddy, first, last - first + 1);
1857 e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1858 break;
1859 }
1860 first >>= 1;
1861 last >>= 1;
1862 buddy = buddy2;
1863 }
1864 }
1865
1866 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1867 int first, int count)
1868 {
1869 int left_is_free = 0;
1870 int right_is_free = 0;
1871 int block;
1872 int last = first + count - 1;
1873 struct super_block *sb = e4b->bd_sb;
1874
1875 if (WARN_ON(count == 0))
1876 return;
1877 BUG_ON(last >= (sb->s_blocksize << 3));
1878 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1879 /* Don't bother if the block group is corrupt.
*/
1880 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1881 return;
1882
1883 mb_check_buddy(e4b);
1884 mb_free_blocks_double(inode, e4b, first, count);
1885
1886 this_cpu_inc(discard_pa_seq);
1887 e4b->bd_info->bb_free += count;
1888 if (first < e4b->bd_info->bb_first_free)
1889 e4b->bd_info->bb_first_free = first;
1890
1891 /* access memory sequentially: check the left neighbour,
1892 * clear the range and then check the right neighbour
1893 */
1894 if (first != 0)
1895 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1896 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1897 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1898 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1899
1900 if (unlikely(block != -1)) {
1901 struct ext4_sb_info *sbi = EXT4_SB(sb);
1902 ext4_fsblk_t blocknr;
1903
1904 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1905 blocknr += EXT4_C2B(sbi, block);
1906 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
1907 ext4_grp_locked_error(sb, e4b->bd_group,
1908 inode ? inode->i_ino : 0,
1909 blocknr,
1910 "freeing already freed block (bit %u); block bitmap corrupt.",
1911 block);
1912 ext4_mark_group_bitmap_corrupted(
1913 sb, e4b->bd_group,
1914 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1915 }
1916 goto done;
1917 }
1918
1919 /* let's maintain fragments counter */
1920 if (left_is_free && right_is_free)
1921 e4b->bd_info->bb_fragments--;
1922 else if (!left_is_free && !right_is_free)
1923 e4b->bd_info->bb_fragments++;
1924
1925 /* buddy[0] == bd_bitmap is a special case, so handle
1926 * it right away and let mb_buddy_mark_free stay free of
1927 * zero order checks.
1928 * Check if neighbours are to be coalesced,
1929 * adjust bitmap bb_counters and borders appropriately.
1930 */
1931 if (first & 1) {
1932 first += !left_is_free;
1933 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1934 }
1935 if (!(last & 1)) {
1936 last -= !right_is_free;
1937 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1938 }
1939
1940 if (first <= last)
1941 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1942
1943 done:
1944 mb_set_largest_free_order(sb, e4b->bd_info);
1945 mb_update_avg_fragment_size(sb, e4b->bd_info);
1946 mb_check_buddy(e4b);
1947 }
1948
1949 static int mb_find_extent(struct ext4_buddy *e4b, int block,
1950 int needed, struct ext4_free_extent *ex)
1951 {
1952 int next = block;
1953 int max, order;
1954 void *buddy;
1955
1956 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1957 BUG_ON(ex == NULL);
1958
1959 buddy = mb_find_buddy(e4b, 0, &max);
1960 BUG_ON(buddy == NULL);
1961 BUG_ON(block >= max);
1962 if (mb_test_bit(block, buddy)) {
1963 ex->fe_len = 0;
1964 ex->fe_start = 0;
1965 ex->fe_group = 0;
1966 return 0;
1967 }
1968
1969 /* find actual order */
1970 order = mb_find_order_for_block(e4b, block);
1971 block = block >> order;
1972
1973 ex->fe_len = 1 << order;
1974 ex->fe_start = block << order;
1975 ex->fe_group = e4b->bd_group;
1976
1977 /* calc difference from given start */
1978 next = next - ex->fe_start;
1979 ex->fe_len -= next;
1980 ex->fe_start += next;
1981
1982 while (needed > ex->fe_len &&
1983 mb_find_buddy(e4b, order, &max)) {
1984
1985 if (block + 1 >= max)
1986 break;
1987
1988 next = (block + 1) * (1 << order);
1989 if (mb_test_bit(next, e4b->bd_bitmap))
1990 break;
1991
1992 order = mb_find_order_for_block(e4b, next);
1993
1994 block = next >> order;
1995 ex->fe_len += 1 << order;
1996 }
1997
1998 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
1999 /* Should never happen!
(but apparently sometimes does?!?) */ 2000 WARN_ON(1); 2001 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, 2002 "corruption or bug in mb_find_extent " 2003 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", 2004 block, order, needed, ex->fe_group, ex->fe_start, 2005 ex->fe_len, ex->fe_logical); 2006 ex->fe_len = 0; 2007 ex->fe_start = 0; 2008 ex->fe_group = 0; 2009 } 2010 return ex->fe_len; 2011 } 2012 2013 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 2014 { 2015 int ord; 2016 int mlen = 0; 2017 int max = 0; 2018 int cur; 2019 int start = ex->fe_start; 2020 int len = ex->fe_len; 2021 unsigned ret = 0; 2022 int len0 = len; 2023 void *buddy; 2024 bool split = false; 2025 2026 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 2027 BUG_ON(e4b->bd_group != ex->fe_group); 2028 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 2029 mb_check_buddy(e4b); 2030 mb_mark_used_double(e4b, start, len); 2031 2032 this_cpu_inc(discard_pa_seq); 2033 e4b->bd_info->bb_free -= len; 2034 if (e4b->bd_info->bb_first_free == start) 2035 e4b->bd_info->bb_first_free += len; 2036 2037 /* let's maintain fragments counter */ 2038 if (start != 0) 2039 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 2040 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 2041 max = !mb_test_bit(start + len, e4b->bd_bitmap); 2042 if (mlen && max) 2043 e4b->bd_info->bb_fragments++; 2044 else if (!mlen && !max) 2045 e4b->bd_info->bb_fragments--; 2046 2047 /* let's maintain buddy itself */ 2048 while (len) { 2049 if (!split) 2050 ord = mb_find_order_for_block(e4b, start); 2051 2052 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 2053 /* the whole chunk may be allocated at once! */ 2054 mlen = 1 << ord; 2055 if (!split) 2056 buddy = mb_find_buddy(e4b, ord, &max); 2057 else 2058 split = false; 2059 BUG_ON((start >> ord) >= max); 2060 mb_set_bit(start >> ord, buddy); 2061 e4b->bd_info->bb_counters[ord]--; 2062 start += mlen; 2063 len -= mlen; 2064 BUG_ON(len < 0); 2065 continue; 2066 } 2067 2068 /* store for history */ 2069 if (ret == 0) 2070 ret = len | (ord << 16); 2071 2072 /* we have to split large buddy */ 2073 BUG_ON(ord <= 0); 2074 buddy = mb_find_buddy(e4b, ord, &max); 2075 mb_set_bit(start >> ord, buddy); 2076 e4b->bd_info->bb_counters[ord]--; 2077 2078 ord--; 2079 cur = (start >> ord) & ~1U; 2080 buddy = mb_find_buddy(e4b, ord, &max); 2081 mb_clear_bit(cur, buddy); 2082 mb_clear_bit(cur + 1, buddy); 2083 e4b->bd_info->bb_counters[ord]++; 2084 e4b->bd_info->bb_counters[ord]++; 2085 split = true; 2086 } 2087 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 2088 2089 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 2090 mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 2091 mb_check_buddy(e4b); 2092 2093 return ret; 2094 } 2095 2096 /* 2097 * Must be called under group lock! 
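 *
 * For example, the callers in this file follow the pattern (usage
 * sketch, mirroring ext4_mb_try_best_found()):
 *
 *	ext4_lock_group(ac->ac_sb, group);
 *	ext4_mb_use_best_found(ac, e4b);
 *	ext4_unlock_group(ac->ac_sb, group);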
2098 */
2099 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
2100 struct ext4_buddy *e4b)
2101 {
2102 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2103 int ret;
2104
2105 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
2106 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2107
2108 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
2109 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
2110 ret = mb_mark_used(e4b, &ac->ac_b_ex);
2111
2112 /* preallocation can change ac_b_ex, thus we store the actually
2113 * allocated blocks for history */
2114 ac->ac_f_ex = ac->ac_b_ex;
2115
2116 ac->ac_status = AC_STATUS_FOUND;
2117 ac->ac_tail = ret & 0xffff;
2118 ac->ac_buddy = ret >> 16;
2119
2120 /*
2121 * take the page reference. We want the page to be pinned
2122 * so that we don't get an ext4_mb_init_cache() call for this
2123 * group until we update the bitmap. That would mean we
2124 * double allocate blocks. The reference is dropped
2125 * in ext4_mb_release_context
2126 */
2127 ac->ac_bitmap_page = e4b->bd_bitmap_page;
2128 get_page(ac->ac_bitmap_page);
2129 ac->ac_buddy_page = e4b->bd_buddy_page;
2130 get_page(ac->ac_buddy_page);
2131 /* store last allocated for subsequent stream allocation */
2132 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2133 spin_lock(&sbi->s_md_lock);
2134 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
2135 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
2136 spin_unlock(&sbi->s_md_lock);
2137 }
2138 /*
2139 * As we've just preallocated more space than
2140 * the user originally requested, we store the allocated
2141 * space in a special descriptor.
2142 */
2143 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2144 ext4_mb_new_preallocation(ac);
2145
2146 }
2147
2148 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
2149 struct ext4_buddy *e4b,
2150 int finish_group)
2151 {
2152 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2153 struct ext4_free_extent *bex = &ac->ac_b_ex;
2154 struct ext4_free_extent *gex = &ac->ac_g_ex;
2155
2156 if (ac->ac_status == AC_STATUS_FOUND)
2157 return;
2158 /*
2159 * We don't want to scan for a whole year
2160 */
2161 if (ac->ac_found > sbi->s_mb_max_to_scan &&
2162 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2163 ac->ac_status = AC_STATUS_BREAK;
2164 return;
2165 }
2166
2167 /*
2168 * Haven't found a good chunk so far, let's continue
2169 */
2170 if (bex->fe_len < gex->fe_len)
2171 return;
2172
2173 if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
2174 ext4_mb_use_best_found(ac, e4b);
2175 }
2176
2177 /*
2178 * The routine checks whether the found extent is good enough. If it is,
2179 * then the extent gets marked used and a flag is set in the context
2180 * to stop scanning. Otherwise, the extent is compared with the
2181 * previously found extent and if the new one is better, then it's stored
2182 * in the context. Later, the best found extent will be used, if
2183 * mballoc can't find a good enough extent.
2184 *
2185 * The algorithm used is roughly as follows:
2186 *
2187 * * If the free extent found is exactly as big as the goal, then
2188 * stop the scan and use it immediately
2189 *
2190 * * If the free extent found is smaller than the goal, then keep retrying
2191 * up to a max of sbi->s_mb_max_to_scan times (default 200). After
2192 * that stop scanning and use whatever we have.
2193 *
2194 * * If the free extent found is bigger than the goal, then keep retrying
2195 * up to a max of sbi->s_mb_min_to_scan times (default 10) before
2196 * stopping the scan and using the extent.
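 *
 * Worked example (illustrative numbers): with a goal of 16 clusters,
 * a 16-cluster extent is used at once. If the scan instead finds
 * extents of 8, then 24, then 20 clusters, the 20-cluster extent
 * replaces the 24-cluster one as best (both satisfy the goal, so the
 * smaller wins), and after enough extents have been examined the best
 * one is taken.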
2197 *
2198 *
2199 * FIXME: real allocation policy is to be designed yet!
2200 */
2201 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
2202 struct ext4_free_extent *ex,
2203 struct ext4_buddy *e4b)
2204 {
2205 struct ext4_free_extent *bex = &ac->ac_b_ex;
2206 struct ext4_free_extent *gex = &ac->ac_g_ex;
2207
2208 BUG_ON(ex->fe_len <= 0);
2209 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2210 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
2211 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
2212
2213 ac->ac_found++;
2214 ac->ac_cX_found[ac->ac_criteria]++;
2215
2216 /*
2217 * The special case - take what you catch first
2218 */
2219 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2220 *bex = *ex;
2221 ext4_mb_use_best_found(ac, e4b);
2222 return;
2223 }
2224
2225 /*
2226 * Let's check whether the chunk is good enough
2227 */
2228 if (ex->fe_len == gex->fe_len) {
2229 *bex = *ex;
2230 ext4_mb_use_best_found(ac, e4b);
2231 return;
2232 }
2233
2234 /*
2235 * If this is the first found extent, just store it in the context
2236 */
2237 if (bex->fe_len == 0) {
2238 *bex = *ex;
2239 return;
2240 }
2241
2242 /*
2243 * If the newly found extent is better, store it in the context
2244 */
2245 if (bex->fe_len < gex->fe_len) {
2246 /* if the request isn't satisfied, any found extent
2247 * larger than the previous best one is better */
2248 if (ex->fe_len > bex->fe_len)
2249 *bex = *ex;
2250 } else if (ex->fe_len > gex->fe_len) {
2251 /* if the request is satisfied, then we try to find
2252 * an extent that still satisfies the request, but is
2253 * smaller than the previous one */
2254 if (ex->fe_len < bex->fe_len)
2255 *bex = *ex;
2256 }
2257
2258 ext4_mb_check_limits(ac, e4b, 0);
2259 }
2260
2261 static noinline_for_stack
2262 void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
2263 struct ext4_buddy *e4b)
2264 {
2265 struct ext4_free_extent ex = ac->ac_b_ex;
2266 ext4_group_t group = ex.fe_group;
2267 int max;
2268 int err;
2269
2270 BUG_ON(ex.fe_len <= 0);
2271 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2272 if (err)
2273 return;
2274
2275 ext4_lock_group(ac->ac_sb, group);
2276 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
2277
2278 if (max > 0) {
2279 ac->ac_b_ex = ex;
2280 ext4_mb_use_best_found(ac, e4b);
2281 }
2282
2283 ext4_unlock_group(ac->ac_sb, group);
2284 ext4_mb_unload_buddy(e4b);
2285 }
2286
2287 static noinline_for_stack
2288 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
2289 struct ext4_buddy *e4b)
2290 {
2291 ext4_group_t group = ac->ac_g_ex.fe_group;
2292 int max;
2293 int err;
2294 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2295 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2296 struct ext4_free_extent ex;
2297
2298 if (!grp)
2299 return -EFSCORRUPTED;
2300 if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
2301 return 0;
2302 if (grp->bb_free == 0)
2303 return 0;
2304
2305 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
2306 if (err)
2307 return err;
2308
2309 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
2310 ext4_mb_unload_buddy(e4b);
2311 return 0;
2312 }
2313
2314 ext4_lock_group(ac->ac_sb, group);
2315 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
2316 ac->ac_g_ex.fe_len, &ex);
2317 ex.fe_logical = 0xDEADFA11; /* debug value */
2318
2319 if (max >= ac->ac_g_ex.fe_len &&
2320 ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
2321 ext4_fsblk_t start;
2322
2323 start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
2324 /* use do_div to get the remainder (it would be a 64-bit modulo otherwise) */
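/*
 * Illustrative example (made-up numbers): with start == 1000448 and
 * sbi->s_stripe == 256, do_div() divides start in place (it becomes
 * 3908) and returns the remainder, 0 here, so the extent found is
 * stripe-aligned and may be used directly.
 */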
2325 if (do_div(start, sbi->s_stripe) == 0) {
2326 ac->ac_found++;
2327 ac->ac_b_ex = ex;
2328 ext4_mb_use_best_found(ac, e4b);
2329 }
2330 } else if (max >= ac->ac_g_ex.fe_len) {
2331 BUG_ON(ex.fe_len <= 0);
2332 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2333 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2334 ac->ac_found++;
2335 ac->ac_b_ex = ex;
2336 ext4_mb_use_best_found(ac, e4b);
2337 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
2338 /* Sometimes, the caller may want to merge even a small
2339 * number of blocks to an existing extent */
2340 BUG_ON(ex.fe_len <= 0);
2341 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
2342 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
2343 ac->ac_found++;
2344 ac->ac_b_ex = ex;
2345 ext4_mb_use_best_found(ac, e4b);
2346 }
2347 ext4_unlock_group(ac->ac_sb, group);
2348 ext4_mb_unload_buddy(e4b);
2349
2350 return 0;
2351 }
2352
2353 /*
2354 * The routine scans buddy structures (not the bitmap!) from the given order
2355 * up to the max order and tries to find a big enough chunk to satisfy the request
2356 */
2357 static noinline_for_stack
2358 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
2359 struct ext4_buddy *e4b)
2360 {
2361 struct super_block *sb = ac->ac_sb;
2362 struct ext4_group_info *grp = e4b->bd_info;
2363 void *buddy;
2364 int i;
2365 int k;
2366 int max;
2367
2368 BUG_ON(ac->ac_2order <= 0);
2369 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
2370 if (grp->bb_counters[i] == 0)
2371 continue;
2372
2373 buddy = mb_find_buddy(e4b, i, &max);
2374 if (WARN_RATELIMIT(buddy == NULL,
2375 "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
2376 continue;
2377
2378 k = mb_find_next_zero_bit(buddy, max, 0);
2379 if (k >= max) {
2380 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
2381 "%d free clusters of order %d. But found 0",
2382 grp->bb_counters[i], i);
2383 ext4_mark_group_bitmap_corrupted(ac->ac_sb,
2384 e4b->bd_group,
2385 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2386 break;
2387 }
2388 ac->ac_found++;
2389 ac->ac_cX_found[ac->ac_criteria]++;
2390
2391 ac->ac_b_ex.fe_len = 1 << i;
2392 ac->ac_b_ex.fe_start = k << i;
2393 ac->ac_b_ex.fe_group = e4b->bd_group;
2394
2395 ext4_mb_use_best_found(ac, e4b);
2396
2397 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
2398
2399 if (EXT4_SB(sb)->s_mb_stats)
2400 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2401
2402 break;
2403 }
2404 }
2405
2406 /*
2407 * The routine scans the group and measures all found extents.
2408 * In order to optimize scanning, the caller must pass the number of
2409 * free blocks in the group, so the routine can know the upper limit.
2410 */
2411 static noinline_for_stack
2412 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
2413 struct ext4_buddy *e4b)
2414 {
2415 struct super_block *sb = ac->ac_sb;
2416 void *bitmap = e4b->bd_bitmap;
2417 struct ext4_free_extent ex;
2418 int i, j, freelen;
2419 int free;
2420
2421 free = e4b->bd_info->bb_free;
2422 if (WARN_ON(free <= 0))
2423 return;
2424
2425 i = e4b->bd_info->bb_first_free;
2426
2427 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
2428 i = mb_find_next_zero_bit(bitmap,
2429 EXT4_CLUSTERS_PER_GROUP(sb), i);
2430 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
2431 /*
2432 * If we have a corrupt bitmap, we won't find any
2433 * free blocks even though the group info says we
2434 * have free blocks
2435 */
2436 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2437 "%d free clusters as per "
2438 "group info.
But bitmap says 0",
2439 free);
2440 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2441 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2442 break;
2443 }
2444
2445 if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
2446 /*
2447 * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
2448 * sure that this group will have a large enough
2449 * contiguous free extent, so skip over the smaller free
2450 * extents
2451 */
2452 j = mb_find_next_bit(bitmap,
2453 EXT4_CLUSTERS_PER_GROUP(sb), i);
2454 freelen = j - i;
2455
2456 if (freelen < ac->ac_g_ex.fe_len) {
2457 i = j;
2458 free -= freelen;
2459 continue;
2460 }
2461 }
2462
2463 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
2464 if (WARN_ON(ex.fe_len <= 0))
2465 break;
2466 if (free < ex.fe_len) {
2467 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2468 "%d free clusters as per "
2469 "group info. But got %d blocks",
2470 free, ex.fe_len);
2471 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2472 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2473 /*
2474 * The number of free blocks differs. This mostly
2475 * indicates that the bitmap is corrupt. So exit
2476 * without claiming the space.
2477 */
2478 break;
2479 }
2480 ex.fe_logical = 0xDEADC0DE; /* debug value */
2481 ext4_mb_measure_extent(ac, &ex, e4b);
2482
2483 i += ex.fe_len;
2484 free -= ex.fe_len;
2485 }
2486
2487 ext4_mb_check_limits(ac, e4b, 1);
2488 }
2489
2490 /*
2491 * This is a special case for storage like raid5
2492 * we try to find stripe-aligned chunks for stripe-size-multiple requests
2493 */
2494 static noinline_for_stack
2495 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2496 struct ext4_buddy *e4b)
2497 {
2498 struct super_block *sb = ac->ac_sb;
2499 struct ext4_sb_info *sbi = EXT4_SB(sb);
2500 void *bitmap = e4b->bd_bitmap;
2501 struct ext4_free_extent ex;
2502 ext4_fsblk_t first_group_block;
2503 ext4_fsblk_t a;
2504 ext4_grpblk_t i, stripe;
2505 int max;
2506
2507 BUG_ON(sbi->s_stripe == 0);
2508
2509 /* find the first stripe-aligned block in the group */
2510 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2511
2512 a = first_group_block + sbi->s_stripe - 1;
2513 do_div(a, sbi->s_stripe);
2514 i = (a * sbi->s_stripe) - first_group_block;
2515
2516 stripe = EXT4_B2C(sbi, sbi->s_stripe);
2517 i = EXT4_B2C(sbi, i);
2518 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2519 if (!mb_test_bit(i, bitmap)) {
2520 max = mb_find_extent(e4b, i, stripe, &ex);
2521 if (max >= stripe) {
2522 ac->ac_found++;
2523 ac->ac_cX_found[ac->ac_criteria]++;
2524 ex.fe_logical = 0xDEADF00D; /* debug value */
2525 ac->ac_b_ex = ex;
2526 ext4_mb_use_best_found(ac, e4b);
2527 break;
2528 }
2529 }
2530 i += stripe;
2531 }
2532 }
2533
2534 /*
2535 * This is also called BEFORE we load the buddy bitmap.
2536 * Returns true if the group is suitable for the allocation,
2537 * false otherwise.
2538 */
2539 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
2540 ext4_group_t group, enum criteria cr)
2541 {
2542 ext4_grpblk_t free, fragments;
2543 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2544 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2545
2546 BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
2547
2548 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2549 return false;
2550
2551 free = grp->bb_free;
2552 if (free == 0)
2553 return false;
2554
2555 fragments = grp->bb_fragments;
2556 if (fragments == 0)
2557 return false;
2558
2559 switch (cr) {
2560 case CR_POWER2_ALIGNED:
2561 BUG_ON(ac->ac_2order == 0);
2562
2563 /* Avoid using the first bg of a flexgroup for data files */
2564 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2565 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2566 ((group % flex_size) == 0))
2567 return false;
2568
2569 if (free < ac->ac_g_ex.fe_len)
2570 return false;
2571
2572 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
2573 return true;
2574
2575 if (grp->bb_largest_free_order < ac->ac_2order)
2576 return false;
2577
2578 return true;
2579 case CR_GOAL_LEN_FAST:
2580 case CR_BEST_AVAIL_LEN:
2581 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2582 return true;
2583 break;
2584 case CR_GOAL_LEN_SLOW:
2585 if (free >= ac->ac_g_ex.fe_len)
2586 return true;
2587 break;
2588 case CR_ANY_FREE:
2589 return true;
2590 default:
2591 BUG();
2592 }
2593
2594 return false;
2595 }
2596
2597 /*
2598 * This could return a negative error code if something goes wrong
2599 * during ext4_mb_init_group(). This should not be called with
2600 * ext4_lock_group() held.
2601 *
2602 * Note: because we are conditionally operating with the group lock in
2603 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
2604 * function using __acquire and __release. This means we need to be
2605 * super careful before messing with the error path handling via "goto
2606 * out"!
2607 */
2608 static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2609 ext4_group_t group, enum criteria cr)
2610 {
2611 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2612 struct super_block *sb = ac->ac_sb;
2613 struct ext4_sb_info *sbi = EXT4_SB(sb);
2614 bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2615 ext4_grpblk_t free;
2616 int ret = 0;
2617
2618 if (!grp)
2619 return -EFSCORRUPTED;
2620 if (sbi->s_mb_stats)
2621 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2622 if (should_lock) {
2623 ext4_lock_group(sb, group);
2624 __release(ext4_group_lock_ptr(sb, group));
2625 }
2626 free = grp->bb_free;
2627 if (free == 0)
2628 goto out;
2629 /*
2630 * In all criteria except CR_ANY_FREE we try to avoid groups that
2631 * can't possibly satisfy the full goal request due to insufficient
2632 * free blocks.
2633 */
2634 if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
2635 goto out;
2636 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2637 goto out;
2638 if (should_lock) {
2639 __acquire(ext4_group_lock_ptr(sb, group));
2640 ext4_unlock_group(sb, group);
2641 }
2642
2643 /* We only do this if the grp has never been initialized */
2644 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2645 struct ext4_group_desc *gdp =
2646 ext4_get_group_desc(sb, group, NULL);
2647 int ret;
2648
2649 /*
2650 * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
2651 * search to find large good chunks almost for free. If buddy
2652 * data is not ready, then this optimization makes no sense.
But
2653 * we never skip the first block group in a flex_bg, since this
2654 * gets used for metadata block allocation, and we want to make
2655 * sure we locate metadata blocks in the first block group in
2656 * the flex_bg if possible.
2657 */
2658 if (!ext4_mb_cr_expensive(cr) &&
2659 (!sbi->s_log_groups_per_flex ||
2660 ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2661 !(ext4_has_group_desc_csum(sb) &&
2662 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2663 return 0;
2664 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2665 if (ret)
2666 return ret;
2667 }
2668
2669 if (should_lock) {
2670 ext4_lock_group(sb, group);
2671 __release(ext4_group_lock_ptr(sb, group));
2672 }
2673 ret = ext4_mb_good_group(ac, group, cr);
2674 out:
2675 if (should_lock) {
2676 __acquire(ext4_group_lock_ptr(sb, group));
2677 ext4_unlock_group(sb, group);
2678 }
2679 return ret;
2680 }
2681
2682 /*
2683 * Start prefetching @nr block bitmaps starting at @group.
2684 * Return the next group which needs to be prefetched.
2685 */
2686 ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2687 unsigned int nr, int *cnt)
2688 {
2689 ext4_group_t ngroups = ext4_get_groups_count(sb);
2690 struct buffer_head *bh;
2691 struct blk_plug plug;
2692
2693 blk_start_plug(&plug);
2694 while (nr-- > 0) {
2695 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2696 NULL);
2697 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2698
2699 /*
2700 * Prefetch block groups with free blocks; but don't
2701 * bother if it is marked uninitialized on disk, since
2702 * it won't require I/O to read. Also only try to
2703 * prefetch once, so we avoid the getblk() call, which
2704 * can be expensive.
2705 */
2706 if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2707 EXT4_MB_GRP_NEED_INIT(grp) &&
2708 ext4_free_group_clusters(sb, gdp) > 0) {
2709 bh = ext4_read_block_bitmap_nowait(sb, group, true);
2710 if (bh && !IS_ERR(bh)) {
2711 if (!buffer_uptodate(bh) && cnt)
2712 (*cnt)++;
2713 brelse(bh);
2714 }
2715 }
2716 if (++group >= ngroups)
2717 group = 0;
2718 }
2719 blk_finish_plug(&plug);
2720 return group;
2721 }
2722
2723 /*
2724 * Prefetching reads the block bitmap into the buffer cache; but we
2725 * need to make sure that the buddy bitmap in the page cache has been
2726 * initialized. Note that ext4_mb_init_group() will block if the I/O
2727 * is not yet completed, or indeed if the I/O was never initiated
2728 * because ext4_mb_prefetch did not start it.
2729 *
2730 * TODO: We should actually kick off the buddy bitmap setup in a work
2731 * queue when the buffer I/O is completed, so that we don't block
2732 * waiting for the block allocation bitmap read to finish when
2733 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
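 *
 * Usage sketch (based on ext4_mb_regular_allocator() below): the
 * allocator calls ext4_mb_prefetch(sb, group, nr, &prefetch_ios),
 * remembers the returned next group, and before returning calls
 * ext4_mb_prefetch_fini(sb, next_group, nr), which walks the same nr
 * groups backwards and initializes their buddy data.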
2734 */ 2735 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, 2736 unsigned int nr) 2737 { 2738 struct ext4_group_desc *gdp; 2739 struct ext4_group_info *grp; 2740 2741 while (nr-- > 0) { 2742 if (!group) 2743 group = ext4_get_groups_count(sb); 2744 group--; 2745 gdp = ext4_get_group_desc(sb, group, NULL); 2746 grp = ext4_get_group_info(sb, group); 2747 2748 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && 2749 ext4_free_group_clusters(sb, gdp) > 0) { 2750 if (ext4_mb_init_group(sb, group, GFP_NOFS)) 2751 break; 2752 } 2753 } 2754 } 2755 2756 static noinline_for_stack int 2757 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2758 { 2759 ext4_group_t prefetch_grp = 0, ngroups, group, i; 2760 enum criteria new_cr, cr = CR_GOAL_LEN_FAST; 2761 int err = 0, first_err = 0; 2762 unsigned int nr = 0, prefetch_ios = 0; 2763 struct ext4_sb_info *sbi; 2764 struct super_block *sb; 2765 struct ext4_buddy e4b; 2766 int lost; 2767 2768 sb = ac->ac_sb; 2769 sbi = EXT4_SB(sb); 2770 ngroups = ext4_get_groups_count(sb); 2771 /* non-extent files are limited to low blocks/groups */ 2772 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 2773 ngroups = sbi->s_blockfile_groups; 2774 2775 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2776 2777 /* first, try the goal */ 2778 err = ext4_mb_find_by_goal(ac, &e4b); 2779 if (err || ac->ac_status == AC_STATUS_FOUND) 2780 goto out; 2781 2782 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2783 goto out; 2784 2785 /* 2786 * ac->ac_2order is set only if the fe_len is a power of 2 2787 * if ac->ac_2order is set we also set criteria to 0 so that we 2788 * try exact allocation using buddy. 2789 */ 2790 i = fls(ac->ac_g_ex.fe_len); 2791 ac->ac_2order = 0; 2792 /* 2793 * We search using buddy data only if the order of the request 2794 * is greater than equal to the sbi_s_mb_order2_reqs 2795 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 2796 * We also support searching for power-of-two requests only for 2797 * requests upto maximum buddy size we have constructed. 2798 */ 2799 if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { 2800 if (is_power_of_2(ac->ac_g_ex.fe_len)) 2801 ac->ac_2order = array_index_nospec(i - 1, 2802 MB_NUM_ORDERS(sb)); 2803 } 2804 2805 /* if stream allocation is enabled, use global goal */ 2806 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2807 /* TBD: may be hot point */ 2808 spin_lock(&sbi->s_md_lock); 2809 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2810 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2811 spin_unlock(&sbi->s_md_lock); 2812 } 2813 2814 /* 2815 * Let's just scan groups to find more-less suitable blocks We 2816 * start with CR_GOAL_LEN_FAST, unless it is power of 2 2817 * aligned, in which case let's do that faster approach first. 
2818 */ 2819 if (ac->ac_2order) 2820 cr = CR_POWER2_ALIGNED; 2821 repeat: 2822 for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2823 ac->ac_criteria = cr; 2824 /* 2825 * searching for the right group start 2826 * from the goal value specified 2827 */ 2828 group = ac->ac_g_ex.fe_group; 2829 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2830 prefetch_grp = group; 2831 2832 for (i = 0, new_cr = cr; i < ngroups; i++, 2833 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 2834 int ret = 0; 2835 2836 cond_resched(); 2837 if (new_cr != cr) { 2838 cr = new_cr; 2839 goto repeat; 2840 } 2841 2842 /* 2843 * Batch reads of the block allocation bitmaps 2844 * to get multiple READs in flight; limit 2845 * prefetching at cr=0/1, otherwise mballoc can 2846 * spend a lot of time loading imperfect groups 2847 */ 2848 if ((prefetch_grp == group) && 2849 (ext4_mb_cr_expensive(cr) || 2850 prefetch_ios < sbi->s_mb_prefetch_limit)) { 2851 nr = sbi->s_mb_prefetch; 2852 if (ext4_has_feature_flex_bg(sb)) { 2853 nr = 1 << sbi->s_log_groups_per_flex; 2854 nr -= group & (nr - 1); 2855 nr = min(nr, sbi->s_mb_prefetch); 2856 } 2857 prefetch_grp = ext4_mb_prefetch(sb, group, 2858 nr, &prefetch_ios); 2859 } 2860 2861 /* This now checks without needing the buddy page */ 2862 ret = ext4_mb_good_group_nolock(ac, group, cr); 2863 if (ret <= 0) { 2864 if (!first_err) 2865 first_err = ret; 2866 continue; 2867 } 2868 2869 err = ext4_mb_load_buddy(sb, group, &e4b); 2870 if (err) 2871 goto out; 2872 2873 ext4_lock_group(sb, group); 2874 2875 /* 2876 * We need to check again after locking the 2877 * block group 2878 */ 2879 ret = ext4_mb_good_group(ac, group, cr); 2880 if (ret == 0) { 2881 ext4_unlock_group(sb, group); 2882 ext4_mb_unload_buddy(&e4b); 2883 continue; 2884 } 2885 2886 ac->ac_groups_scanned++; 2887 if (cr == CR_POWER2_ALIGNED) 2888 ext4_mb_simple_scan_group(ac, &e4b); 2889 else if ((cr == CR_GOAL_LEN_FAST || 2890 cr == CR_BEST_AVAIL_LEN) && 2891 sbi->s_stripe && 2892 !(ac->ac_g_ex.fe_len % 2893 EXT4_B2C(sbi, sbi->s_stripe))) 2894 ext4_mb_scan_aligned(ac, &e4b); 2895 else 2896 ext4_mb_complex_scan_group(ac, &e4b); 2897 2898 ext4_unlock_group(sb, group); 2899 ext4_mb_unload_buddy(&e4b); 2900 2901 if (ac->ac_status != AC_STATUS_CONTINUE) 2902 break; 2903 } 2904 /* Processed all groups and haven't found blocks */ 2905 if (sbi->s_mb_stats && i == ngroups) 2906 atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2907 2908 if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) 2909 /* Reset goal length to original goal length before 2910 * falling into CR_GOAL_LEN_SLOW */ 2911 ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; 2912 } 2913 2914 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2915 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2916 /* 2917 * We've been searching too long. Let's try to allocate 2918 * the best chunk we've found so far 2919 */ 2920 ext4_mb_try_best_found(ac, &e4b); 2921 if (ac->ac_status != AC_STATUS_FOUND) { 2922 /* 2923 * Someone more lucky has already allocated it. 
* The only thing we can do is just take the first
2925 * found block(s)
2926 */
2927 lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2928 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2929 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2930 ac->ac_b_ex.fe_len, lost);
2931
2932 ac->ac_b_ex.fe_group = 0;
2933 ac->ac_b_ex.fe_start = 0;
2934 ac->ac_b_ex.fe_len = 0;
2935 ac->ac_status = AC_STATUS_CONTINUE;
2936 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2937 cr = CR_ANY_FREE;
2938 goto repeat;
2939 }
2940 }
2941
2942 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2943 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
2944 out:
2945 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2946 err = first_err;
2947
2948 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2949 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2950 ac->ac_flags, cr, err);
2951
2952 if (nr)
2953 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2954
2955 return err;
2956 }
2957
2958 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2959 {
2960 struct super_block *sb = pde_data(file_inode(seq->file));
2961 ext4_group_t group;
2962
2963 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2964 return NULL;
2965 group = *pos + 1;
2966 return (void *) ((unsigned long) group);
2967 }
2968
2969 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2970 {
2971 struct super_block *sb = pde_data(file_inode(seq->file));
2972 ext4_group_t group;
2973
2974 ++*pos;
2975 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2976 return NULL;
2977 group = *pos + 1;
2978 return (void *) ((unsigned long) group);
2979 }
2980
2981 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2982 {
2983 struct super_block *sb = pde_data(file_inode(seq->file));
2984 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2985 int i;
2986 int err, buddy_loaded = 0;
2987 struct ext4_buddy e4b;
2988 struct ext4_group_info *grinfo;
2989 unsigned char blocksize_bits = min_t(unsigned char,
2990 sb->s_blocksize_bits,
2991 EXT4_MAX_BLOCK_LOG_SIZE);
2992 struct sg {
2993 struct ext4_group_info info;
2994 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2995 } sg;
2996
2997 group--;
2998 if (group == 0)
2999 seq_puts(seq, "#group: free  frags first ["
3000 " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
3001 " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
3002
3003 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
3004 sizeof(struct ext4_group_info);
3005
3006 grinfo = ext4_get_group_info(sb, group);
3007 if (!grinfo)
3008 return 0;
3009 /* Load the group info in memory only if not already loaded. */
3010 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
3011 err = ext4_mb_load_buddy(sb, group, &e4b);
3012 if (err) {
3013 seq_printf(seq, "#%-5u: I/O error\n", group);
3014 return 0;
3015 }
3016 buddy_loaded = 1;
3017 }
3018
3019 memcpy(&sg, grinfo, i);
3020
3021 if (buddy_loaded)
3022 ext4_mb_unload_buddy(&e4b);
3023
3024 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
3025 sg.info.bb_fragments, sg.info.bb_first_free);
3026 for (i = 0; i <= 13; i++)
3027 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
3028 sg.info.bb_counters[i] : 0); 3029 seq_puts(seq, " ]\n"); 3030 3031 return 0; 3032 } 3033 3034 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 3035 { 3036 } 3037 3038 const struct seq_operations ext4_mb_seq_groups_ops = { 3039 .start = ext4_mb_seq_groups_start, 3040 .next = ext4_mb_seq_groups_next, 3041 .stop = ext4_mb_seq_groups_stop, 3042 .show = ext4_mb_seq_groups_show, 3043 }; 3044 3045 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 3046 { 3047 struct super_block *sb = seq->private; 3048 struct ext4_sb_info *sbi = EXT4_SB(sb); 3049 3050 seq_puts(seq, "mballoc:\n"); 3051 if (!sbi->s_mb_stats) { 3052 seq_puts(seq, "\tmb stats collection turned off.\n"); 3053 seq_puts( 3054 seq, 3055 "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 3056 return 0; 3057 } 3058 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 3059 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 3060 3061 seq_printf(seq, "\tgroups_scanned: %u\n", 3062 atomic_read(&sbi->s_bal_groups_scanned)); 3063 3064 /* CR_POWER2_ALIGNED stats */ 3065 seq_puts(seq, "\tcr_p2_aligned_stats:\n"); 3066 seq_printf(seq, "\t\thits: %llu\n", 3067 atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); 3068 seq_printf( 3069 seq, "\t\tgroups_considered: %llu\n", 3070 atomic64_read( 3071 &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); 3072 seq_printf(seq, "\t\textents_scanned: %u\n", 3073 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); 3074 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3075 atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); 3076 seq_printf(seq, "\t\tbad_suggestions: %u\n", 3077 atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); 3078 3079 /* CR_GOAL_LEN_FAST stats */ 3080 seq_puts(seq, "\tcr_goal_fast_stats:\n"); 3081 seq_printf(seq, "\t\thits: %llu\n", 3082 atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); 3083 seq_printf(seq, "\t\tgroups_considered: %llu\n", 3084 atomic64_read( 3085 &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); 3086 seq_printf(seq, "\t\textents_scanned: %u\n", 3087 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); 3088 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3089 atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); 3090 seq_printf(seq, "\t\tbad_suggestions: %u\n", 3091 atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); 3092 3093 /* CR_BEST_AVAIL_LEN stats */ 3094 seq_puts(seq, "\tcr_best_avail_stats:\n"); 3095 seq_printf(seq, "\t\thits: %llu\n", 3096 atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); 3097 seq_printf( 3098 seq, "\t\tgroups_considered: %llu\n", 3099 atomic64_read( 3100 &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); 3101 seq_printf(seq, "\t\textents_scanned: %u\n", 3102 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); 3103 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3104 atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); 3105 seq_printf(seq, "\t\tbad_suggestions: %u\n", 3106 atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); 3107 3108 /* CR_GOAL_LEN_SLOW stats */ 3109 seq_puts(seq, "\tcr_goal_slow_stats:\n"); 3110 seq_printf(seq, "\t\thits: %llu\n", 3111 atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); 3112 seq_printf(seq, "\t\tgroups_considered: %llu\n", 3113 atomic64_read( 3114 &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); 3115 seq_printf(seq, "\t\textents_scanned: %u\n", 3116 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); 3117 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3118 
atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); 3119 3120 /* CR_ANY_FREE stats */ 3121 seq_puts(seq, "\tcr_any_free_stats:\n"); 3122 seq_printf(seq, "\t\thits: %llu\n", 3123 atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); 3124 seq_printf( 3125 seq, "\t\tgroups_considered: %llu\n", 3126 atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); 3127 seq_printf(seq, "\t\textents_scanned: %u\n", 3128 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); 3129 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3130 atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); 3131 3132 /* Aggregates */ 3133 seq_printf(seq, "\textents_scanned: %u\n", 3134 atomic_read(&sbi->s_bal_ex_scanned)); 3135 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 3136 seq_printf(seq, "\t\tlen_goal_hits: %u\n", 3137 atomic_read(&sbi->s_bal_len_goals)); 3138 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 3139 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 3140 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 3141 seq_printf(seq, "\tbuddies_generated: %u/%u\n", 3142 atomic_read(&sbi->s_mb_buddies_generated), 3143 ext4_get_groups_count(sb)); 3144 seq_printf(seq, "\tbuddies_time_used: %llu\n", 3145 atomic64_read(&sbi->s_mb_generation_time)); 3146 seq_printf(seq, "\tpreallocated: %u\n", 3147 atomic_read(&sbi->s_mb_preallocated)); 3148 seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); 3149 return 0; 3150 } 3151 3152 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 3153 __acquires(&EXT4_SB(sb)->s_mb_rb_lock) 3154 { 3155 struct super_block *sb = pde_data(file_inode(seq->file)); 3156 unsigned long position; 3157 3158 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 3159 return NULL; 3160 position = *pos + 1; 3161 return (void *) ((unsigned long) position); 3162 } 3163 3164 static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 3165 { 3166 struct super_block *sb = pde_data(file_inode(seq->file)); 3167 unsigned long position; 3168 3169 ++*pos; 3170 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 3171 return NULL; 3172 position = *pos + 1; 3173 return (void *) ((unsigned long) position); 3174 } 3175 3176 static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 3177 { 3178 struct super_block *sb = pde_data(file_inode(seq->file)); 3179 struct ext4_sb_info *sbi = EXT4_SB(sb); 3180 unsigned long position = ((unsigned long) v); 3181 struct ext4_group_info *grp; 3182 unsigned int count; 3183 3184 position--; 3185 if (position >= MB_NUM_ORDERS(sb)) { 3186 position -= MB_NUM_ORDERS(sb); 3187 if (position == 0) 3188 seq_puts(seq, "avg_fragment_size_lists:\n"); 3189 3190 count = 0; 3191 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 3192 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 3193 bb_avg_fragment_size_node) 3194 count++; 3195 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 3196 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3197 (unsigned int)position, count); 3198 return 0; 3199 } 3200 3201 if (position == 0) { 3202 seq_printf(seq, "optimize_scan: %d\n", 3203 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 
1 : 0);
3204 seq_puts(seq, "max_free_order_lists:\n");
3205 }
3206 count = 0;
3207 read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
3208 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
3209 bb_largest_free_order_node)
3210 count++;
3211 read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
3212 seq_printf(seq, "\tlist_order_%u_groups: %u\n",
3213 (unsigned int)position, count);
3214
3215 return 0;
3216 }
3217
3218 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
3219 {
3220 }
3221
3222 const struct seq_operations ext4_mb_seq_structs_summary_ops = {
3223 .start = ext4_mb_seq_structs_summary_start,
3224 .next = ext4_mb_seq_structs_summary_next,
3225 .stop = ext4_mb_seq_structs_summary_stop,
3226 .show = ext4_mb_seq_structs_summary_show,
3227 };
3228
3229 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
3230 {
3231 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3232 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
3233
3234 BUG_ON(!cachep);
3235 return cachep;
3236 }
3237
3238 /*
3239 * Allocate the top-level s_group_info array for the specified number
3240 * of groups
3241 */
3242 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
3243 {
3244 struct ext4_sb_info *sbi = EXT4_SB(sb);
3245 unsigned size;
3246 struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
3247
3248 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
3249 EXT4_DESC_PER_BLOCK_BITS(sb);
3250 if (size <= sbi->s_group_info_size)
3251 return 0;
3252
3253 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
3254 new_groupinfo = kvzalloc(size, GFP_KERNEL);
3255 if (!new_groupinfo) {
3256 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
3257 return -ENOMEM;
3258 }
3259 rcu_read_lock();
3260 old_groupinfo = rcu_dereference(sbi->s_group_info);
3261 if (old_groupinfo)
3262 memcpy(new_groupinfo, old_groupinfo,
3263 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
3264 rcu_read_unlock();
3265 rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
3266 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
3267 if (old_groupinfo)
3268 ext4_kvfree_array_rcu(old_groupinfo);
3269 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
3270 sbi->s_group_info_size);
3271 return 0;
3272 }
3273
3274 /* Create and initialize ext4_group_info data for the given group. */
3275 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
3276 struct ext4_group_desc *desc)
3277 {
3278 int i;
3279 int metalen = 0;
3280 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
3281 struct ext4_sb_info *sbi = EXT4_SB(sb);
3282 struct ext4_group_info **meta_group_info;
3283 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3284
3285 /*
3286 * First check if this group is the first of a group descriptor block.
3287 * If it is, we have to allocate a new table of pointers
3288 * to ext4_group_info structures.
3289 */
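/*
 * Illustrative example (assuming 4k blocks and 32-byte group
 * descriptors, so EXT4_DESC_PER_BLOCK(sb) == 128): group 300 lands in
 * meta-group idx = 300 >> 7 = 2, at slot i = 300 & 127 = 44.
 */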
3290 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3291 metalen = sizeof(*meta_group_info) <<
3292 EXT4_DESC_PER_BLOCK_BITS(sb);
3293 meta_group_info = kmalloc(metalen, GFP_NOFS);
3294 if (meta_group_info == NULL) {
3295 ext4_msg(sb, KERN_ERR, "can't allocate mem "
3296 "for a buddy group");
3297 return -ENOMEM;
3298 }
3299 rcu_read_lock();
3300 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
3301 rcu_read_unlock();
3302 }
3303
3304 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
3305 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
3306
3307 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
3308 if (meta_group_info[i] == NULL) {
3309 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
3310 goto exit_group_info;
3311 }
3312 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
3313 &(meta_group_info[i]->bb_state));
3314
3315 /*
3316 * initialize bb_free to be able to skip
3317 * empty groups without initialization
3318 */
3319 if (ext4_has_group_desc_csum(sb) &&
3320 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3321 meta_group_info[i]->bb_free =
3322 ext4_free_clusters_after_init(sb, group, desc);
3323 } else {
3324 meta_group_info[i]->bb_free =
3325 ext4_free_group_clusters(sb, desc);
3326 }
3327
3328 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
3329 init_rwsem(&meta_group_info[i]->alloc_sem);
3330 meta_group_info[i]->bb_free_root = RB_ROOT;
3331 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
3332 INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
3333 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
3334 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
3335 meta_group_info[i]->bb_group = group;
3336
3337 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
3338 return 0;
3339
3340 exit_group_info:
3341 /* If a meta_group_info table has been allocated, release it now */
3342 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
3343 struct ext4_group_info ***group_info;
3344
3345 rcu_read_lock();
3346 group_info = rcu_dereference(sbi->s_group_info);
3347 kfree(group_info[idx]);
3348 group_info[idx] = NULL;
3349 rcu_read_unlock();
3350 }
3351 return -ENOMEM;
3352 } /* ext4_mb_add_groupinfo */
3353
3354 static int ext4_mb_init_backend(struct super_block *sb)
3355 {
3356 ext4_group_t ngroups = ext4_get_groups_count(sb);
3357 ext4_group_t i;
3358 struct ext4_sb_info *sbi = EXT4_SB(sb);
3359 int err;
3360 struct ext4_group_desc *desc;
3361 struct ext4_group_info ***group_info;
3362 struct kmem_cache *cachep;
3363
3364 err = ext4_mb_alloc_groupinfo(sb, ngroups);
3365 if (err)
3366 return err;
3367
3368 sbi->s_buddy_cache = new_inode(sb);
3369 if (sbi->s_buddy_cache == NULL) {
3370 ext4_msg(sb, KERN_ERR, "can't get new inode");
3371 goto err_freesgi;
3372 }
3373 /* To avoid potentially colliding with a valid on-disk inode number,
3374 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
3375 * not in the inode hash, so it should never be found by iget(), but
3376 * this will avoid confusion if it ever shows up during debugging.
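 * (For reference: EXT4_BAD_INO is inode number 1, the reserved
 * bad-blocks inode, so it cannot collide with any inode number the
 * filesystem hands out for regular files.)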
*/
3377 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
3378 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
3379 for (i = 0; i < ngroups; i++) {
3380 cond_resched();
3381 desc = ext4_get_group_desc(sb, i, NULL);
3382 if (desc == NULL) {
3383 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
3384 goto err_freebuddy;
3385 }
3386 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
3387 goto err_freebuddy;
3388 }
3389
3390 if (ext4_has_feature_flex_bg(sb)) {
3391 /* a single flex group is supposed to be read by a single IO.
3392 * 2 ^ s_log_groups_per_flex must not exceed UINT_MAX, as
3393 * s_mb_prefetch is an unsigned integer, so the maximum shift is 32.
3394 */
3395 if (sbi->s_es->s_log_groups_per_flex >= 32) {
3396 ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
3397 goto err_freebuddy;
3398 }
3399 sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
3400 BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
3401 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
3402 } else {
3403 sbi->s_mb_prefetch = 32;
3404 }
3405 if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
3406 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
3407 /* How many real IOs to prefetch within a single allocation at cr=0.
3408 * Given that cr=0 is a CPU-related optimization we shouldn't try to
3409 * load too many groups; at some point we should start to use what
3410 * we've got in memory.
3411 * With an average random access time of 5ms, it'd take a second to get
3412 * 200 groups (* N with flex_bg), so let's make this limit 4
3413 */
3414 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
3415 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
3416 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
3417
3418 return 0;
3419
3420 err_freebuddy:
3421 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3422 while (i-- > 0) {
3423 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3424
3425 if (grp)
3426 kmem_cache_free(cachep, grp);
3427 }
3428 i = sbi->s_group_info_size;
3429 rcu_read_lock();
3430 group_info = rcu_dereference(sbi->s_group_info);
3431 while (i-- > 0)
3432 kfree(group_info[i]);
3433 rcu_read_unlock();
3434 iput(sbi->s_buddy_cache);
3435 err_freesgi:
3436 rcu_read_lock();
3437 kvfree(rcu_dereference(sbi->s_group_info));
3438 rcu_read_unlock();
3439 return -ENOMEM;
3440 }
3441
3442 static void ext4_groupinfo_destroy_slabs(void)
3443 {
3444 int i;
3445
3446 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
3447 kmem_cache_destroy(ext4_groupinfo_caches[i]);
3448 ext4_groupinfo_caches[i] = NULL;
3449 }
3450 }
3451
3452 static int ext4_groupinfo_create_slab(size_t size)
3453 {
3454 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
3455 int slab_size;
3456 int blocksize_bits = order_base_2(size);
3457 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
3458 struct kmem_cache *cachep;
3459
3460 if (cache_index >= NR_GRPINFO_CACHES)
3461 return -EINVAL;
3462
3463 if (unlikely(cache_index < 0))
3464 cache_index = 0;
3465
3466 mutex_lock(&ext4_grpinfo_slab_create_mutex);
3467 if (ext4_groupinfo_caches[cache_index]) {
3468 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3469 return 0; /* Already created */
3470 }
3471
3472 slab_size = offsetof(struct ext4_group_info,
3473 bb_counters[blocksize_bits + 2]);
3474
3475 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
3476 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
3477 NULL);
3478
3479 ext4_groupinfo_caches[cache_index] = cachep;
3480
3481 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
3482 if (!cachep) {
3483
printk(KERN_EMERG
3484 "EXT4-fs: no memory for groupinfo slab cache\n");
3485 return -ENOMEM;
3486 }
3487
3488 return 0;
3489 }
3490
3491 static void ext4_discard_work(struct work_struct *work)
3492 {
3493 struct ext4_sb_info *sbi = container_of(work,
3494 struct ext4_sb_info, s_discard_work);
3495 struct super_block *sb = sbi->s_sb;
3496 struct ext4_free_data *fd, *nfd;
3497 struct ext4_buddy e4b;
3498 struct list_head discard_list;
3499 ext4_group_t grp, load_grp;
3500 int err = 0;
3501
3502 INIT_LIST_HEAD(&discard_list);
3503 spin_lock(&sbi->s_md_lock);
3504 list_splice_init(&sbi->s_discard_list, &discard_list);
3505 spin_unlock(&sbi->s_md_lock);
3506
3507 load_grp = UINT_MAX;
3508 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
3509 /*
3510 * If the filesystem is unmounting, we ran out of memory, or we
3511 * are suffering from lack of space, give up on this discard
3512 */
3513 if ((sb->s_flags & SB_ACTIVE) && !err &&
3514 !atomic_read(&sbi->s_retry_alloc_pending)) {
3515 grp = fd->efd_group;
3516 if (grp != load_grp) {
3517 if (load_grp != UINT_MAX)
3518 ext4_mb_unload_buddy(&e4b);
3519
3520 err = ext4_mb_load_buddy(sb, grp, &e4b);
3521 if (err) {
3522 kmem_cache_free(ext4_free_data_cachep, fd);
3523 load_grp = UINT_MAX;
3524 continue;
3525 } else {
3526 load_grp = grp;
3527 }
3528 }
3529
3530 ext4_lock_group(sb, grp);
3531 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
3532 fd->efd_start_cluster + fd->efd_count - 1, 1);
3533 ext4_unlock_group(sb, grp);
3534 }
3535 kmem_cache_free(ext4_free_data_cachep, fd);
3536 }
3537
3538 if (load_grp != UINT_MAX)
3539 ext4_mb_unload_buddy(&e4b);
3540 }
3541
3542 int ext4_mb_init(struct super_block *sb)
3543 {
3544 struct ext4_sb_info *sbi = EXT4_SB(sb);
3545 unsigned i, j;
3546 unsigned offset, offset_incr;
3547 unsigned max;
3548 int ret;
3549
3550 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
3551
3552 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
3553 if (sbi->s_mb_offsets == NULL) {
3554 ret = -ENOMEM;
3555 goto out;
3556 }
3557
3558 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
3559 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
3560 if (sbi->s_mb_maxs == NULL) {
3561 ret = -ENOMEM;
3562 goto out;
3563 }
3564
3565 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
3566 if (ret < 0)
3567 goto out;
3568
3569 /* order 0 is regular bitmap */
3570 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
3571 sbi->s_mb_offsets[0] = 0;
3572
3573 i = 1;
3574 offset = 0;
3575 offset_incr = 1 << (sb->s_blocksize_bits - 1);
3576 max = sb->s_blocksize << 2;
3577 do {
3578 sbi->s_mb_offsets[i] = offset;
3579 sbi->s_mb_maxs[i] = max;
3580 offset += offset_incr;
3581 offset_incr = offset_incr >> 1;
3582 max = max >> 1;
3583 i++;
3584 } while (i < MB_NUM_ORDERS(sb));
3585
3586 sbi->s_mb_avg_fragment_size =
3587 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3588 GFP_KERNEL);
3589 if (!sbi->s_mb_avg_fragment_size) {
3590 ret = -ENOMEM;
3591 goto out;
3592 }
3593 sbi->s_mb_avg_fragment_size_locks =
3594 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
3595 GFP_KERNEL);
3596 if (!sbi->s_mb_avg_fragment_size_locks) {
3597 ret = -ENOMEM;
3598 goto out;
3599 }
3600 for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
3601 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
3602 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
3603 }
3604 sbi->s_mb_largest_free_orders =
3605 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
3606 GFP_KERNEL);
3607 if (!sbi->s_mb_largest_free_orders) {
3608 ret = -ENOMEM;
3609 goto out;
3610 }
3611 sbi->s_mb_largest_free_orders_locks =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
			GFP_KERNEL);
	if (!sbi->s_mb_largest_free_orders_locks) {
		ret = -ENOMEM;
		goto out;
	}
	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
	}

	spin_lock_init(&sbi->s_md_lock);
	sbi->s_mb_free_pending = 0;
	INIT_LIST_HEAD(&sbi->s_freed_data_list);
	INIT_LIST_HEAD(&sbi->s_discard_list);
	INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
	atomic_set(&sbi->s_retry_alloc_pending, 0);

	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
	sbi->s_mb_stats = MB_DEFAULT_STATS;
	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
	sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;

	/*
	 * The default group preallocation is 512, which for 4k block
	 * sizes translates to 2 megabytes. However for bigalloc file
	 * systems, this is probably too big (i.e., if the cluster size
	 * is 1 megabyte, then group preallocation size becomes half a
	 * gigabyte!). As a default, we will keep a two megabyte
	 * group prealloc size for cluster sizes up to 64k, and after
	 * that, we will force a minimum group preallocation size of
	 * 32 clusters. This translates to 8 megs when the cluster
	 * size is 256k, and 32 megs when the cluster size is 1 meg,
	 * which seems reasonable as a default.
	 */
	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
				       sbi->s_cluster_bits, 32);
	/*
	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
	 * to the lowest multiple of s_stripe which is no smaller than
	 * the s_mb_group_prealloc as determined above. We want
	 * the preallocation size to be an exact multiple of the
	 * RAID stripe size so that preallocations don't fragment
	 * the stripes.
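	 *
	 * An illustrative case (assuming one block per cluster): with
	 * s_stripe = 384 blocks and the 512-cluster default computed
	 * above, roundup(512, 384) = 768 clusters, i.e. exactly two
	 * stripes.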
	 */
	if (sbi->s_stripe > 1) {
		sbi->s_mb_group_prealloc = roundup(
			sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
	}

	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
	if (sbi->s_locality_groups == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	for_each_possible_cpu(i) {
		struct ext4_locality_group *lg;
		lg = per_cpu_ptr(sbi->s_locality_groups, i);
		mutex_init(&lg->lg_mutex);
		for (j = 0; j < PREALLOC_TB_SIZE; j++)
			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
		spin_lock_init(&lg->lg_prealloc_lock);
	}

	if (bdev_nonrot(sb->s_bdev))
		sbi->s_mb_max_linear_groups = 0;
	else
		sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
	/* init file for buddy data */
	ret = ext4_mb_init_backend(sb);
	if (ret != 0)
		goto out_free_locality_groups;

	return 0;

out_free_locality_groups:
	free_percpu(sbi->s_locality_groups);
	sbi->s_locality_groups = NULL;
out:
	kfree(sbi->s_mb_avg_fragment_size);
	kfree(sbi->s_mb_avg_fragment_size_locks);
	kfree(sbi->s_mb_largest_free_orders);
	kfree(sbi->s_mb_largest_free_orders_locks);
	kfree(sbi->s_mb_offsets);
	sbi->s_mb_offsets = NULL;
	kfree(sbi->s_mb_maxs);
	sbi->s_mb_maxs = NULL;
	return ret;
}

/* needs to be called with the ext4 group lock held */
static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
	struct ext4_prealloc_space *pa;
	struct list_head *cur, *tmp;
	int count = 0;

	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		list_del(&pa->pa_group_list);
		count++;
		kmem_cache_free(ext4_pspace_cachep, pa);
	}
	return count;
}

int ext4_mb_release(struct super_block *sb)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	ext4_group_t i;
	int num_meta_group_infos;
	struct ext4_group_info *grinfo, ***group_info;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
	int count;

	if (test_opt(sb, DISCARD)) {
		/*
		 * wait for the discard work to drain all of the
		 * ext4_free_data entries
		 */
		flush_work(&sbi->s_discard_work);
		WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
	}

	if (sbi->s_group_info) {
		for (i = 0; i < ngroups; i++) {
			cond_resched();
			grinfo = ext4_get_group_info(sb, i);
			if (!grinfo)
				continue;
			mb_group_bb_bitmap_free(grinfo);
			ext4_lock_group(sb, i);
			count = ext4_mb_cleanup_pa(grinfo);
			if (count)
				mb_debug(sb, "mballoc: %d PAs left\n",
					 count);
			ext4_unlock_group(sb, i);
			kmem_cache_free(cachep, grinfo);
		}
		num_meta_group_infos = (ngroups +
				EXT4_DESC_PER_BLOCK(sb) - 1) >>
			EXT4_DESC_PER_BLOCK_BITS(sb);
		rcu_read_lock();
		group_info = rcu_dereference(sbi->s_group_info);
		for (i = 0; i < num_meta_group_infos; i++)
			kfree(group_info[i]);
		kvfree(group_info);
		rcu_read_unlock();
	}
	kfree(sbi->s_mb_avg_fragment_size);
	kfree(sbi->s_mb_avg_fragment_size_locks);
	kfree(sbi->s_mb_largest_free_orders);
	kfree(sbi->s_mb_largest_free_orders_locks);
	kfree(sbi->s_mb_offsets);
	kfree(sbi->s_mb_maxs);
	iput(sbi->s_buddy_cache);
	if (sbi->s_mb_stats) {
		ext4_msg(sb, KERN_INFO,
		       "mballoc: %u blocks %u reqs (%u success)",
				atomic_read(&sbi->s_bal_allocated),
				atomic_read(&sbi->s_bal_reqs),
				atomic_read(&sbi->s_bal_success));
		ext4_msg(sb, KERN_INFO,
		      "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
				"%u 2^N hits, %u breaks, %u lost",
				atomic_read(&sbi->s_bal_ex_scanned),
				atomic_read(&sbi->s_bal_groups_scanned),
				atomic_read(&sbi->s_bal_goals),
				atomic_read(&sbi->s_bal_2orders),
				atomic_read(&sbi->s_bal_breaks),
				atomic_read(&sbi->s_mb_lost_chunks));
		ext4_msg(sb, KERN_INFO,
			 "mballoc: %u generated and it took %llu",
				atomic_read(&sbi->s_mb_buddies_generated),
				atomic64_read(&sbi->s_mb_generation_time));
		ext4_msg(sb, KERN_INFO,
		       "mballoc: %u preallocated, %u discarded",
				atomic_read(&sbi->s_mb_preallocated),
				atomic_read(&sbi->s_mb_discarded));
	}

	free_percpu(sbi->s_locality_groups);

	return 0;
}

static inline int ext4_issue_discard(struct super_block *sb,
		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
		struct bio **biop)
{
	ext4_fsblk_t discard_block;

	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
			 ext4_group_first_block_no(sb, block_group));
	count = EXT4_C2B(EXT4_SB(sb), count);
	trace_ext4_discard_blocks(sb,
			(unsigned long long) discard_block, count);
	if (biop) {
		return __blkdev_issue_discard(sb->s_bdev,
			(sector_t)discard_block << (sb->s_blocksize_bits - 9),
			(sector_t)count << (sb->s_blocksize_bits - 9),
			GFP_NOFS, biop);
	} else
		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}

static void ext4_free_data_in_buddy(struct super_block *sb,
				    struct ext4_free_data *entry)
{
	struct ext4_buddy e4b;
	struct ext4_group_info *db;
	int err, count = 0;

	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
		 entry->efd_count, entry->efd_group, entry);

	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
	/* we expect to find existing buddy because it's pinned */
	BUG_ON(err != 0);

	spin_lock(&EXT4_SB(sb)->s_md_lock);
	EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
	spin_unlock(&EXT4_SB(sb)->s_md_lock);

	db = e4b.bd_info;
	/* there are blocks to put in buddy to make them really free */
	count += entry->efd_count;
	ext4_lock_group(sb, entry->efd_group);
	/* Take it out of per group rb tree */
	rb_erase(&entry->efd_node, &(db->bb_free_root));
	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);

	/*
	 * Clear the trimmed flag for the group so that the next
	 * ext4_trim_fs can trim it.
	 * If the volume is mounted with -o discard, online discard
	 * is supported and the free blocks will be trimmed online.
	 */
	if (!test_opt(sb, DISCARD))
		EXT4_MB_GRP_CLEAR_TRIMMED(db);

	if (!db->bb_free_root.rb_node) {
		/* No more items in the per group rb tree
		 * balance refcounts from ext4_mb_free_metadata()
		 */
		put_page(e4b.bd_buddy_page);
		put_page(e4b.bd_bitmap_page);
	}
	ext4_unlock_group(sb, entry->efd_group);
	ext4_mb_unload_buddy(&e4b);

	mb_debug(sb, "freed %d blocks in 1 structures\n", count);
}

/*
 * This function is called by the jbd2 layer once the commit has finished,
 * so we know we can free the blocks that were released with that commit.
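 *
 * A small illustration of the list handling below: if s_freed_data_list
 * holds entries with efd_tid values 10, 10, 11 and commit_tid == 10, the
 * scan advances cut_pos past both tid-10 entries, and list_cut_position()
 * moves exactly those two onto the local freed_data_list, leaving the
 * tid-11 entry queued for a later commit.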
 */
void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_free_data *entry, *tmp;
	struct list_head freed_data_list;
	struct list_head *cut_pos = NULL;
	bool wake;

	INIT_LIST_HEAD(&freed_data_list);

	spin_lock(&sbi->s_md_lock);
	list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
		if (entry->efd_tid != commit_tid)
			break;
		cut_pos = &entry->efd_list;
	}
	if (cut_pos)
		list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
				  cut_pos);
	spin_unlock(&sbi->s_md_lock);

	list_for_each_entry(entry, &freed_data_list, efd_list)
		ext4_free_data_in_buddy(sb, entry);

	if (test_opt(sb, DISCARD)) {
		spin_lock(&sbi->s_md_lock);
		wake = list_empty(&sbi->s_discard_list);
		list_splice_tail(&freed_data_list, &sbi->s_discard_list);
		spin_unlock(&sbi->s_md_lock);
		if (wake)
			queue_work(system_unbound_wq, &sbi->s_discard_work);
	} else {
		list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
			kmem_cache_free(ext4_free_data_cachep, entry);
	}
}

int __init ext4_init_mballoc(void)
{
	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
					SLAB_RECLAIM_ACCOUNT);
	if (ext4_pspace_cachep == NULL)
		goto out;

	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
				    SLAB_RECLAIM_ACCOUNT);
	if (ext4_ac_cachep == NULL)
		goto out_pa_free;

	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
					   SLAB_RECLAIM_ACCOUNT);
	if (ext4_free_data_cachep == NULL)
		goto out_ac_free;

	return 0;

out_ac_free:
	kmem_cache_destroy(ext4_ac_cachep);
out_pa_free:
	kmem_cache_destroy(ext4_pspace_cachep);
out:
	return -ENOMEM;
}

void ext4_exit_mballoc(void)
{
	/*
	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
	 * before destroying the slab cache.
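	 * (Group PAs are freed via call_rcu(&pa->u.pa_rcu,
	 * ext4_mb_pa_callback), which frees into ext4_pspace_cachep, so
	 * a callback still in flight could otherwise touch the cache
	 * after it is destroyed.)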
	 */
	rcu_barrier();
	kmem_cache_destroy(ext4_pspace_cachep);
	kmem_cache_destroy(ext4_ac_cachep);
	kmem_cache_destroy(ext4_free_data_cachep);
	ext4_groupinfo_destroy_slabs();
}


/*
 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps.
 * Returns 0 on success or an error code.
 */
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
				handle_t *handle, unsigned int reserv_clstrs)
{
	struct buffer_head *bitmap_bh = NULL;
	struct ext4_group_desc *gdp;
	struct buffer_head *gdp_bh;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block;
	int err, len;

	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
	BUG_ON(ac->ac_b_ex.fe_len <= 0);

	sb = ac->ac_sb;
	sbi = EXT4_SB(sb);

	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
	if (IS_ERR(bitmap_bh)) {
		return PTR_ERR(bitmap_bh);
	}

	BUFFER_TRACE(bitmap_bh, "getting write access");
	err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
					    EXT4_JTR_NONE);
	if (err)
		goto out_err;

	err = -EIO;
	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
	if (!gdp)
		goto out_err;

	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
			ext4_free_group_clusters(sb, gdp));

	BUFFER_TRACE(gdp_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
	if (err)
		goto out_err;

	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);

	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
	if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
			   "fs metadata", block, block+len);
		/* The filesystem is mounted not to panic on error:
		 * fix the bitmap and return EFSCORRUPTED.
		 * We leak some of the blocks here.
		 */
		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
		mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
			      ac->ac_b_ex.fe_len);
		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!err)
			err = -EFSCORRUPTED;
		goto out_err;
	}

	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
	{
		int i;
		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
						bitmap_bh->b_data));
		}
	}
#endif
	mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
		      ac->ac_b_ex.fe_len);
	if (ext4_has_group_desc_csum(sb) &&
	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
		ext4_free_group_clusters_set(sb, gdp,
					     ext4_free_clusters_after_init(sb,
						ac->ac_b_ex.fe_group, gdp));
	}
	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
	ext4_free_group_clusters_set(sb, gdp, len);
	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);

	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
	/*
	 * Now reduce the dirty block count also.
	 * Should not go negative.
	 */
	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
		/* release all the reserved blocks if non delalloc */
		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
				   reserv_clstrs);

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi,
							  ac->ac_b_ex.fe_group);
		atomic64_sub(ac->ac_b_ex.fe_len,
			     &sbi_array_rcu_deref(sbi, s_flex_groups,
						  flex_group)->free_clusters);
	}

	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
	if (err)
		goto out_err;
	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);

out_err:
	brelse(bitmap_bh);
	return err;
}

/*
 * Idempotent helper for the ext4 fast commit replay path to set the state
 * of blocks in bitmaps and update counters.
 */
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
		     int len, int state)
{
	struct buffer_head *bitmap_bh = NULL;
	struct ext4_group_desc *gdp;
	struct buffer_head *gdp_bh;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t group;
	ext4_grpblk_t blkoff;
	int i, err;
	int already;
	unsigned int clen, clen_changed, thisgrp_len;

	while (len > 0) {
		ext4_get_group_no_and_offset(sb, block, &group, &blkoff);

		/*
		 * Check to see if we are freeing blocks across a group
		 * boundary.
		 * With flex_bg, (block, len) may span more than one group.
		 * In that case we need to get the corresponding group
		 * metadata to work with, which is why we iterate here one
		 * group at a time.
		 */
		thisgrp_len = min_t(unsigned int, (unsigned int)len,
			EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
		clen = EXT4_NUM_B2C(sbi, thisgrp_len);

		if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
			ext4_error(sb, "Marking blocks in system zone - "
				   "Block = %llu, len = %u",
				   block, thisgrp_len);
			bitmap_bh = NULL;
			break;
		}

		bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(bitmap_bh)) {
			err = PTR_ERR(bitmap_bh);
			bitmap_bh = NULL;
			break;
		}

		err = -EIO;
		gdp = ext4_get_group_desc(sb, group, &gdp_bh);
		if (!gdp)
			break;

		ext4_lock_group(sb, group);
		already = 0;
		for (i = 0; i < clen; i++)
			if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
			    !state)
				already++;

		clen_changed = clen - already;
		if (state)
			mb_set_bits(bitmap_bh->b_data, blkoff, clen);
		else
			mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
		}
		if (state)
			clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
		else
			clen = ext4_free_group_clusters(sb, gdp) + clen_changed;

		ext4_free_group_clusters_set(sb, gdp, clen);
		ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
		ext4_group_desc_csum_set(sb, group, gdp);

		ext4_unlock_group(sb, group);

		if (sbi->s_log_groups_per_flex) {
			ext4_group_t flex_group = ext4_flex_group(sbi, group);
			struct flex_groups *fg = sbi_array_rcu_deref(sbi,
						s_flex_groups, flex_group);

			if (state)
				atomic64_sub(clen_changed, &fg->free_clusters);
			else
				atomic64_add(clen_changed, &fg->free_clusters);
		}

		err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
		if (err)
			break;
		sync_dirty_buffer(bitmap_bh);
		err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
		sync_dirty_buffer(gdp_bh);
		if (err)
			break;

		block += thisgrp_len;
		len -= thisgrp_len;
		brelse(bitmap_bh);
		BUG_ON(len < 0);
	}

	if (err)
		brelse(bitmap_bh);
}

/*
 * here we normalize the request for a locality group.
 * Group requests are normalized to s_mb_group_prealloc, which goes to
 * s_stripe if we set the same via the mount option.
 * s_mb_group_prealloc can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;

	BUG_ON(lg == NULL);
	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
	mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}

/*
 * This function returns the next element to look at during inode
 * PA rbtree walk. We assume that we have held the inode PA rbtree lock
 * (ei->i_prealloc_lock).
 *
 * new_start	The start of the range we want to compare
 * cur_start	The existing start that we are comparing against
 * node		The node of the rb_tree
 */
static inline struct rb_node*
ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
{
	if (new_start < cur_start)
		return node->rb_left;
	else
		return node->rb_right;
}

static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
			  ext4_lblk_t start, loff_t end)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_prealloc_space *tmp_pa;
	ext4_lblk_t tmp_pa_start;
	loff_t tmp_pa_end;
	struct rb_node *iter;

	read_lock(&ei->i_prealloc_lock);
	for (iter = ei->i_prealloc_node.rb_node; iter;
	     iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
		tmp_pa_start = tmp_pa->pa_lstart;
		tmp_pa_end = pa_logical_end(sbi, tmp_pa);

		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted == 0)
			BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
		spin_unlock(&tmp_pa->pa_lock);
	}
	read_unlock(&ei->i_prealloc_lock);
}

/*
 * Given an allocation context "ac" and a range "start", "end", check
 * and adjust boundaries if the range overlaps with any of the existing
 * preallocations stored in the corresponding inode of the allocation context.
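 *
 * For example (illustrative): if the normalized range is [100, 200) and
 * the nearest non-deleted neighbors end at logical block 120 on the left
 * and start at 180 on the right, the range is trimmed to [120, 180).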
 *
 * Parameters:
 * ac			allocation context
 * start		start of the new range
 * end			end of the new range
 */
static inline void
ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
			  ext4_lblk_t *start, loff_t *end)
{
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
	struct rb_node *iter;
	ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1;
	loff_t new_end, tmp_pa_end, left_pa_end = -1;

	new_start = *start;
	new_end = *end;

	/*
	 * Adjust the normalized range so that it doesn't overlap with any
	 * existing preallocated blocks (PAs). Make sure to hold the rbtree
	 * lock so it doesn't change underneath us.
	 */
	read_lock(&ei->i_prealloc_lock);

	/* Step 1: find any one immediate neighboring PA of the normalized range */
	for (iter = ei->i_prealloc_node.rb_node; iter;
	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
					    tmp_pa_start, iter)) {
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
		tmp_pa_start = tmp_pa->pa_lstart;
		tmp_pa_end = pa_logical_end(sbi, tmp_pa);

		/* PA must not overlap original request */
		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted == 0)
			BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
				 ac->ac_o_ex.fe_logical < tmp_pa_start));
		spin_unlock(&tmp_pa->pa_lock);
	}

	/*
	 * Step 2: check if the found PA is left or right neighbor and
	 * get the other neighbor
	 */
	if (tmp_pa) {
		if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
			struct rb_node *tmp;

			left_pa = tmp_pa;
			tmp = rb_next(&left_pa->pa_node.inode_node);
			if (tmp) {
				right_pa = rb_entry(tmp,
						    struct ext4_prealloc_space,
						    pa_node.inode_node);
			}
		} else {
			struct rb_node *tmp;

			right_pa = tmp_pa;
			tmp = rb_prev(&right_pa->pa_node.inode_node);
			if (tmp) {
				left_pa = rb_entry(tmp,
						   struct ext4_prealloc_space,
						   pa_node.inode_node);
			}
		}
	}

	/* Step 3: get the non deleted neighbors */
	if (left_pa) {
		for (iter = &left_pa->pa_node.inode_node;;
		     iter = rb_prev(iter)) {
			if (!iter) {
				left_pa = NULL;
				break;
			}

			tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
					  pa_node.inode_node);
			left_pa = tmp_pa;
			spin_lock(&tmp_pa->pa_lock);
			if (tmp_pa->pa_deleted == 0) {
				spin_unlock(&tmp_pa->pa_lock);
				break;
			}
			spin_unlock(&tmp_pa->pa_lock);
		}
	}

	if (right_pa) {
		for (iter = &right_pa->pa_node.inode_node;;
		     iter = rb_next(iter)) {
			if (!iter) {
				right_pa = NULL;
				break;
			}

			tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
					  pa_node.inode_node);
			right_pa = tmp_pa;
			spin_lock(&tmp_pa->pa_lock);
			if (tmp_pa->pa_deleted == 0) {
				spin_unlock(&tmp_pa->pa_lock);
				break;
			}
			spin_unlock(&tmp_pa->pa_lock);
		}
	}

	if (left_pa) {
		left_pa_end = pa_logical_end(sbi, left_pa);
		BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
	}

	if (right_pa) {
		right_pa_start = right_pa->pa_lstart;
		BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
	}

	/* Step 4: trim our normalized range to not overlap with the neighbors */
	if (left_pa) {
		if (left_pa_end > new_start)
			new_start = left_pa_end;
	}

	if (right_pa) {
		if (right_pa_start < new_end)
			new_end = right_pa_start;
	}
	read_unlock(&ei->i_prealloc_lock);

	/* XXX: extra loop to check we really don't overlap preallocations */
	ext4_mb_pa_assert_overlap(ac, new_start, new_end);

	*start = new_start;
	*end = new_end;
}

/*
 * Normalization means making the request better in terms of
 * size and alignment.
 */
static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_super_block *es = sbi->s_es;
	int bsbits, max;
	loff_t size, start_off, end;
	loff_t orig_size __maybe_unused;
	ext4_lblk_t start;

	/* only normalize data requests; metadata requests
	   do not need preallocation */
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	/* sometimes the caller may want exact blocks */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	/* the caller may indicate that preallocation isn't
	 * required (it's a tail, for example) */
	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
		return;

	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
		ext4_mb_normalize_group_request(ac);
		return;
	}

	bsbits = ac->ac_sb->s_blocksize_bits;

	/* first, let's learn the actual file size
	 * assuming the current request is allocated */
	size = extent_logical_end(sbi, &ac->ac_o_ex);
	size = size << bsbits;
	if (size < i_size_read(ac->ac_inode))
		size = i_size_read(ac->ac_inode);
	orig_size = size;

	/* max size of free chunks */
	max = 2 << bsbits;

#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
		(req <= (size) || max <= (chunk_size))

	/* first, try to predict filesize */
	/* XXX: should this table be tunable? */
	start_off = 0;
	if (size <= 16 * 1024) {
		size = 16 * 1024;
	} else if (size <= 32 * 1024) {
		size = 32 * 1024;
	} else if (size <= 64 * 1024) {
		size = 64 * 1024;
	} else if (size <= 128 * 1024) {
		size = 128 * 1024;
	} else if (size <= 256 * 1024) {
		size = 256 * 1024;
	} else if (size <= 512 * 1024) {
		size = 512 * 1024;
	} else if (size <= 1024 * 1024) {
		size = 1024 * 1024;
	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
						(21 - bsbits)) << 21;
		size = 2 * 1024 * 1024;
	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
							(22 - bsbits)) << 22;
		size = 4 * 1024 * 1024;
	} else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
					(8<<20)>>bsbits, max, 8 * 1024)) {
		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
							(23 - bsbits)) << 23;
		size = 8 * 1024 * 1024;
	} else {
		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
		size	  = (loff_t) EXT4_C2B(sbi,
					      ac->ac_o_ex.fe_len) << bsbits;
	}
	size = size >> bsbits;
	start = start_off >> bsbits;

	/*
	 * For tiny groups (smaller than 8MB) the chosen allocation
	 * alignment may be larger than group size. Make sure the
	 * alignment does not move allocation to a different group which
	 * makes mballoc fail assertions later.
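	 *
	 * (Illustrative: with 1024-block groups, a request at logical
	 * block 2000 whose table entry above picked an 8MB-aligned
	 * start of 0 is raised below to rounddown(2000, 1024) = 1024,
	 * keeping the goal within the original block's group-aligned
	 * window.)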
	 */
	start = max(start, rounddown(ac->ac_o_ex.fe_logical,
			(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));

	/* don't cover already allocated blocks in selected range */
	if (ar->pleft && start <= ar->lleft) {
		size -= ar->lleft + 1 - start;
		start = ar->lleft + 1;
	}
	if (ar->pright && start + size - 1 >= ar->lright)
		size -= start + size - ar->lright;

	/*
	 * Trim allocation request for filesystems with artificially small
	 * groups.
	 */
	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);

	end = start + size;

	ext4_mb_pa_adjust_overlap(ac, &start, &end);

	size = end - start;

	/*
	 * In this function "start" and "size" are normalized for better
	 * alignment and length such that we could preallocate more blocks.
	 * This normalization is done such that the original request of
	 * ac->ac_o_ex.fe_logical & fe_len should always lie within the
	 * "start" and "size" boundaries.
	 * (Note that fe_len can be relaxed since the FS block allocation
	 * API does not guarantee the number of contiguous blocks
	 * allocated; that depends upon free space left, etc.)
	 * In case of inode pa, later we use the allocated blocks
	 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
	 * range of goal/best blocks [start, size] to put it at the
	 * ac_o_ex.fe_logical extent of this inode.
	 * (See ext4_mb_use_inode_pa() for more details.)
	 */
	if (start + size <= ac->ac_o_ex.fe_logical ||
			start > ac->ac_o_ex.fe_logical) {
		ext4_msg(ac->ac_sb, KERN_ERR,
			 "start %lu, size %lu, fe_logical %lu",
			 (unsigned long) start, (unsigned long) size,
			 (unsigned long) ac->ac_o_ex.fe_logical);
		BUG();
	}
	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));

	/* now prepare goal request */

	/* XXX: is it better to align blocks WRT to logical
	 * placement or satisfy big request as is */
	ac->ac_g_ex.fe_logical = start;
	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
	ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;

	/* define goal start in order to merge */
	if (ar->pright && (ar->lright == (start + size)) &&
	    ar->pright >= size &&
	    ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
		/* merge to the right */
		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
						&ac->ac_g_ex.fe_group,
						&ac->ac_g_ex.fe_start);
		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
	}
	if (ar->pleft && (ar->lleft + 1 == start) &&
	    ar->pleft + 1 < ext4_blocks_count(es)) {
		/* merge to the left */
		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
						&ac->ac_g_ex.fe_group,
						&ac->ac_g_ex.fe_start);
		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
	}

	mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
		 orig_size, start);
}

static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
		atomic_inc(&sbi->s_bal_reqs);
		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
			atomic_inc(&sbi->s_bal_success);

		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
		for (int i = 0; i < EXT4_MB_NUM_CRS; i++) {
			atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
		}

		atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
			atomic_inc(&sbi->s_bal_goals);
		/* did we allocate as much as normalizer originally wanted? */
		if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
			atomic_inc(&sbi->s_bal_len_goals);

		if (ac->ac_found > sbi->s_mb_max_to_scan)
			atomic_inc(&sbi->s_bal_breaks);
	}

	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
		trace_ext4_mballoc_alloc(ac);
	else
		trace_ext4_mballoc_prealloc(ac);
}

/*
 * Called on failure; free up any blocks from the inode PA for this
 * context. We don't need this for MB_GROUP_PA because we only change
 * pa_free in ext4_mb_release_context(), but on failure, we've already
 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
 */
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
	struct ext4_prealloc_space *pa = ac->ac_pa;
	struct ext4_buddy e4b;
	int err;

	if (pa == NULL) {
		if (ac->ac_f_ex.fe_len == 0)
			return;
		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
		if (WARN_RATELIMIT(err,
				   "ext4: mb_load_buddy failed (%d)", err))
			/*
			 * This should never happen since we pin the
			 * pages in the ext4_allocation_context so
			 * ext4_mb_load_buddy() should never fail.
			 */
			return;
		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
			       ac->ac_f_ex.fe_len);
		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
		ext4_mb_unload_buddy(&e4b);
		return;
	}
	if (pa->pa_type == MB_INODE_PA) {
		spin_lock(&pa->pa_lock);
		pa->pa_free += ac->ac_b_ex.fe_len;
		spin_unlock(&pa->pa_lock);
	}
}

/*
 * use blocks preallocated to inode
 */
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
				struct ext4_prealloc_space *pa)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	ext4_fsblk_t start;
	ext4_fsblk_t end;
	int len;

	/* found preallocated blocks, use them */
	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
	len = EXT4_NUM_B2C(sbi, end - start);
	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
					&ac->ac_b_ex.fe_start);
	ac->ac_b_ex.fe_len = len;
	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_pa = pa;

	BUG_ON(start < pa->pa_pstart);
	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
	BUG_ON(pa->pa_free < len);
	BUG_ON(ac->ac_b_ex.fe_len <= 0);
	pa->pa_free -= len;

	mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
}

/*
 * use blocks preallocated to locality group
 */
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
				struct ext4_prealloc_space *pa)
{
	unsigned int len = ac->ac_o_ex.fe_len;

	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
					&ac->ac_b_ex.fe_group,
					&ac->ac_b_ex.fe_start);
	ac->ac_b_ex.fe_len = len;
	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_pa = pa;

	/* we don't correct pa_pstart or pa_len here to avoid a
	 * possible race when the group is being loaded concurrently;
	 * instead we correct pa later, after blocks are marked
	 * in the on-disk bitmap -- see ext4_mb_release_context().
	 * Other CPUs are prevented from allocating from this pa by lg_mutex.
	 */
	mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
		 pa->pa_lstart, len, pa);
}

/*
 * Return the prealloc space that has minimal distance
 * from the goal block. @cpa is the prealloc
 * space with the currently known minimal distance
 * from the goal block.
 */
static struct ext4_prealloc_space *
ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
			struct ext4_prealloc_space *pa,
			struct ext4_prealloc_space *cpa)
{
	ext4_fsblk_t cur_distance, new_distance;

	if (cpa == NULL) {
		atomic_inc(&pa->pa_count);
		return pa;
	}
	cur_distance = abs(goal_block - cpa->pa_pstart);
	new_distance = abs(goal_block - pa->pa_pstart);

	if (cur_distance <= new_distance)
		return cpa;

	/* drop the previous reference */
	atomic_dec(&cpa->pa_count);
	atomic_inc(&pa->pa_count);
	return pa;
}

/*
 * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
 */
static bool
ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
		      struct ext4_prealloc_space *pa)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	ext4_fsblk_t start;

	if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
		return true;

	/*
	 * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
	 * in ext4_mb_normalize_request and will keep same with ac_o_ex
	 * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
	 * consistent with ext4_mb_find_by_goal.
	 */
	start = pa->pa_pstart +
		(ac->ac_g_ex.fe_logical - pa->pa_lstart);
	if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
		return false;

	if (ac->ac_g_ex.fe_len > pa->pa_len -
	    EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
		return false;

	return true;
}

/*
 * search goal blocks in preallocated space
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int order, i;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
	struct rb_node *iter;
	ext4_fsblk_t goal_block;

	/* only data can be preallocated */
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return false;

	/*
	 * first, try per-file preallocation by searching the inode pa rbtree.
	 *
	 * Here, we can't do a direct traversal of the tree because
	 * ext4_mb_discard_group_preallocations() can concurrently mark a pa
	 * deleted, and that can cause direct traversal to skip some entries.
	 */
	read_lock(&ei->i_prealloc_lock);

	if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
		goto try_group_pa;
	}

	/*
	 * Step 1: Find a pa with logical start immediately adjacent to the
	 * original logical start. This could be on the left or the right.
	 *
	 * (tmp_pa->pa_lstart never changes so we can skip locking for it.)
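	 *
	 * An illustrative descent: with PAs starting at logical blocks
	 * 10, 50 and 90 (50 at the root, 90 its right child) and an
	 * original start of 60, the walk goes right at 50 (60 >= 50),
	 * then left at 90 (60 < 90) and falls off the tree, leaving
	 * tmp_pa at the PA starting at 90; step 2 below then steps back
	 * to its left neighbor.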
	 */
	for (iter = ei->i_prealloc_node.rb_node; iter;
	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
					    tmp_pa->pa_lstart, iter)) {
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
	}

	/*
	 * Step 2: The adjacent pa might be to the right of logical start, find
	 * the left adjacent pa. After this step we'd have a valid tmp_pa whose
	 * logical start is towards the left of original request's logical start
	 */
	if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
		struct rb_node *tmp;
		tmp = rb_prev(&tmp_pa->pa_node.inode_node);

		if (tmp) {
			tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
					    pa_node.inode_node);
		} else {
			/*
			 * If there is no adjacent pa to the left then finding
			 * an overlapping pa is not possible hence stop searching
			 * inode pa tree
			 */
			goto try_group_pa;
		}
	}

	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));

	/*
	 * Step 3: If the left adjacent pa is deleted, keep moving left to find
	 * the first non deleted adjacent pa. After this step we should have a
	 * valid tmp_pa which is guaranteed to be non deleted.
	 */
	for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
		if (!iter) {
			/*
			 * no non deleted left adjacent pa, so stop searching
			 * inode pa tree
			 */
			goto try_group_pa;
		}
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted == 0) {
			/*
			 * We will keep holding the pa_lock from
			 * this point on because we don't want group discard
			 * to delete this pa underneath us. Since group
			 * discard is anyway an ENOSPC operation it
			 * should be okay for it to wait a few more cycles.
			 */
			break;
		} else {
			spin_unlock(&tmp_pa->pa_lock);
		}
	}

	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
	BUG_ON(tmp_pa->pa_deleted == 1);

	/*
	 * Step 4: We now have the non deleted left adjacent pa. Only this
	 * pa can possibly satisfy the request hence check if it overlaps
	 * original logical start and stop searching if it doesn't.
	 */
	if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) {
		spin_unlock(&tmp_pa->pa_lock);
		goto try_group_pa;
	}

	/* non-extent files can't have physical blocks past 2^32 */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
	    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
	     EXT4_MAX_BLOCK_FILE_PHYS)) {
		/*
		 * Since PAs don't overlap, we won't find any other PA to
		 * satisfy this.
		 */
		spin_unlock(&tmp_pa->pa_lock);
		goto try_group_pa;
	}

	if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
		atomic_inc(&tmp_pa->pa_count);
		ext4_mb_use_inode_pa(ac, tmp_pa);
		spin_unlock(&tmp_pa->pa_lock);
		read_unlock(&ei->i_prealloc_lock);
		return true;
	} else {
		/*
		 * We found a valid overlapping pa but couldn't use it because
		 * it had no free blocks. This should ideally never happen
		 * because:
		 *
		 * 1. When a new inode pa is added to rbtree it must have
		 *    pa_free > 0 since otherwise we won't actually need
		 *    preallocation.
		 *
		 * 2. An inode pa that is in the rbtree can only have its
		 *    pa_free become zero when another thread calls:
		 *      ext4_mb_new_blocks
		 *       ext4_mb_use_preallocated
		 *        ext4_mb_use_inode_pa
		 *
		 * 3. Further, after the above calls make pa_free == 0, we will
		 *    immediately remove it from the rbtree in:
		 *      ext4_mb_new_blocks
		 *       ext4_mb_release_context
		 *        ext4_mb_put_pa
		 *
		 * 4. Since the pa_free becoming 0 and pa_free getting removed
		 *    from tree both happen in ext4_mb_new_blocks, which is always
		 *    called with i_data_sem held for data allocations, we can be
		 *    sure that another process will never see a pa in rbtree with
		 *    pa_free == 0.
		 */
		WARN_ON_ONCE(tmp_pa->pa_free == 0);
	}
	spin_unlock(&tmp_pa->pa_lock);
try_group_pa:
	read_unlock(&ei->i_prealloc_lock);

	/* can we use group allocation? */
	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
		return false;

	/* inode may have no locality group for some reason */
	lg = ac->ac_lg;
	if (lg == NULL)
		return false;
	order = fls(ac->ac_o_ex.fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;

	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
	/*
	 * search for the prealloc space that is having
	 * minimal distance from the goal block.
	 */
	for (i = order; i < PREALLOC_TB_SIZE; i++) {
		rcu_read_lock();
		list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
					pa_node.lg_list) {
			spin_lock(&tmp_pa->pa_lock);
			if (tmp_pa->pa_deleted == 0 &&
			    tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {

				cpa = ext4_mb_check_group_pa(goal_block,
							     tmp_pa, cpa);
			}
			spin_unlock(&tmp_pa->pa_lock);
		}
		rcu_read_unlock();
	}
	if (cpa) {
		ext4_mb_use_group_pa(ac, cpa);
		return true;
	}
	return false;
}

/*
 * the function goes through all blocks freed in the group
 * but not yet committed and marks them used in the in-core bitmap.
 * buddy must be generated from this bitmap
 * Needs to be called with the ext4 group lock held
 */
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
						ext4_group_t group)
{
	struct rb_node *n;
	struct ext4_group_info *grp;
	struct ext4_free_data *entry;

	grp = ext4_get_group_info(sb, group);
	if (!grp)
		return;
	n = rb_first(&(grp->bb_free_root));

	while (n) {
		entry = rb_entry(n, struct ext4_free_data, efd_node);
		mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
		n = rb_next(n);
	}
}

/*
 * the function goes through all preallocations in this group and marks them
 * used in the in-core bitmap. buddy must be generated from this bitmap
 * Needs to be called with the ext4 group lock held
 */
static noinline_for_stack
void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_prealloc_space *pa;
	struct list_head *cur;
	ext4_group_t groupnr;
	ext4_grpblk_t start;
	int preallocated = 0;
	int len;

	if (!grp)
		return;

	/* all forms of preallocation discard first load the group,
	 * so the only competing code is preallocation use.
	 * we don't need any locking here.
	 * notice we do NOT ignore preallocations with pa_deleted
	 * otherwise we could leave used blocks available for
	 * allocation in buddy when concurrent ext4_mb_put_pa()
	 * is dropping preallocation
	 */
	list_for_each(cur, &grp->bb_prealloc_list) {
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		spin_lock(&pa->pa_lock);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
					     &groupnr, &start);
		len = pa->pa_len;
		spin_unlock(&pa->pa_lock);
		if (unlikely(len == 0))
			continue;
		BUG_ON(groupnr != group);
		mb_set_bits(bitmap, start, len);
		preallocated += len;
	}
	mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
}

static void ext4_mb_mark_pa_deleted(struct super_block *sb,
				    struct ext4_prealloc_space *pa)
{
	struct ext4_inode_info *ei;

	if (pa->pa_deleted) {
		ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
			     pa->pa_type, pa->pa_pstart, pa->pa_lstart,
			     pa->pa_len);
		return;
	}

	pa->pa_deleted = 1;

	if (pa->pa_type == MB_INODE_PA) {
		ei = EXT4_I(pa->pa_inode);
		atomic_dec(&ei->i_prealloc_active);
	}
}

static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
{
	BUG_ON(!pa);
	BUG_ON(atomic_read(&pa->pa_count));
	BUG_ON(pa->pa_deleted == 0);
	kmem_cache_free(ext4_pspace_cachep, pa);
}

static void ext4_mb_pa_callback(struct rcu_head *head)
{
	struct ext4_prealloc_space *pa;

	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
	ext4_mb_pa_free(pa);
}

/*
 * drops a reference to preallocated space descriptor
 * if this was the last reference and the space is consumed
 */
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
			struct super_block *sb, struct ext4_prealloc_space *pa)
{
	ext4_group_t grp;
	ext4_fsblk_t grp_blk;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);

	/* in this short window concurrent discard can set pa_deleted */
	spin_lock(&pa->pa_lock);
	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
		spin_unlock(&pa->pa_lock);
		return;
	}

	if (pa->pa_deleted == 1) {
		spin_unlock(&pa->pa_lock);
		return;
	}

	ext4_mb_mark_pa_deleted(sb, pa);
	spin_unlock(&pa->pa_lock);

	grp_blk = pa->pa_pstart;
	/*
	 * If doing group-based preallocation, pa_pstart may be in the
	 * next group when pa is used up
	 */
	if (pa->pa_type == MB_GROUP_PA)
		grp_blk--;

	grp = ext4_get_group_number(sb, grp_blk);

	/*
	 * possible race:
	 *
	 *  P1 (buddy init)			P2 (regular allocation)
	 *					find block B in PA
	 *  copy on-disk bitmap to buddy
	 *					mark B in on-disk bitmap
	 *					drop PA from group
	 *  mark all PAs in buddy
	 *
	 * thus, P1 initializes buddy with B available.
	 * To prevent this we make the "copy" and "mark all PAs" steps
	 * atomic and serialize "drop PA" against that pair.
	 */
	ext4_lock_group(sb, grp);
	list_del(&pa->pa_group_list);
	ext4_unlock_group(sb, grp);

	if (pa->pa_type == MB_INODE_PA) {
		write_lock(pa->pa_node_lock.inode_lock);
		rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
		write_unlock(pa->pa_node_lock.inode_lock);
		ext4_mb_pa_free(pa);
	} else {
		spin_lock(pa->pa_node_lock.lg_lock);
		list_del_rcu(&pa->pa_node.lg_list);
		spin_unlock(pa->pa_node_lock.lg_lock);
		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
	}
}

static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
{
	struct rb_node **iter = &root->rb_node, *parent = NULL;
	struct ext4_prealloc_space *iter_pa, *new_pa;
	ext4_lblk_t iter_start, new_start;

	while (*iter) {
		iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
				   pa_node.inode_node);
		new_pa = rb_entry(new, struct ext4_prealloc_space,
				  pa_node.inode_node);
		iter_start = iter_pa->pa_lstart;
		new_start = new_pa->pa_lstart;

		parent = *iter;
		if (new_start < iter_start)
			iter = &((*iter)->rb_left);
		else
			iter = &((*iter)->rb_right);
	}

	rb_link_node(new, parent, iter);
	rb_insert_color(new, root);
}

/*
 * creates new preallocated space for given inode
 */
static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_prealloc_space *pa;
	struct ext4_group_info *grp;
	struct ext4_inode_info *ei;

	/* preallocate only when found space is larger than requested */
	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
	BUG_ON(ac->ac_pa == NULL);

	pa = ac->ac_pa;

	if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
		struct ext4_free_extent ex = {
			.fe_logical = ac->ac_g_ex.fe_logical,
			.fe_len = ac->ac_orig_goal_len,
		};
		loff_t orig_goal_end = extent_logical_end(sbi, &ex);

		/* we can't allocate as much as the normalizer wants, so
		 * the found space must get a proper lstart
		 * to cover the original request */
		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);

		/*
		 * Use the below logic for adjusting best extent as it keeps
		 * fragmentation in check while ensuring logical range of best
		 * extent doesn't overflow out of goal extent:
		 *
		 * 1. Check if best ex can be kept at end of goal (before
		 *    cr_best_avail trimmed it) and still cover original start
		 * 2. Else, check if best ex can be kept at start of goal and
		 *    still cover original start
		 * 3. Else, keep the best ex at start of original request.
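		 *
		 * A worked example (illustrative, assuming one block per
		 * cluster): say the goal covers logical [0, 16) and the
		 * original start is 10. If the best extent is 8 blocks,
		 * placing it at the goal end gives [8, 16), which still
		 * covers 10 (case 1). If it is only 4 blocks, [12, 16)
		 * misses 10 and so does [0, 4) at the goal start, so it
		 * is placed at the original start 10 (case 3).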
		 */
		ex.fe_len = ac->ac_b_ex.fe_len;

		ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
		if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
			goto adjust_bex;

		ex.fe_logical = ac->ac_g_ex.fe_logical;
		if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
			goto adjust_bex;

		ex.fe_logical = ac->ac_o_ex.fe_logical;
adjust_bex:
		ac->ac_b_ex.fe_logical = ex.fe_logical;

		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
		BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
	}

	pa->pa_lstart = ac->ac_b_ex.fe_logical;
	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
	pa->pa_len = ac->ac_b_ex.fe_len;
	pa->pa_free = pa->pa_len;
	spin_lock_init(&pa->pa_lock);
	INIT_LIST_HEAD(&pa->pa_group_list);
	pa->pa_deleted = 0;
	pa->pa_type = MB_INODE_PA;

	mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
		 pa->pa_len, pa->pa_lstart);
	trace_ext4_mb_new_inode_pa(ac, pa);

	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
	ext4_mb_use_inode_pa(ac, pa);

	ei = EXT4_I(ac->ac_inode);
	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
	if (!grp)
		return;

	pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
	pa->pa_inode = ac->ac_inode;

	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

	write_lock(pa->pa_node_lock.inode_lock);
	ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
	write_unlock(pa->pa_node_lock.inode_lock);
	atomic_inc(&ei->i_prealloc_active);
}

/*
 * creates new preallocated space for the locality group this inode
 * belongs to
 */
static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *pa;
	struct ext4_group_info *grp;

	/* preallocate only when found space is larger than requested */
	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
	BUG_ON(ac->ac_pa == NULL);

	pa = ac->ac_pa;

	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
	pa->pa_lstart = pa->pa_pstart;
	pa->pa_len = ac->ac_b_ex.fe_len;
	pa->pa_free = pa->pa_len;
	spin_lock_init(&pa->pa_lock);
	INIT_LIST_HEAD(&pa->pa_node.lg_list);
	INIT_LIST_HEAD(&pa->pa_group_list);
	pa->pa_deleted = 0;
	pa->pa_type = MB_GROUP_PA;

	mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
		 pa->pa_len, pa->pa_lstart);
	trace_ext4_mb_new_group_pa(ac, pa);

	ext4_mb_use_group_pa(ac, pa);
	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);

	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
	if (!grp)
		return;
	lg = ac->ac_lg;
	BUG_ON(lg == NULL);

	pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
	pa->pa_inode = NULL;

	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

	/*
	 * We will later add the new pa to the right bucket
	 * after updating the pa_free in ext4_mb_release_context
	 */
}

static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
		ext4_mb_new_group_pa(ac);
	else
		ext4_mb_new_inode_pa(ac);
}

/*
 * finds all unused blocks in on-disk bitmap, frees them in
 * in-core bitmap and buddy.
 * @pa must be unlinked from inode and group lists, so that
 * nobody else can find/use it.
 * the caller MUST hold group/inode locks.
 * TODO: optimize the case when there are no in-core structures yet
 */
static noinline_for_stack int
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
			struct ext4_prealloc_space *pa)
{
	struct super_block *sb = e4b->bd_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned int end;
	unsigned int next;
	ext4_group_t group;
	ext4_grpblk_t bit;
	unsigned long long grp_blk_start;
	int free = 0;

	BUG_ON(pa->pa_deleted == 0);
	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
	end = bit + pa->pa_len;

	while (bit < end) {
		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
		if (bit >= end)
			break;
		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
		mb_debug(sb, "free preallocated %u/%u in group %u\n",
			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
			 (unsigned) next - bit, (unsigned) group);
		free += next - bit;

		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
						    EXT4_C2B(sbi, bit)),
					       next - bit);
		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
		bit = next + 1;
	}
	if (free != pa->pa_free) {
		ext4_msg(e4b->bd_sb, KERN_CRIT,
			 "pa %p: logic %lu, phys. %lu, len %d",
			 pa, (unsigned long) pa->pa_lstart,
			 (unsigned long) pa->pa_pstart,
			 pa->pa_len);
		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
					free, pa->pa_free);
		/*
		 * pa is already deleted so we use the value obtained
		 * from the bitmap and continue.
5362 */ 5363 } 5364 atomic_add(free, &sbi->s_mb_discarded); 5365 5366 return 0; 5367 } 5368 5369 static noinline_for_stack int 5370 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 5371 struct ext4_prealloc_space *pa) 5372 { 5373 struct super_block *sb = e4b->bd_sb; 5374 ext4_group_t group; 5375 ext4_grpblk_t bit; 5376 5377 trace_ext4_mb_release_group_pa(sb, pa); 5378 BUG_ON(pa->pa_deleted == 0); 5379 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 5380 if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { 5381 ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", 5382 e4b->bd_group, group, pa->pa_pstart); 5383 return 0; 5384 } 5385 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 5386 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 5387 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 5388 5389 return 0; 5390 } 5391 5392 /* 5393 * releases all preallocations in given group 5394 * 5395 * first, we need to decide discard policy: 5396 * - when do we discard 5397 * 1) ENOSPC 5398 * - how many do we discard 5399 * 1) how many requested 5400 */ 5401 static noinline_for_stack int 5402 ext4_mb_discard_group_preallocations(struct super_block *sb, 5403 ext4_group_t group, int *busy) 5404 { 5405 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 5406 struct buffer_head *bitmap_bh = NULL; 5407 struct ext4_prealloc_space *pa, *tmp; 5408 struct list_head list; 5409 struct ext4_buddy e4b; 5410 struct ext4_inode_info *ei; 5411 int err; 5412 int free = 0; 5413 5414 if (!grp) 5415 return 0; 5416 mb_debug(sb, "discard preallocation for group %u\n", group); 5417 if (list_empty(&grp->bb_prealloc_list)) 5418 goto out_dbg; 5419 5420 bitmap_bh = ext4_read_block_bitmap(sb, group); 5421 if (IS_ERR(bitmap_bh)) { 5422 err = PTR_ERR(bitmap_bh); 5423 ext4_error_err(sb, -err, 5424 "Error %d reading block bitmap for %u", 5425 err, group); 5426 goto out_dbg; 5427 } 5428 5429 err = ext4_mb_load_buddy(sb, group, &e4b); 5430 if (err) { 5431 ext4_warning(sb, "Error %d loading buddy information for %u", 5432 err, group); 5433 put_bh(bitmap_bh); 5434 goto out_dbg; 5435 } 5436 5437 INIT_LIST_HEAD(&list); 5438 ext4_lock_group(sb, group); 5439 list_for_each_entry_safe(pa, tmp, 5440 &grp->bb_prealloc_list, pa_group_list) { 5441 spin_lock(&pa->pa_lock); 5442 if (atomic_read(&pa->pa_count)) { 5443 spin_unlock(&pa->pa_lock); 5444 *busy = 1; 5445 continue; 5446 } 5447 if (pa->pa_deleted) { 5448 spin_unlock(&pa->pa_lock); 5449 continue; 5450 } 5451 5452 /* seems this one can be freed ... */ 5453 ext4_mb_mark_pa_deleted(sb, pa); 5454 5455 if (!free) 5456 this_cpu_inc(discard_pa_seq); 5457 5458 /* we can trust pa_free ... 
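 * (pa_count is zero and pa_deleted was just set while pa_lock is held,
 * so no concurrent user can change the accounting under us)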
*/ 5459 free += pa->pa_free; 5460 5461 spin_unlock(&pa->pa_lock); 5462 5463 list_del(&pa->pa_group_list); 5464 list_add(&pa->u.pa_tmp_list, &list); 5465 } 5466 5467 /* now free all selected PAs */ 5468 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5469 5470 /* remove from object (inode or locality group) */ 5471 if (pa->pa_type == MB_GROUP_PA) { 5472 spin_lock(pa->pa_node_lock.lg_lock); 5473 list_del_rcu(&pa->pa_node.lg_list); 5474 spin_unlock(pa->pa_node_lock.lg_lock); 5475 } else { 5476 write_lock(pa->pa_node_lock.inode_lock); 5477 ei = EXT4_I(pa->pa_inode); 5478 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5479 write_unlock(pa->pa_node_lock.inode_lock); 5480 } 5481 5482 list_del(&pa->u.pa_tmp_list); 5483 5484 if (pa->pa_type == MB_GROUP_PA) { 5485 ext4_mb_release_group_pa(&e4b, pa); 5486 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5487 } else { 5488 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5489 ext4_mb_pa_free(pa); 5490 } 5491 } 5492 5493 ext4_unlock_group(sb, group); 5494 ext4_mb_unload_buddy(&e4b); 5495 put_bh(bitmap_bh); 5496 out_dbg: 5497 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", 5498 free, group, grp->bb_free); 5499 return free; 5500 } 5501 5502 /* 5503 * releases all non-used preallocated blocks for given inode 5504 * 5505 * It's important to discard preallocations under i_data_sem 5506 * We don't want another block to be served from the prealloc 5507 * space when we are discarding the inode prealloc space. 5508 * 5509 * FIXME!! Make sure it is valid at all the call sites 5510 */ 5511 void ext4_discard_preallocations(struct inode *inode, unsigned int needed) 5512 { 5513 struct ext4_inode_info *ei = EXT4_I(inode); 5514 struct super_block *sb = inode->i_sb; 5515 struct buffer_head *bitmap_bh = NULL; 5516 struct ext4_prealloc_space *pa, *tmp; 5517 ext4_group_t group = 0; 5518 struct list_head list; 5519 struct ext4_buddy e4b; 5520 struct rb_node *iter; 5521 int err; 5522 5523 if (!S_ISREG(inode->i_mode)) { 5524 return; 5525 } 5526 5527 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) 5528 return; 5529 5530 mb_debug(sb, "discard preallocation for inode %lu\n", 5531 inode->i_ino); 5532 trace_ext4_discard_preallocations(inode, 5533 atomic_read(&ei->i_prealloc_active), needed); 5534 5535 INIT_LIST_HEAD(&list); 5536 5537 if (needed == 0) 5538 needed = UINT_MAX; 5539 5540 repeat: 5541 /* first, collect all pa's in the inode */ 5542 write_lock(&ei->i_prealloc_lock); 5543 for (iter = rb_first(&ei->i_prealloc_node); iter && needed; 5544 iter = rb_next(iter)) { 5545 pa = rb_entry(iter, struct ext4_prealloc_space, 5546 pa_node.inode_node); 5547 BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); 5548 5549 spin_lock(&pa->pa_lock); 5550 if (atomic_read(&pa->pa_count)) { 5551 /* this shouldn't happen often - nobody should 5552 * use preallocation while we're discarding it */ 5553 spin_unlock(&pa->pa_lock); 5554 write_unlock(&ei->i_prealloc_lock); 5555 ext4_msg(sb, KERN_ERR, 5556 "uh-oh! 
used pa while discarding"); 5557 WARN_ON(1); 5558 schedule_timeout_uninterruptible(HZ); 5559 goto repeat; 5560 5561 } 5562 if (pa->pa_deleted == 0) { 5563 ext4_mb_mark_pa_deleted(sb, pa); 5564 spin_unlock(&pa->pa_lock); 5565 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5566 list_add(&pa->u.pa_tmp_list, &list); 5567 needed--; 5568 continue; 5569 } 5570 5571 /* someone is deleting pa right now */ 5572 spin_unlock(&pa->pa_lock); 5573 write_unlock(&ei->i_prealloc_lock); 5574 5575 /* we have to wait here because pa_deleted 5576 * doesn't mean pa is already unlinked from 5577 * the list. as we might be called from 5578 * ->clear_inode() the inode will get freed 5579 * and concurrent thread which is unlinking 5580 * pa from inode's list may access already 5581 * freed memory, bad-bad-bad */ 5582 5583 /* XXX: if this happens too often, we can 5584 * add a flag to force wait only in case 5585 * of ->clear_inode(), but not in case of 5586 * regular truncate */ 5587 schedule_timeout_uninterruptible(HZ); 5588 goto repeat; 5589 } 5590 write_unlock(&ei->i_prealloc_lock); 5591 5592 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5593 BUG_ON(pa->pa_type != MB_INODE_PA); 5594 group = ext4_get_group_number(sb, pa->pa_pstart); 5595 5596 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5597 GFP_NOFS|__GFP_NOFAIL); 5598 if (err) { 5599 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5600 err, group); 5601 continue; 5602 } 5603 5604 bitmap_bh = ext4_read_block_bitmap(sb, group); 5605 if (IS_ERR(bitmap_bh)) { 5606 err = PTR_ERR(bitmap_bh); 5607 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", 5608 err, group); 5609 ext4_mb_unload_buddy(&e4b); 5610 continue; 5611 } 5612 5613 ext4_lock_group(sb, group); 5614 list_del(&pa->pa_group_list); 5615 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5616 ext4_unlock_group(sb, group); 5617 5618 ext4_mb_unload_buddy(&e4b); 5619 put_bh(bitmap_bh); 5620 5621 list_del(&pa->u.pa_tmp_list); 5622 ext4_mb_pa_free(pa); 5623 } 5624 } 5625 5626 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) 5627 { 5628 struct ext4_prealloc_space *pa; 5629 5630 BUG_ON(ext4_pspace_cachep == NULL); 5631 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); 5632 if (!pa) 5633 return -ENOMEM; 5634 atomic_set(&pa->pa_count, 1); 5635 ac->ac_pa = pa; 5636 return 0; 5637 } 5638 5639 static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) 5640 { 5641 struct ext4_prealloc_space *pa = ac->ac_pa; 5642 5643 BUG_ON(!pa); 5644 ac->ac_pa = NULL; 5645 WARN_ON(!atomic_dec_and_test(&pa->pa_count)); 5646 /* 5647 * current function is only called due to an error or due to 5648 * len of found blocks < len of requested blocks hence the PA has not 5649 * been added to grp->bb_prealloc_list. 
So we don't need to lock it. 5650 */ 5651 pa->pa_deleted = 1; 5652 ext4_mb_pa_free(pa); 5653 } 5654 5655 #ifdef CONFIG_EXT4_DEBUG 5656 static inline void ext4_mb_show_pa(struct super_block *sb) 5657 { 5658 ext4_group_t i, ngroups; 5659 5660 if (ext4_forced_shutdown(sb)) 5661 return; 5662 5663 ngroups = ext4_get_groups_count(sb); 5664 mb_debug(sb, "groups: "); 5665 for (i = 0; i < ngroups; i++) { 5666 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 5667 struct ext4_prealloc_space *pa; 5668 ext4_grpblk_t start; 5669 struct list_head *cur; 5670 5671 if (!grp) 5672 continue; 5673 ext4_lock_group(sb, i); 5674 list_for_each(cur, &grp->bb_prealloc_list) { 5675 pa = list_entry(cur, struct ext4_prealloc_space, 5676 pa_group_list); 5677 spin_lock(&pa->pa_lock); 5678 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 5679 NULL, &start); 5680 spin_unlock(&pa->pa_lock); 5681 mb_debug(sb, "PA:%u:%d:%d\n", i, start, 5682 pa->pa_len); 5683 } 5684 ext4_unlock_group(sb, i); 5685 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, 5686 grp->bb_fragments); 5687 } 5688 } 5689 5690 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5691 { 5692 struct super_block *sb = ac->ac_sb; 5693 5694 if (ext4_forced_shutdown(sb)) 5695 return; 5696 5697 mb_debug(sb, "Can't allocate:" 5698 " Allocation context details:"); 5699 mb_debug(sb, "status %u flags 0x%x", 5700 ac->ac_status, ac->ac_flags); 5701 mb_debug(sb, "orig %lu/%lu/%lu@%lu, " 5702 "goal %lu/%lu/%lu@%lu, " 5703 "best %lu/%lu/%lu@%lu cr %d", 5704 (unsigned long)ac->ac_o_ex.fe_group, 5705 (unsigned long)ac->ac_o_ex.fe_start, 5706 (unsigned long)ac->ac_o_ex.fe_len, 5707 (unsigned long)ac->ac_o_ex.fe_logical, 5708 (unsigned long)ac->ac_g_ex.fe_group, 5709 (unsigned long)ac->ac_g_ex.fe_start, 5710 (unsigned long)ac->ac_g_ex.fe_len, 5711 (unsigned long)ac->ac_g_ex.fe_logical, 5712 (unsigned long)ac->ac_b_ex.fe_group, 5713 (unsigned long)ac->ac_b_ex.fe_start, 5714 (unsigned long)ac->ac_b_ex.fe_len, 5715 (unsigned long)ac->ac_b_ex.fe_logical, 5716 (int)ac->ac_criteria); 5717 mb_debug(sb, "%u found", ac->ac_found); 5718 mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); 5719 if (ac->ac_pa) 5720 mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? 5721 "group pa" : "inode pa"); 5722 ext4_mb_show_pa(sb); 5723 } 5724 #else 5725 static inline void ext4_mb_show_pa(struct super_block *sb) 5726 { 5727 } 5728 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5729 { 5730 ext4_mb_show_pa(ac->ac_sb); 5731 } 5732 #endif 5733 5734 /* 5735 * We use locality group preallocation for small files.
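 * (Illustrative numbers, not from the source: with 4 KiB blocks and the
 * default s_mb_stream_request of 16 blocks, a file that would be 24 KiB
 * after the allocation amounts to 6 blocks; 6 <= 16, so the request
 * takes the group preallocation path.)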
The size of the 5736 * file is determined by the current size or the resulting size after 5737 * allocation, whichever is larger 5738 * 5739 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 5740 */ 5741 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 5742 { 5743 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5744 int bsbits = ac->ac_sb->s_blocksize_bits; 5745 loff_t size, isize; 5746 bool inode_pa_eligible, group_pa_eligible; 5747 5748 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 5749 return; 5750 5751 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 5752 return; 5753 5754 group_pa_eligible = sbi->s_mb_group_prealloc > 0; 5755 inode_pa_eligible = true; 5756 size = extent_logical_end(sbi, &ac->ac_o_ex); 5757 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 5758 >> bsbits; 5759 5760 /* No point in using inode preallocation for closed files */ 5761 if ((size == isize) && !ext4_fs_is_busy(sbi) && 5762 !inode_is_open_for_write(ac->ac_inode)) 5763 inode_pa_eligible = false; 5764 5765 size = max(size, isize); 5766 /* Don't use group allocation for large files */ 5767 if (size > sbi->s_mb_stream_request) 5768 group_pa_eligible = false; 5769 5770 if (!group_pa_eligible) { 5771 if (inode_pa_eligible) 5772 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5773 else 5774 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5775 return; 5776 } 5777 5778 BUG_ON(ac->ac_lg != NULL); 5779 /* 5780 * locality group prealloc space is per-CPU. The reason for having 5781 * a per-CPU locality group is to reduce contention between block 5782 * requests from multiple CPUs. 5783 */ 5784 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); 5785 5786 /* we're going to use group allocation */ 5787 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 5788 5789 /* serialize all allocations in the group */ 5790 mutex_lock(&ac->ac_lg->lg_mutex); 5791 } 5792 5793 static noinline_for_stack void 5794 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 5795 struct ext4_allocation_request *ar) 5796 { 5797 struct super_block *sb = ar->inode->i_sb; 5798 struct ext4_sb_info *sbi = EXT4_SB(sb); 5799 struct ext4_super_block *es = sbi->s_es; 5800 ext4_group_t group; 5801 unsigned int len; 5802 ext4_fsblk_t goal; 5803 ext4_grpblk_t block; 5804 5805 /* we can't allocate > group size */ 5806 len = ar->len; 5807 5808 /* just a dirty hack to filter too big requests */ 5809 if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) 5810 len = EXT4_CLUSTERS_PER_GROUP(sb); 5811 5812 /* start searching from the goal */ 5813 goal = ar->goal; 5814 if (goal < le32_to_cpu(es->s_first_data_block) || 5815 goal >= ext4_blocks_count(es)) 5816 goal = le32_to_cpu(es->s_first_data_block); 5817 ext4_get_group_no_and_offset(sb, goal, &group, &block); 5818 5819 /* set up allocation goals */ 5820 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); 5821 ac->ac_status = AC_STATUS_CONTINUE; 5822 ac->ac_sb = sb; 5823 ac->ac_inode = ar->inode; 5824 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; 5825 ac->ac_o_ex.fe_group = group; 5826 ac->ac_o_ex.fe_start = block; 5827 ac->ac_o_ex.fe_len = len; 5828 ac->ac_g_ex = ac->ac_o_ex; 5829 ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; 5830 ac->ac_flags = ar->flags; 5831 5832 /* we have to define context: we'll work with a file or 5833 * locality group.
this is a policy, actually */ 5834 ext4_mb_group_or_file(ac); 5835 5836 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " 5837 "left: %u/%u, right %u/%u to %swritable\n", 5838 (unsigned) ar->len, (unsigned) ar->logical, 5839 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 5840 (unsigned) ar->lleft, (unsigned) ar->pleft, 5841 (unsigned) ar->lright, (unsigned) ar->pright, 5842 inode_is_open_for_write(ar->inode) ? "" : "non-"); 5843 } 5844 5845 static noinline_for_stack void 5846 ext4_mb_discard_lg_preallocations(struct super_block *sb, 5847 struct ext4_locality_group *lg, 5848 int order, int total_entries) 5849 { 5850 ext4_group_t group = 0; 5851 struct ext4_buddy e4b; 5852 struct list_head discard_list; 5853 struct ext4_prealloc_space *pa, *tmp; 5854 5855 mb_debug(sb, "discard locality group preallocation\n"); 5856 5857 INIT_LIST_HEAD(&discard_list); 5858 5859 spin_lock(&lg->lg_prealloc_lock); 5860 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 5861 pa_node.lg_list, 5862 lockdep_is_held(&lg->lg_prealloc_lock)) { 5863 spin_lock(&pa->pa_lock); 5864 if (atomic_read(&pa->pa_count)) { 5865 /* 5866 * This is the pa that we just used 5867 * for block allocation. So don't 5868 * free it 5869 */ 5870 spin_unlock(&pa->pa_lock); 5871 continue; 5872 } 5873 if (pa->pa_deleted) { 5874 spin_unlock(&pa->pa_lock); 5875 continue; 5876 } 5877 /* only lg prealloc space */ 5878 BUG_ON(pa->pa_type != MB_GROUP_PA); 5879 5880 /* seems this one can be freed ... */ 5881 ext4_mb_mark_pa_deleted(sb, pa); 5882 spin_unlock(&pa->pa_lock); 5883 5884 list_del_rcu(&pa->pa_node.lg_list); 5885 list_add(&pa->u.pa_tmp_list, &discard_list); 5886 5887 total_entries--; 5888 if (total_entries <= 5) { 5889 /* 5890 * we want to keep only 5 entries 5891 * allowing it to grow to 8. This 5892 * makes sure we don't call discard 5893 * again soon for this list. 5894 */ 5895 break; 5896 } 5897 } 5898 spin_unlock(&lg->lg_prealloc_lock); 5899 5900 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 5901 int err; 5902 5903 group = ext4_get_group_number(sb, pa->pa_pstart); 5904 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5905 GFP_NOFS|__GFP_NOFAIL); 5906 if (err) { 5907 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5908 err, group); 5909 continue; 5910 } 5911 ext4_lock_group(sb, group); 5912 list_del(&pa->pa_group_list); 5913 ext4_mb_release_group_pa(&e4b, pa); 5914 ext4_unlock_group(sb, group); 5915 5916 ext4_mb_unload_buddy(&e4b); 5917 list_del(&pa->u.pa_tmp_list); 5918 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5919 } 5920 } 5921 5922 /* 5923 * We have incremented pa_count. So it cannot be freed at this 5924 * point. Also we hold lg_mutex. So no parallel allocation is 5925 * possible from this lg. That means pa_free cannot be updated. 5926 * 5927 * A parallel ext4_mb_discard_group_preallocations is possible, 5928 * which can cause the lg_prealloc_list to be updated.
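 */

/*
 * Illustrative sketch, not part of mballoc: how ext4_mb_add_n_trim()
 * below picks a list bucket for a pa. example_lg_order() is a
 * hypothetical helper mirroring the fls()-based computation; e.g.
 * pa_free = 300 gives fls(300) - 1 = 8, capped at PREALLOC_TB_SIZE - 1.
 */
static inline int example_lg_order(int pa_free)
{
	int order = fls(pa_free) - 1;

	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;	/* hash table is bounded */
	return order;
}

/*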
5929 */ 5930 5931 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 5932 { 5933 int order, added = 0, lg_prealloc_count = 1; 5934 struct super_block *sb = ac->ac_sb; 5935 struct ext4_locality_group *lg = ac->ac_lg; 5936 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 5937 5938 order = fls(pa->pa_free) - 1; 5939 if (order > PREALLOC_TB_SIZE - 1) 5940 /* The max size of hash table is PREALLOC_TB_SIZE */ 5941 order = PREALLOC_TB_SIZE - 1; 5942 /* Add the prealloc space to lg */ 5943 spin_lock(&lg->lg_prealloc_lock); 5944 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 5945 pa_node.lg_list, 5946 lockdep_is_held(&lg->lg_prealloc_lock)) { 5947 spin_lock(&tmp_pa->pa_lock); 5948 if (tmp_pa->pa_deleted) { 5949 spin_unlock(&tmp_pa->pa_lock); 5950 continue; 5951 } 5952 if (!added && pa->pa_free < tmp_pa->pa_free) { 5953 /* Add to the tail of the previous entry */ 5954 list_add_tail_rcu(&pa->pa_node.lg_list, 5955 &tmp_pa->pa_node.lg_list); 5956 added = 1; 5957 /* 5958 * we want to count the total 5959 * number of entries in the list 5960 */ 5961 } 5962 spin_unlock(&tmp_pa->pa_lock); 5963 lg_prealloc_count++; 5964 } 5965 if (!added) 5966 list_add_tail_rcu(&pa->pa_node.lg_list, 5967 &lg->lg_prealloc_list[order]); 5968 spin_unlock(&lg->lg_prealloc_lock); 5969 5970 /* Now trim the list to no more than 8 elements */ 5971 if (lg_prealloc_count > 8) 5972 ext4_mb_discard_lg_preallocations(sb, lg, 5973 order, lg_prealloc_count); 5974 } 5975 5976 /* 5977 * release all resources we used in the allocation 5978 */ 5979 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 5980 { 5981 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5982 struct ext4_prealloc_space *pa = ac->ac_pa; 5983 if (pa) { 5984 if (pa->pa_type == MB_GROUP_PA) { 5985 /* see comment in ext4_mb_use_group_pa() */ 5986 spin_lock(&pa->pa_lock); 5987 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5988 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5989 pa->pa_free -= ac->ac_b_ex.fe_len; 5990 pa->pa_len -= ac->ac_b_ex.fe_len; 5991 spin_unlock(&pa->pa_lock); 5992 5993 /* 5994 * We want to add the pa to the right bucket. 5995 * Remove it from the list and while adding 5996 * make sure the list to which we are adding 5997 * doesn't grow big.
5998 */ 5999 if (likely(pa->pa_free)) { 6000 spin_lock(pa->pa_node_lock.lg_lock); 6001 list_del_rcu(&pa->pa_node.lg_list); 6002 spin_unlock(pa->pa_node_lock.lg_lock); 6003 ext4_mb_add_n_trim(ac); 6004 } 6005 } 6006 6007 ext4_mb_put_pa(ac, ac->ac_sb, pa); 6008 } 6009 if (ac->ac_bitmap_page) 6010 put_page(ac->ac_bitmap_page); 6011 if (ac->ac_buddy_page) 6012 put_page(ac->ac_buddy_page); 6013 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 6014 mutex_unlock(&ac->ac_lg->lg_mutex); 6015 ext4_mb_collect_stats(ac); 6016 return 0; 6017 } 6018 6019 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 6020 { 6021 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 6022 int ret; 6023 int freed = 0, busy = 0; 6024 int retry = 0; 6025 6026 trace_ext4_mb_discard_preallocations(sb, needed); 6027 6028 if (needed == 0) 6029 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 6030 repeat: 6031 for (i = 0; i < ngroups && needed > 0; i++) { 6032 ret = ext4_mb_discard_group_preallocations(sb, i, &busy); 6033 freed += ret; 6034 needed -= ret; 6035 cond_resched(); 6036 } 6037 6038 if (needed > 0 && busy && ++retry < 3) { 6039 busy = 0; 6040 goto repeat; 6041 } 6042 6043 return freed; 6044 } 6045 6046 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, 6047 struct ext4_allocation_context *ac, u64 *seq) 6048 { 6049 int freed; 6050 u64 seq_retry = 0; 6051 bool ret = false; 6052 6053 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 6054 if (freed) { 6055 ret = true; 6056 goto out_dbg; 6057 } 6058 seq_retry = ext4_get_discard_pa_seq_sum(); 6059 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { 6060 ac->ac_flags |= EXT4_MB_STRICT_CHECK; 6061 *seq = seq_retry; 6062 ret = true; 6063 } 6064 6065 out_dbg: 6066 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); 6067 return ret; 6068 } 6069 6070 /* 6071 * Simple allocator for Ext4 fast commit replay path. It searches for blocks 6072 * linearly starting at the goal block and also excludes the blocks which 6073 * are going to be in use after fast commit replay. 
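 */

/*
 * Illustrative sketch, not part of mballoc: the exclusion-aware scan
 * that ext4_mb_new_blocks_simple() below performs on each group,
 * reduced to a single group and ignoring bigalloc cluster conversion.
 * example_scan_group() is a hypothetical helper.
 */
static ext4_grpblk_t example_scan_group(struct super_block *sb,
					ext4_group_t group, void *bitmap,
					ext4_grpblk_t max)
{
	ext4_grpblk_t i = 0;

	while (i < max) {
		/* first clear bit at or after i */
		i = mb_find_next_zero_bit(bitmap, max, i);
		if (i >= max)
			break;
		if (!ext4_fc_replay_check_excluded(sb,
				ext4_group_first_block_no(sb, group) + i))
			break;		/* free and not reserved by replay */
		i++;			/* excluded; keep scanning */
	}
	return i;			/* >= max means no usable block here */
}

/*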
6074 */ 6075 static ext4_fsblk_t 6076 ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) 6077 { 6078 struct buffer_head *bitmap_bh; 6079 struct super_block *sb = ar->inode->i_sb; 6080 struct ext4_sb_info *sbi = EXT4_SB(sb); 6081 ext4_group_t group, nr; 6082 ext4_grpblk_t blkoff; 6083 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 6084 ext4_grpblk_t i = 0; 6085 ext4_fsblk_t goal, block; 6086 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 6087 6088 goal = ar->goal; 6089 if (goal < le32_to_cpu(es->s_first_data_block) || 6090 goal >= ext4_blocks_count(es)) 6091 goal = le32_to_cpu(es->s_first_data_block); 6092 6093 ar->len = 0; 6094 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); 6095 for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { 6096 bitmap_bh = ext4_read_block_bitmap(sb, group); 6097 if (IS_ERR(bitmap_bh)) { 6098 *errp = PTR_ERR(bitmap_bh); 6099 pr_warn("Failed to read block bitmap\n"); 6100 return 0; 6101 } 6102 6103 while (1) { 6104 i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 6105 blkoff); 6106 if (i >= max) 6107 break; 6108 if (ext4_fc_replay_check_excluded(sb, 6109 ext4_group_first_block_no(sb, group) + 6110 EXT4_C2B(sbi, i))) { 6111 blkoff = i + 1; 6112 } else 6113 break; 6114 } 6115 brelse(bitmap_bh); 6116 if (i < max) 6117 break; 6118 6119 if (++group >= ext4_get_groups_count(sb)) 6120 group = 0; 6121 6122 blkoff = 0; 6123 } 6124 6125 if (i >= max) { 6126 *errp = -ENOSPC; 6127 return 0; 6128 } 6129 6130 block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); 6131 ext4_mb_mark_bb(sb, block, 1, 1); 6132 ar->len = 1; 6133 6134 return block; 6135 } 6136 6137 /* 6138 * Main entry point into mballoc to allocate blocks. 6139 * It tries to use preallocation first, then falls back 6140 * to the usual allocation 6141 */ 6142 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 6143 struct ext4_allocation_request *ar, int *errp) 6144 { 6145 struct ext4_allocation_context *ac = NULL; 6146 struct ext4_sb_info *sbi; 6147 struct super_block *sb; 6148 ext4_fsblk_t block = 0; 6149 unsigned int inquota = 0; 6150 unsigned int reserv_clstrs = 0; 6151 int retries = 0; 6152 u64 seq; 6153 6154 might_sleep(); 6155 sb = ar->inode->i_sb; 6156 sbi = EXT4_SB(sb); 6157 6158 trace_ext4_request_blocks(ar); 6159 if (sbi->s_mount_state & EXT4_FC_REPLAY) 6160 return ext4_mb_new_blocks_simple(ar, errp); 6161 6162 /* Allow the quota file to use the superuser reservation */ 6163 if (ext4_is_quota_file(ar->inode)) 6164 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 6165 6166 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { 6167 /* Without delayed allocation we need to verify 6168 * there are enough free blocks to do block allocation 6169 * and verify allocation doesn't exceed the quota limits.
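 * For example, a 64-cluster request that cannot be claimed may be
 * retried at 32, 16, 8, 4, 2 and 1 clusters before giving up
 * with -ENOSPC.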
6170 */ 6171 while (ar->len && 6172 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { 6173 6174 /* let others free the space */ 6175 cond_resched(); 6176 ar->len = ar->len >> 1; 6177 } 6178 if (!ar->len) { 6179 ext4_mb_show_pa(sb); 6180 *errp = -ENOSPC; 6181 return 0; 6182 } 6183 reserv_clstrs = ar->len; 6184 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 6185 dquot_alloc_block_nofail(ar->inode, 6186 EXT4_C2B(sbi, ar->len)); 6187 } else { 6188 while (ar->len && 6189 dquot_alloc_block(ar->inode, 6190 EXT4_C2B(sbi, ar->len))) { 6191 6192 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 6193 ar->len--; 6194 } 6195 } 6196 inquota = ar->len; 6197 if (ar->len == 0) { 6198 *errp = -EDQUOT; 6199 goto out; 6200 } 6201 } 6202 6203 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); 6204 if (!ac) { 6205 ar->len = 0; 6206 *errp = -ENOMEM; 6207 goto out; 6208 } 6209 6210 ext4_mb_initialize_context(ac, ar); 6211 6212 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 6213 seq = this_cpu_read(discard_pa_seq); 6214 if (!ext4_mb_use_preallocated(ac)) { 6215 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 6216 ext4_mb_normalize_request(ac, ar); 6217 6218 *errp = ext4_mb_pa_alloc(ac); 6219 if (*errp) 6220 goto errout; 6221 repeat: 6222 /* allocate space in core */ 6223 *errp = ext4_mb_regular_allocator(ac); 6224 /* 6225 * The pa allocated above is added to grp->bb_prealloc_list only 6226 * when we were able to allocate some blocks, i.e. when 6227 * ac->ac_status == AC_STATUS_FOUND. 6228 * An error from above means ac->ac_status != AC_STATUS_FOUND, 6229 * so we have to free this pa here itself. 6230 */ 6231 if (*errp) { 6232 ext4_mb_pa_put_free(ac); 6233 ext4_discard_allocated_blocks(ac); 6234 goto errout; 6235 } 6236 if (ac->ac_status == AC_STATUS_FOUND && 6237 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) 6238 ext4_mb_pa_put_free(ac); 6239 } 6240 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 6241 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 6242 if (*errp) { 6243 ext4_discard_allocated_blocks(ac); 6244 goto errout; 6245 } else { 6246 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 6247 ar->len = ac->ac_b_ex.fe_len; 6248 } 6249 } else { 6250 if (++retries < 3 && 6251 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) 6252 goto repeat; 6253 /* 6254 * If block allocation fails then the pa allocated above 6255 * needs to be freed here itself. 6256 */ 6257 ext4_mb_pa_put_free(ac); 6258 *errp = -ENOSPC; 6259 } 6260 6261 if (*errp) { 6262 errout: 6263 ac->ac_b_ex.fe_len = 0; 6264 ar->len = 0; 6265 ext4_mb_show_ac(ac); 6266 } 6267 ext4_mb_release_context(ac); 6268 kmem_cache_free(ext4_ac_cachep, ac); 6269 out: 6270 if (inquota && ar->len < inquota) 6271 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 6272 if (!ar->len) { 6273 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) 6274 /* release all the reserved blocks if non delalloc */ 6275 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 6276 reserv_clstrs); 6277 } 6278 6279 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 6280 6281 return block; 6282 } 6283 6284 /* 6285 * We can merge two free data extents only if the physical blocks 6286 * are contiguous, AND the extents were freed by the same transaction, 6287 * AND the blocks are associated with the same group.
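 */

/*
 * Illustrative sketch, not part of mballoc: the two merge cases handled
 * below, expressed on plain cluster ranges. example_merge() is a
 * hypothetical helper that folds range e into range n when they touch.
 */
static bool example_merge(ext4_grpblk_t *n_start, ext4_grpblk_t *n_count,
			  ext4_grpblk_t e_start, ext4_grpblk_t e_count)
{
	if (e_start + e_count == *n_start) {
		/* e ends exactly where n begins: grow n to the left */
		*n_start = e_start;
		*n_count += e_count;
		return true;
	}
	if (*n_start + *n_count == e_start) {
		/* n ends exactly where e begins: grow n to the right */
		*n_count += e_count;
		return true;
	}
	return false;	/* not adjacent, nothing to merge */
}

/*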
6288 */ 6289 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, 6290 struct ext4_free_data *entry, 6291 struct ext4_free_data *new_entry, 6292 struct rb_root *entry_rb_root) 6293 { 6294 if ((entry->efd_tid != new_entry->efd_tid) || 6295 (entry->efd_group != new_entry->efd_group)) 6296 return; 6297 if (entry->efd_start_cluster + entry->efd_count == 6298 new_entry->efd_start_cluster) { 6299 new_entry->efd_start_cluster = entry->efd_start_cluster; 6300 new_entry->efd_count += entry->efd_count; 6301 } else if (new_entry->efd_start_cluster + new_entry->efd_count == 6302 entry->efd_start_cluster) { 6303 new_entry->efd_count += entry->efd_count; 6304 } else 6305 return; 6306 spin_lock(&sbi->s_md_lock); 6307 list_del(&entry->efd_list); 6308 spin_unlock(&sbi->s_md_lock); 6309 rb_erase(&entry->efd_node, entry_rb_root); 6310 kmem_cache_free(ext4_free_data_cachep, entry); 6311 } 6312 6313 static noinline_for_stack void 6314 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 6315 struct ext4_free_data *new_entry) 6316 { 6317 ext4_group_t group = e4b->bd_group; 6318 ext4_grpblk_t cluster; 6319 ext4_grpblk_t clusters = new_entry->efd_count; 6320 struct ext4_free_data *entry; 6321 struct ext4_group_info *db = e4b->bd_info; 6322 struct super_block *sb = e4b->bd_sb; 6323 struct ext4_sb_info *sbi = EXT4_SB(sb); 6324 struct rb_node **n = &db->bb_free_root.rb_node, *node; 6325 struct rb_node *parent = NULL, *new_node; 6326 6327 BUG_ON(!ext4_handle_valid(handle)); 6328 BUG_ON(e4b->bd_bitmap_page == NULL); 6329 BUG_ON(e4b->bd_buddy_page == NULL); 6330 6331 new_node = &new_entry->efd_node; 6332 cluster = new_entry->efd_start_cluster; 6333 6334 if (!*n) { 6335 /* first free block extent. We need to 6336 * protect the buddy cache from being freed, 6337 * otherwise we'll refresh it from the 6338 * on-disk bitmap and lose not-yet-available 6339 * blocks */ 6340 get_page(e4b->bd_buddy_page); 6341 get_page(e4b->bd_bitmap_page); 6342 } 6343 while (*n) { 6344 parent = *n; 6345 entry = rb_entry(parent, struct ext4_free_data, efd_node); 6346 if (cluster < entry->efd_start_cluster) 6347 n = &(*n)->rb_left; 6348 else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) 6349 n = &(*n)->rb_right; 6350 else { 6351 ext4_grp_locked_error(sb, group, 0, 6352 ext4_group_first_block_no(sb, group) + 6353 EXT4_C2B(sbi, cluster), 6354 "Block already on to-be-freed list"); 6355 kmem_cache_free(ext4_free_data_cachep, new_entry); 6356 return; 6357 } 6358 } 6359 6360 rb_link_node(new_node, parent, n); 6361 rb_insert_color(new_node, &db->bb_free_root); 6362 6363 /* Now see if the extent can be merged to the left and right */ 6364 node = rb_prev(new_node); 6365 if (node) { 6366 entry = rb_entry(node, struct ext4_free_data, efd_node); 6367 ext4_try_merge_freed_extent(sbi, entry, new_entry, 6368 &(db->bb_free_root)); 6369 } 6370 6371 node = rb_next(new_node); 6372 if (node) { 6373 entry = rb_entry(node, struct ext4_free_data, efd_node); 6374 ext4_try_merge_freed_extent(sbi, entry, new_entry, 6375 &(db->bb_free_root)); 6376 } 6377 6378 spin_lock(&sbi->s_md_lock); 6379 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); 6380 sbi->s_mb_free_pending += clusters; 6381 spin_unlock(&sbi->s_md_lock); 6382 } 6383 6384 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, 6385 unsigned long count) 6386 { 6387 struct buffer_head *bitmap_bh; 6388 struct super_block *sb = inode->i_sb; 6389 struct ext4_group_desc *gdp; 6390 struct buffer_head *gdp_bh; 6391 ext4_group_t group; 6392 ext4_grpblk_t
blkoff; 6393 int already_freed = 0, err, i; 6394 6395 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 6396 bitmap_bh = ext4_read_block_bitmap(sb, group); 6397 if (IS_ERR(bitmap_bh)) { 6398 pr_warn("Failed to read block bitmap\n"); 6399 return; 6400 } 6401 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 6402 if (!gdp) 6403 goto err_out; 6404 6405 for (i = 0; i < count; i++) { 6406 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) 6407 already_freed++; 6408 } 6409 mb_clear_bits(bitmap_bh->b_data, blkoff, count); 6410 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 6411 if (err) 6412 goto err_out; 6413 ext4_free_group_clusters_set( 6414 sb, gdp, ext4_free_group_clusters(sb, gdp) + 6415 count - already_freed); 6416 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6417 ext4_group_desc_csum_set(sb, group, gdp); 6418 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 6419 sync_dirty_buffer(bitmap_bh); 6420 sync_dirty_buffer(gdp_bh); 6421 6422 err_out: 6423 brelse(bitmap_bh); 6424 } 6425 6426 /** 6427 * ext4_mb_clear_bb() -- helper function for freeing blocks. 6428 * Used by ext4_free_blocks() 6429 * @handle: handle for this transaction 6430 * @inode: inode 6431 * @block: starting physical block to be freed 6432 * @count: number of blocks to be freed 6433 * @flags: flags used by ext4_free_blocks 6434 */ 6435 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, 6436 ext4_fsblk_t block, unsigned long count, 6437 int flags) 6438 { 6439 struct buffer_head *bitmap_bh = NULL; 6440 struct super_block *sb = inode->i_sb; 6441 struct ext4_group_desc *gdp; 6442 struct ext4_group_info *grp; 6443 unsigned int overflow; 6444 ext4_grpblk_t bit; 6445 struct buffer_head *gd_bh; 6446 ext4_group_t block_group; 6447 struct ext4_sb_info *sbi; 6448 struct ext4_buddy e4b; 6449 unsigned int count_clusters; 6450 int err = 0; 6451 int ret; 6452 6453 sbi = EXT4_SB(sb); 6454 6455 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6456 !ext4_inode_block_valid(inode, block, count)) { 6457 ext4_error(sb, "Freeing blocks in system zone - " 6458 "Block = %llu, count = %lu", block, count); 6459 /* err = 0. ext4_std_error should be a no op */ 6460 goto error_return; 6461 } 6462 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6463 6464 do_more: 6465 overflow = 0; 6466 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6467 6468 grp = ext4_get_group_info(sb, block_group); 6469 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 6470 return; 6471 6472 /* 6473 * Check to see if we are freeing blocks across a group 6474 * boundary. 6475 */ 6476 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 6477 overflow = EXT4_C2B(sbi, bit) + count - 6478 EXT4_BLOCKS_PER_GROUP(sb); 6479 count -= overflow; 6480 /* The range changed so it's no longer validated */ 6481 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6482 } 6483 count_clusters = EXT4_NUM_B2C(sbi, count); 6484 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6485 if (IS_ERR(bitmap_bh)) { 6486 err = PTR_ERR(bitmap_bh); 6487 bitmap_bh = NULL; 6488 goto error_return; 6489 } 6490 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 6491 if (!gdp) { 6492 err = -EIO; 6493 goto error_return; 6494 } 6495 6496 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6497 !ext4_inode_block_valid(inode, block, count)) { 6498 ext4_error(sb, "Freeing blocks in system zone - " 6499 "Block = %llu, count = %lu", block, count); 6500 /* err = 0. 
ext4_std_error should be a no op */ 6501 goto error_return; 6502 } 6503 6504 BUFFER_TRACE(bitmap_bh, "getting write access"); 6505 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6506 EXT4_JTR_NONE); 6507 if (err) 6508 goto error_return; 6509 6510 /* 6511 * We are about to modify some metadata. Call the journal APIs 6512 * to unshare ->b_data if a currently-committing transaction is 6513 * using it 6514 */ 6515 BUFFER_TRACE(gd_bh, "get_write_access"); 6516 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6517 if (err) 6518 goto error_return; 6519 #ifdef AGGRESSIVE_CHECK 6520 { 6521 int i; 6522 for (i = 0; i < count_clusters; i++) 6523 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 6524 } 6525 #endif 6526 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 6527 6528 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 6529 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, 6530 GFP_NOFS|__GFP_NOFAIL); 6531 if (err) 6532 goto error_return; 6533 6534 /* 6535 * We need to make sure we don't reuse the freed block until after the 6536 * transaction is committed. We make an exception if the inode is to be 6537 * written in writeback mode since writeback mode has weak data 6538 * consistency guarantees. 6539 */ 6540 if (ext4_handle_valid(handle) && 6541 ((flags & EXT4_FREE_BLOCKS_METADATA) || 6542 !ext4_should_writeback_data(inode))) { 6543 struct ext4_free_data *new_entry; 6544 /* 6545 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 6546 * to fail. 6547 */ 6548 new_entry = kmem_cache_alloc(ext4_free_data_cachep, 6549 GFP_NOFS|__GFP_NOFAIL); 6550 new_entry->efd_start_cluster = bit; 6551 new_entry->efd_group = block_group; 6552 new_entry->efd_count = count_clusters; 6553 new_entry->efd_tid = handle->h_transaction->t_tid; 6554 6555 ext4_lock_group(sb, block_group); 6556 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6557 ext4_mb_free_metadata(handle, &e4b, new_entry); 6558 } else { 6559 /* need to update group_info->bb_free and bitmap 6560 * with group lock held. 
generate_buddy looks at 6560 * them with the group lock held 6561 */ 6562 6563 if (test_opt(sb, DISCARD)) { 6564 err = ext4_issue_discard(sb, block_group, bit, 6565 count_clusters, NULL); 6566 if (err && err != -EOPNOTSUPP) 6567 ext4_msg(sb, KERN_WARNING, "discard request in" 6568 " group:%u block:%d count:%lu failed" 6569 " with %d", block_group, bit, count, 6570 err); 6571 } else 6572 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 6573 6574 ext4_lock_group(sb, block_group); 6575 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6576 mb_free_blocks(inode, &e4b, bit, count_clusters); 6577 } 6578 6579 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 6580 ext4_free_group_clusters_set(sb, gdp, ret); 6581 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6582 ext4_group_desc_csum_set(sb, block_group, gdp); 6583 ext4_unlock_group(sb, block_group); 6584 6585 if (sbi->s_log_groups_per_flex) { 6586 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6587 atomic64_add(count_clusters, 6588 &sbi_array_rcu_deref(sbi, s_flex_groups, 6589 flex_group)->free_clusters); 6590 } 6591 6592 /* 6593 * on a bigalloc file system, defer the s_freeclusters_counter 6594 * update to the caller (ext4_remove_space and friends) so they 6595 * can determine if a cluster freed here should be rereserved 6596 */ 6597 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { 6598 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 6599 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 6600 percpu_counter_add(&sbi->s_freeclusters_counter, 6601 count_clusters); 6602 } 6603 6604 ext4_mb_unload_buddy(&e4b); 6605 6606 /* We dirtied the bitmap block */ 6607 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6608 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6609 6610 /* And the group descriptor block */ 6611 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6612 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6613 if (!err) 6614 err = ret; 6615 6616 if (overflow && !err) { 6617 block += count; 6618 count = overflow; 6619 put_bh(bitmap_bh); 6620 /* The range changed so it's no longer validated */ 6621 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6622 goto do_more; 6623 } 6624 error_return: 6625 brelse(bitmap_bh); 6626 ext4_std_error(sb, err); 6627 } 6628 6629 /** 6630 * ext4_free_blocks() -- Free given blocks and update quota 6631 * @handle: handle for this transaction 6632 * @inode: inode 6633 * @bh: optional buffer of the block to be freed 6634 * @block: starting physical block to be freed 6635 * @count: number of blocks to be freed 6636 * @flags: flags used by ext4_free_blocks 6637 */ 6638 void ext4_free_blocks(handle_t *handle, struct inode *inode, 6639 struct buffer_head *bh, ext4_fsblk_t block, 6640 unsigned long count, int flags) 6641 { 6642 struct super_block *sb = inode->i_sb; 6643 unsigned int overflow; 6644 struct ext4_sb_info *sbi; 6645 6646 sbi = EXT4_SB(sb); 6647 6648 if (bh) { 6649 if (block) 6650 BUG_ON(block != bh->b_blocknr); 6651 else 6652 block = bh->b_blocknr; 6653 } 6654 6655 if (sbi->s_mount_state & EXT4_FC_REPLAY) { 6656 ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); 6657 return; 6658 } 6659 6660 might_sleep(); 6661 6662 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6663 !ext4_inode_block_valid(inode, block, count)) { 6664 ext4_error(sb, "Freeing blocks not in datazone - " 6665 "block = %llu, count = %lu", block, count); 6666 return; 6667 } 6668 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6669 6670 ext4_debug("freeing block %llu\n", block); 6671 trace_ext4_free_blocks(inode, block,
count, flags); 6672 6673 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6674 BUG_ON(count > 1); 6675 6676 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 6677 inode, bh, block); 6678 } 6679 6680 /* 6681 * If the extent to be freed does not begin on a cluster 6682 * boundary, we need to deal with partial clusters at the 6683 * beginning and end of the extent. Normally we will free 6684 * blocks at the beginning or the end unless we are explicitly 6685 * requested to avoid doing so. 6686 */ 6687 overflow = EXT4_PBLK_COFF(sbi, block); 6688 if (overflow) { 6689 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { 6690 overflow = sbi->s_cluster_ratio - overflow; 6691 block += overflow; 6692 if (count > overflow) 6693 count -= overflow; 6694 else 6695 return; 6696 } else { 6697 block -= overflow; 6698 count += overflow; 6699 } 6700 /* The range changed so it's no longer validated */ 6701 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6702 } 6703 overflow = EXT4_LBLK_COFF(sbi, count); 6704 if (overflow) { 6705 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { 6706 if (count > overflow) 6707 count -= overflow; 6708 else 6709 return; 6710 } else 6711 count += sbi->s_cluster_ratio - overflow; 6712 /* The range changed so it's no longer validated */ 6713 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6714 } 6715 6716 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6717 int i; 6718 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; 6719 6720 for (i = 0; i < count; i++) { 6721 cond_resched(); 6722 if (is_metadata) 6723 bh = sb_find_get_block(inode->i_sb, block + i); 6724 ext4_forget(handle, is_metadata, inode, bh, block + i); 6725 } 6726 } 6727 6728 ext4_mb_clear_bb(handle, inode, block, count, flags); 6729 } 6730 6731 /** 6732 * ext4_group_add_blocks() -- Add given blocks to an existing group 6733 * @handle: handle to this transaction 6734 * @sb: super block 6735 * @block: start physical block to add to the block group 6736 * @count: number of blocks to add 6737 * 6738 * This marks the blocks as free in the bitmap and buddy. 6739 */ 6740 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 6741 ext4_fsblk_t block, unsigned long count) 6742 { 6743 struct buffer_head *bitmap_bh = NULL; 6744 struct buffer_head *gd_bh; 6745 ext4_group_t block_group; 6746 ext4_grpblk_t bit; 6747 unsigned int i; 6748 struct ext4_group_desc *desc; 6749 struct ext4_sb_info *sbi = EXT4_SB(sb); 6750 struct ext4_buddy e4b; 6751 int err = 0, ret, free_clusters_count; 6752 ext4_grpblk_t clusters_freed; 6753 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); 6754 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); 6755 unsigned long cluster_count = last_cluster - first_cluster + 1; 6756 6757 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 6758 6759 if (count == 0) 6760 return 0; 6761 6762 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6763 /* 6764 * Check to see if the range of blocks being added straddles a group 6765 * boundary.
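 * For example, with 32768 clusters per group, a request starting at
 * bit 32760 with cluster_count 16 would spill into the next group and
 * is rejected below with -EINVAL.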
6766 */ 6767 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { 6768 ext4_warning(sb, "too many blocks added to group %u", 6769 block_group); 6770 err = -EINVAL; 6771 goto error_return; 6772 } 6773 6774 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6775 if (IS_ERR(bitmap_bh)) { 6776 err = PTR_ERR(bitmap_bh); 6777 bitmap_bh = NULL; 6778 goto error_return; 6779 } 6780 6781 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 6782 if (!desc) { 6783 err = -EIO; 6784 goto error_return; 6785 } 6786 6787 if (!ext4_sb_block_valid(sb, NULL, block, count)) { 6788 ext4_error(sb, "Adding blocks in system zones - " 6789 "Block = %llu, count = %lu", 6790 block, count); 6791 err = -EINVAL; 6792 goto error_return; 6793 } 6794 6795 BUFFER_TRACE(bitmap_bh, "getting write access"); 6796 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6797 EXT4_JTR_NONE); 6798 if (err) 6799 goto error_return; 6800 6801 /* 6802 * We are about to modify some metadata. Call the journal APIs 6803 * to unshare ->b_data if a currently-committing transaction is 6804 * using it 6805 */ 6806 BUFFER_TRACE(gd_bh, "get_write_access"); 6807 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6808 if (err) 6809 goto error_return; 6810 6811 for (i = 0, clusters_freed = 0; i < cluster_count; i++) { 6812 BUFFER_TRACE(bitmap_bh, "clear bit"); 6813 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 6814 ext4_error(sb, "bit already cleared for block %llu", 6815 (ext4_fsblk_t)(block + i)); 6816 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 6817 } else { 6818 clusters_freed++; 6819 } 6820 } 6821 6822 err = ext4_mb_load_buddy(sb, block_group, &e4b); 6823 if (err) 6824 goto error_return; 6825 6826 /* 6827 * need to update group_info->bb_free and bitmap 6828 * with group lock held. generate_buddy looks at 6829 * them with the group lock held 6830 */ 6831 ext4_lock_group(sb, block_group); 6832 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); 6833 mb_free_blocks(NULL, &e4b, bit, cluster_count); 6834 free_clusters_count = clusters_freed + 6835 ext4_free_group_clusters(sb, desc); 6836 ext4_free_group_clusters_set(sb, desc, free_clusters_count); 6837 ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); 6838 ext4_group_desc_csum_set(sb, block_group, desc); 6839 ext4_unlock_group(sb, block_group); 6840 percpu_counter_add(&sbi->s_freeclusters_counter, 6841 clusters_freed); 6842 6843 if (sbi->s_log_groups_per_flex) { 6844 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6845 atomic64_add(clusters_freed, 6846 &sbi_array_rcu_deref(sbi, s_flex_groups, 6847 flex_group)->free_clusters); 6848 } 6849 6850 ext4_mb_unload_buddy(&e4b); 6851 6852 /* We dirtied the bitmap block */ 6853 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6854 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6855 6856 /* And the group descriptor block */ 6857 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6858 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6859 if (!err) 6860 err = ret; 6861 6862 error_return: 6863 brelse(bitmap_bh); 6864 ext4_std_error(sb, err); 6865 return err; 6866 } 6867 6868 /** 6869 * ext4_trim_extent -- function to TRIM one single free extent in the group 6870 * @sb: super block for the file system 6871 * @start: starting block of the free extent in the alloc. group 6872 * @count: number of blocks to TRIM 6873 * @e4b: ext4 buddy for the group 6874 * 6875 * Trim "count" blocks starting at "start" in the "group".
To assure that no 6876 * one will allocate those blocks, mark them as used in the buddy bitmap. This must 6877 * be called under the group lock. 6878 */ 6879 static int ext4_trim_extent(struct super_block *sb, 6880 int start, int count, struct ext4_buddy *e4b) 6881 __releases(bitlock) 6882 __acquires(bitlock) 6883 { 6884 struct ext4_free_extent ex; 6885 ext4_group_t group = e4b->bd_group; 6886 int ret = 0; 6887 6888 trace_ext4_trim_extent(sb, group, start, count); 6889 6890 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 6891 6892 ex.fe_start = start; 6893 ex.fe_group = group; 6894 ex.fe_len = count; 6895 6896 /* 6897 * Mark blocks used, so no one can reuse them while 6898 * being trimmed. 6899 */ 6900 mb_mark_used(e4b, &ex); 6901 ext4_unlock_group(sb, group); 6902 ret = ext4_issue_discard(sb, group, start, count, NULL); 6903 ext4_lock_group(sb, group); 6904 mb_free_blocks(NULL, e4b, start, ex.fe_len); 6905 return ret; 6906 } 6907 6908 static int ext4_try_to_trim_range(struct super_block *sb, 6909 struct ext4_buddy *e4b, ext4_grpblk_t start, 6910 ext4_grpblk_t max, ext4_grpblk_t minblocks) 6911 __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) 6912 __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) 6913 { 6914 ext4_grpblk_t next, count, free_count; 6915 void *bitmap; 6916 6917 bitmap = e4b->bd_bitmap; 6918 start = max(e4b->bd_info->bb_first_free, start); 6919 count = 0; 6920 free_count = 0; 6921 6922 while (start <= max) { 6923 start = mb_find_next_zero_bit(bitmap, max + 1, start); 6924 if (start > max) 6925 break; 6926 next = mb_find_next_bit(bitmap, max + 1, start); 6927 6928 if ((next - start) >= minblocks) { 6929 int ret = ext4_trim_extent(sb, start, next - start, e4b); 6930 6931 if (ret && ret != -EOPNOTSUPP) 6932 break; 6933 count += next - start; 6934 } 6935 free_count += next - start; 6936 start = next + 1; 6937 6938 if (fatal_signal_pending(current)) { 6939 count = -ERESTARTSYS; 6940 break; 6941 } 6942 6943 if (need_resched()) { 6944 ext4_unlock_group(sb, e4b->bd_group); 6945 cond_resched(); 6946 ext4_lock_group(sb, e4b->bd_group); 6947 } 6948 6949 if ((e4b->bd_info->bb_free - free_count) < minblocks) 6950 break; 6951 } 6952 6953 return count; 6954 } 6955 6956 /** 6957 * ext4_trim_all_free -- function to trim all free space in alloc. group 6958 * @sb: super block for file system 6959 * @group: group to be trimmed 6960 * @start: first group block to examine 6961 * @max: last group block to examine 6962 * @minblocks: minimum extent block count 6963 * @set_trimmed: set the trimmed flag if at least one block is trimmed 6964 * 6965 * ext4_trim_all_free walks through group's block bitmap searching for free 6966 * extents. When a free extent is found, it is marked as used in the group 6967 * buddy bitmap. Then a TRIM command is issued on this extent and the extent 6968 * is freed again in the group buddy bitmap.
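 */

/*
 * Illustrative sketch, not part of mballoc: the free-run walk done by
 * ext4_try_to_trim_range() above, reduced to counting the clusters that
 * would be trimmed. example_count_trimmable() is a hypothetical helper;
 * the real code also issues the discards, reschedules and honours
 * fatal signals.
 */
static ext4_grpblk_t example_count_trimmable(void *bitmap, ext4_grpblk_t max,
					     ext4_grpblk_t start,
					     ext4_grpblk_t minblocks)
{
	ext4_grpblk_t next, count = 0;

	while (start <= max) {
		/* [start, next) is the next run of free clusters */
		start = mb_find_next_zero_bit(bitmap, max + 1, start);
		if (start > max)
			break;
		next = mb_find_next_bit(bitmap, max + 1, start);
		if (next - start >= minblocks)
			count += next - start;	/* long enough to trim */
		start = next + 1;
	}
	return count;
}

/*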
6969 */ 6970 static ext4_grpblk_t 6971 ext4_trim_all_free(struct super_block *sb, ext4_group_t group, 6972 ext4_grpblk_t start, ext4_grpblk_t max, 6973 ext4_grpblk_t minblocks, bool set_trimmed) 6974 { 6975 struct ext4_buddy e4b; 6976 int ret; 6977 6978 trace_ext4_trim_all_free(sb, group, start, max); 6979 6980 ret = ext4_mb_load_buddy(sb, group, &e4b); 6981 if (ret) { 6982 ext4_warning(sb, "Error %d loading buddy information for %u", 6983 ret, group); 6984 return ret; 6985 } 6986 6987 ext4_lock_group(sb, group); 6988 6989 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || 6990 minblocks < EXT4_SB(sb)->s_last_trim_minblks) { 6991 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); 6992 if (ret >= 0 && set_trimmed) 6993 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 6994 } else { 6995 ret = 0; 6996 } 6997 6998 ext4_unlock_group(sb, group); 6999 ext4_mb_unload_buddy(&e4b); 7000 7001 ext4_debug("trimmed %d blocks in the group %d\n", 7002 ret, group); 7003 7004 return ret; 7005 } 7006 7007 /** 7008 * ext4_trim_fs() -- trim ioctl handle function 7009 * @sb: superblock for filesystem 7010 * @range: fstrim_range structure 7011 * 7012 * start: first byte to trim 7013 * len: number of bytes to trim from start 7014 * minlen: minimum extent length in bytes 7015 * ext4_trim_fs goes through all allocation groups containing bytes from 7016 * start to start+len. For each such group the ext4_trim_all_free function 7017 * is invoked to trim all free space. 7018 */ 7019 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 7020 { 7021 unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); 7022 struct ext4_group_info *grp; 7023 ext4_group_t group, first_group, last_group; 7024 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 7025 uint64_t start, end, minlen, trimmed = 0; 7026 ext4_fsblk_t first_data_blk = 7027 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 7028 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); 7029 bool whole_group, eof = false; 7030 int ret = 0; 7031 7032 start = range->start >> sb->s_blocksize_bits; 7033 end = start + (range->len >> sb->s_blocksize_bits) - 1; 7034 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 7035 range->minlen >> sb->s_blocksize_bits); 7036 7037 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || 7038 start >= max_blks || 7039 range->len < sb->s_blocksize) 7040 return -EINVAL; 7041 /* No point in trying to trim less than the discard granularity */ 7042 if (range->minlen < discard_granularity) { 7043 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 7044 discard_granularity >> sb->s_blocksize_bits); 7045 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) 7046 goto out; 7047 } 7048 if (end >= max_blks - 1) { 7049 end = max_blks - 1; 7050 eof = true; 7051 } 7052 if (end <= first_data_blk) 7053 goto out; 7054 if (start < first_data_blk) 7055 start = first_data_blk; 7056 7057 /* Determine first and last group to examine based on start and end */ 7058 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 7059 &first_group, &first_cluster); 7060 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, 7061 &last_group, &last_cluster); 7062 7063 /* end now represents the last cluster to discard in this group */ 7064 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 7065 whole_group = true; 7066 7067 for (group = first_group; group <= last_group; group++) { 7068 grp = ext4_get_group_info(sb, group); 7069 if (!grp) 7070 continue; 7071 /* We only do this if the grp has never been initialized */ 7072 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 7073 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 7074
if (ret) 7075 break; 7076 } 7077 7078 /* 7079 * For all the groups except the last one, last cluster will 7080 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to 7081 * change it for the last group, note that last_cluster is 7082 * already computed earlier by ext4_get_group_no_and_offset() 7083 */ 7084 if (group == last_group) { 7085 end = last_cluster; 7086 whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; 7087 } 7088 if (grp->bb_free >= minlen) { 7089 cnt = ext4_trim_all_free(sb, group, first_cluster, 7090 end, minlen, whole_group); 7091 if (cnt < 0) { 7092 ret = cnt; 7093 break; 7094 } 7095 trimmed += cnt; 7096 } 7097 7098 /* 7099 * For every group except the first one, we are sure 7100 * that the first cluster to discard will be cluster #0. 7101 */ 7102 first_cluster = 0; 7103 } 7104 7105 if (!ret) 7106 EXT4_SB(sb)->s_last_trim_minblks = minlen; 7107 7108 out: 7109 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 7110 return ret; 7111 } 7112 7113 /* Iterate all the free extents in the group. */ 7114 int 7115 ext4_mballoc_query_range( 7116 struct super_block *sb, 7117 ext4_group_t group, 7118 ext4_grpblk_t start, 7119 ext4_grpblk_t end, 7120 ext4_mballoc_query_range_fn formatter, 7121 void *priv) 7122 { 7123 void *bitmap; 7124 ext4_grpblk_t next; 7125 struct ext4_buddy e4b; 7126 int error; 7127 7128 error = ext4_mb_load_buddy(sb, group, &e4b); 7129 if (error) 7130 return error; 7131 bitmap = e4b.bd_bitmap; 7132 7133 ext4_lock_group(sb, group); 7134 7135 start = max(e4b.bd_info->bb_first_free, start); 7136 if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) 7137 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 7138 7139 while (start <= end) { 7140 start = mb_find_next_zero_bit(bitmap, end + 1, start); 7141 if (start > end) 7142 break; 7143 next = mb_find_next_bit(bitmap, end + 1, start); 7144 7145 ext4_unlock_group(sb, group); 7146 error = formatter(sb, group, start, next - start, priv); 7147 if (error) 7148 goto out_unload; 7149 ext4_lock_group(sb, group); 7150 7151 start = next + 1; 7152 } 7153 7154 ext4_unlock_group(sb, group); 7155 out_unload: 7156 ext4_mb_unload_buddy(&e4b); 7157 7158 return error; 7159 } 7160
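
/*
 * Illustrative usage sketch, not part of mballoc: a minimal
 * ext4_mballoc_query_range() formatter that totals the free clusters it
 * is shown. Both functions are hypothetical callers, not kernel APIs.
 */
static int example_sum_free(struct super_block *sb, ext4_group_t group,
			    ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	*(ext4_grpblk_t *)priv += len;	/* accumulate the extent length */
	return 0;			/* non-zero would stop the walk */
}

static int example_total_free_in_group(struct super_block *sb,
				       ext4_group_t group)
{
	ext4_grpblk_t total = 0;
	int error;

	error = ext4_mballoc_query_range(sb, group, 0,
					 EXT4_CLUSTERS_PER_GROUP(sb) - 1,
					 example_sum_free, &total);
	return error ? error : total;
}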