1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 4 * Written by Alex Tomas <alex@clusterfs.com> 5 */ 6 7 8 /* 9 * mballoc.c contains the multiblocks allocation routines 10 */ 11 12 #include "ext4_jbd2.h" 13 #include "mballoc.h" 14 #include <linux/log2.h> 15 #include <linux/module.h> 16 #include <linux/slab.h> 17 #include <linux/nospec.h> 18 #include <linux/backing-dev.h> 19 #include <trace/events/ext4.h> 20 21 /* 22 * MUSTDO: 23 * - test ext4_ext_search_left() and ext4_ext_search_right() 24 * - search for metadata in few groups 25 * 26 * TODO v4: 27 * - normalization should take into account whether file is still open 28 * - discard preallocations if no free space left (policy?) 29 * - don't normalize tails 30 * - quota 31 * - reservation for superuser 32 * 33 * TODO v3: 34 * - bitmap read-ahead (proposed by Oleg Drokin aka green) 35 * - track min/max extents in each group for better group selection 36 * - mb_mark_used() may allocate chunk right after splitting buddy 37 * - tree of groups sorted by number of free blocks 38 * - error handling 39 */ 40 41 /* 42 * The allocation request involve request for multiple number of blocks 43 * near to the goal(block) value specified. 44 * 45 * During initialization phase of the allocator we decide to use the 46 * group preallocation or inode preallocation depending on the size of 47 * the file. The size of the file could be the resulting file size we 48 * would have after allocation, or the current file size, which ever 49 * is larger. If the size is less than sbi->s_mb_stream_request we 50 * select to use the group preallocation. The default value of 51 * s_mb_stream_request is 16 blocks. This can also be tuned via 52 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in 53 * terms of number of blocks. 54 * 55 * The main motivation for having small file use group preallocation is to 56 * ensure that we have small files closer together on the disk. 57 * 58 * First stage the allocator looks at the inode prealloc list, 59 * ext4_inode_info->i_prealloc_list, which contains list of prealloc 60 * spaces for this particular inode. The inode prealloc space is 61 * represented as: 62 * 63 * pa_lstart -> the logical start block for this prealloc space 64 * pa_pstart -> the physical start block for this prealloc space 65 * pa_len -> length for this prealloc space (in clusters) 66 * pa_free -> free space available in this prealloc space (in clusters) 67 * 68 * The inode preallocation space is used looking at the _logical_ start 69 * block. If only the logical file block falls within the range of prealloc 70 * space we will consume the particular prealloc space. This makes sure that 71 * we have contiguous physical blocks representing the file blocks 72 * 73 * The important thing to be noted in case of inode prealloc space is that 74 * we don't modify the values associated to inode prealloc space except 75 * pa_free. 76 * 77 * If we are not able to find blocks in the inode prealloc space and if we 78 * have the group allocation flag set then we look at the locality group 79 * prealloc space. These are per CPU prealloc list represented as 80 * 81 * ext4_sb_info.s_locality_groups[smp_processor_id()] 82 * 83 * The reason for having a per cpu locality group is to reduce the contention 84 * between CPUs. It is possible to get scheduled at this point. 
85 * 86 * The locality group prealloc space is used looking at whether we have 87 * enough free space (pa_free) within the prealloc space. 88 * 89 * If we can't allocate blocks via inode prealloc or/and locality group 90 * prealloc then we look at the buddy cache. The buddy cache is represented 91 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets 92 * mapped to the buddy and bitmap information regarding different 93 * groups. The buddy information is attached to buddy cache inode so that 94 * we can access them through the page cache. The information regarding 95 * each group is loaded via ext4_mb_load_buddy. The information involve 96 * block bitmap and buddy information. The information are stored in the 97 * inode as: 98 * 99 * { page } 100 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 101 * 102 * 103 * one block each for bitmap and buddy information. So for each group we 104 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / 105 * blocksize) blocks. So it can have information regarding groups_per_page 106 * which is blocks_per_page/2 107 * 108 * The buddy cache inode is not stored on disk. The inode is thrown 109 * away when the filesystem is unmounted. 110 * 111 * We look for count number of blocks in the buddy cache. If we were able 112 * to locate that many free blocks we return with additional information 113 * regarding rest of the contiguous physical block available 114 * 115 * Before allocating blocks via buddy cache we normalize the request 116 * blocks. This ensure we ask for more blocks that we needed. The extra 117 * blocks that we get after allocation is added to the respective prealloc 118 * list. In case of inode preallocation we follow a list of heuristics 119 * based on file size. This can be found in ext4_mb_normalize_request. If 120 * we are doing a group prealloc we try to normalize the request to 121 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is 122 * dependent on the cluster size; for non-bigalloc file systems, it is 123 * 512 blocks. This can be tuned via 124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in 125 * terms of number of blocks. If we have mounted the file system with -O 126 * stripe=<value> option the group prealloc request is normalized to the 127 * smallest multiple of the stripe value (sbi->s_stripe) which is 128 * greater than the default mb_group_prealloc. 129 * 130 * If "mb_optimize_scan" mount option is set, we maintain in memory group info 131 * structures in two data structures: 132 * 133 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) 134 * 135 * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) 136 * 137 * This is an array of lists where the index in the array represents the 138 * largest free order in the buddy bitmap of the participating group infos of 139 * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total 140 * number of buddy bitmap orders possible) number of lists. Group-infos are 141 * placed in appropriate lists. 142 * 143 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) 144 * 145 * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) 146 * 147 * This is an array of lists where in the i-th list there are groups with 148 * average fragment size >= 2^i and < 2^(i+1). The average fragment size 149 * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. 
150 * Note that we don't bother with a special list for completely empty groups 151 * so we only have MB_NUM_ORDERS(sb) lists. 152 * 153 * When "mb_optimize_scan" mount option is set, mballoc consults the above data 154 * structures to decide the order in which groups are to be traversed for 155 * fulfilling an allocation request. 156 * 157 * At CR = 0, we look for groups which have the largest_free_order >= the order 158 * of the request. We directly look at the largest free order list in the data 159 * structure (1) above where largest_free_order = order of the request. If that 160 * list is empty, we look at remaining list in the increasing order of 161 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. 162 * 163 * At CR = 1, we only consider groups where average fragment size > request 164 * size. So, we lookup a group which has average fragment size just above or 165 * equal to request size using our average fragment size group lists (data 166 * structure 2) in O(1) time. 167 * 168 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 169 * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 170 * 171 * The regular allocator (using the buddy cache) supports a few tunables. 172 * 173 * /sys/fs/ext4/<partition>/mb_min_to_scan 174 * /sys/fs/ext4/<partition>/mb_max_to_scan 175 * /sys/fs/ext4/<partition>/mb_order2_req 176 * /sys/fs/ext4/<partition>/mb_linear_limit 177 * 178 * The regular allocator uses buddy scan only if the request len is power of 179 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 180 * value of s_mb_order2_reqs can be tuned via 181 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 182 * stripe size (sbi->s_stripe), we try to search for contiguous block in 183 * stripe size. This should result in better allocation on RAID setups. If 184 * not, we search in the specific group using bitmap for best extents. The 185 * tunable min_to_scan and max_to_scan control the behaviour here. 186 * min_to_scan indicate how long the mballoc __must__ look for a best 187 * extent and max_to_scan indicates how long the mballoc __can__ look for a 188 * best extent in the found extents. Searching for the blocks starts with 189 * the group specified as the goal value in allocation context via 190 * ac_g_ex. Each group is first checked based on the criteria whether it 191 * can be used for allocation. ext4_mb_good_group explains how the groups are 192 * checked. 193 * 194 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not 195 * get traversed linearly. That may result in subsequent allocations being not 196 * close to each other. And so, the underlying device may get filled up in a 197 * non-linear fashion. While that may not matter on non-rotational devices, for 198 * rotational devices that may result in higher seek times. "mb_linear_limit" 199 * tells mballoc how many groups mballoc should search linearly before 200 * performing consulting above data structures for more efficient lookups. For 201 * non rotational devices, this value defaults to 0 and for rotational devices 202 * this is set to MB_DEFAULT_LINEAR_LIMIT. 203 * 204 * Both the prealloc space are getting populated as above. So for the first 205 * request we will hit the buddy cache which will result in this prealloc 206 * space getting filled. The prealloc space is then later used for the 207 * subsequent request. 
208 */ 209 210 /* 211 * mballoc operates on the following data: 212 * - on-disk bitmap 213 * - in-core buddy (actually includes buddy and bitmap) 214 * - preallocation descriptors (PAs) 215 * 216 * there are two types of preallocations: 217 * - inode 218 * assiged to specific inode and can be used for this inode only. 219 * it describes part of inode's space preallocated to specific 220 * physical blocks. any block from that preallocated can be used 221 * independent. the descriptor just tracks number of blocks left 222 * unused. so, before taking some block from descriptor, one must 223 * make sure corresponded logical block isn't allocated yet. this 224 * also means that freeing any block within descriptor's range 225 * must discard all preallocated blocks. 226 * - locality group 227 * assigned to specific locality group which does not translate to 228 * permanent set of inodes: inode can join and leave group. space 229 * from this type of preallocation can be used for any inode. thus 230 * it's consumed from the beginning to the end. 231 * 232 * relation between them can be expressed as: 233 * in-core buddy = on-disk bitmap + preallocation descriptors 234 * 235 * this mean blocks mballoc considers used are: 236 * - allocated blocks (persistent) 237 * - preallocated blocks (non-persistent) 238 * 239 * consistency in mballoc world means that at any time a block is either 240 * free or used in ALL structures. notice: "any time" should not be read 241 * literally -- time is discrete and delimited by locks. 242 * 243 * to keep it simple, we don't use block numbers, instead we count number of 244 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. 245 * 246 * all operations can be expressed as: 247 * - init buddy: buddy = on-disk + PAs 248 * - new PA: buddy += N; PA = N 249 * - use inode PA: on-disk += N; PA -= N 250 * - discard inode PA buddy -= on-disk - PA; PA = 0 251 * - use locality group PA on-disk += N; PA -= N 252 * - discard locality group PA buddy -= PA; PA = 0 253 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap 254 * is used in real operation because we can't know actual used 255 * bits from PA, only from on-disk bitmap 256 * 257 * if we follow this strict logic, then all operations above should be atomic. 258 * given some of them can block, we'd have to use something like semaphores 259 * killing performance on high-end SMP hardware. let's try to relax it using 260 * the following knowledge: 261 * 1) if buddy is referenced, it's already initialized 262 * 2) while block is used in buddy and the buddy is referenced, 263 * nobody can re-allocate that block 264 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has 265 * bit set and PA claims same block, it's OK. IOW, one can set bit in 266 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded 267 * block 268 * 269 * so, now we're building a concurrency table: 270 * - init buddy vs. 
271 * - new PA 272 * blocks for PA are allocated in the buddy, buddy must be referenced 273 * until PA is linked to allocation group to avoid concurrent buddy init 274 * - use inode PA 275 * we need to make sure that either on-disk bitmap or PA has uptodate data 276 * given (3) we care that PA-=N operation doesn't interfere with init 277 * - discard inode PA 278 * the simplest way would be to have buddy initialized by the discard 279 * - use locality group PA 280 * again PA-=N must be serialized with init 281 * - discard locality group PA 282 * the simplest way would be to have buddy initialized by the discard 283 * - new PA vs. 284 * - use inode PA 285 * i_data_sem serializes them 286 * - discard inode PA 287 * discard process must wait until PA isn't used by another process 288 * - use locality group PA 289 * some mutex should serialize them 290 * - discard locality group PA 291 * discard process must wait until PA isn't used by another process 292 * - use inode PA 293 * - use inode PA 294 * i_data_sem or another mutex should serializes them 295 * - discard inode PA 296 * discard process must wait until PA isn't used by another process 297 * - use locality group PA 298 * nothing wrong here -- they're different PAs covering different blocks 299 * - discard locality group PA 300 * discard process must wait until PA isn't used by another process 301 * 302 * now we're ready to make few consequences: 303 * - PA is referenced and while it is no discard is possible 304 * - PA is referenced until block isn't marked in on-disk bitmap 305 * - PA changes only after on-disk bitmap 306 * - discard must not compete with init. either init is done before 307 * any discard or they're serialized somehow 308 * - buddy init as sum of on-disk bitmap and PAs is done atomically 309 * 310 * a special case when we've used PA to emptiness. 
no need to modify buddy 311 * in this case, but we should care about concurrent init 312 * 313 */ 314 315 /* 316 * Logic in few words: 317 * 318 * - allocation: 319 * load group 320 * find blocks 321 * mark bits in on-disk bitmap 322 * release group 323 * 324 * - use preallocation: 325 * find proper PA (per-inode or group) 326 * load group 327 * mark bits in on-disk bitmap 328 * release group 329 * release PA 330 * 331 * - free: 332 * load group 333 * mark bits in on-disk bitmap 334 * release group 335 * 336 * - discard preallocations in group: 337 * mark PAs deleted 338 * move them onto local list 339 * load on-disk bitmap 340 * load group 341 * remove PA from object (inode or locality group) 342 * mark free blocks in-core 343 * 344 * - discard inode's preallocations: 345 */ 346 347 /* 348 * Locking rules 349 * 350 * Locks: 351 * - bitlock on a group (group) 352 * - object (inode/locality) (object) 353 * - per-pa lock (pa) 354 * - cr0 lists lock (cr0) 355 * - cr1 tree lock (cr1) 356 * 357 * Paths: 358 * - new pa 359 * object 360 * group 361 * 362 * - find and use pa: 363 * pa 364 * 365 * - release consumed pa: 366 * pa 367 * group 368 * object 369 * 370 * - generate in-core bitmap: 371 * group 372 * pa 373 * 374 * - discard all for given object (inode, locality group): 375 * object 376 * pa 377 * group 378 * 379 * - discard all for given group: 380 * group 381 * pa 382 * group 383 * object 384 * 385 * - allocation path (ext4_mb_regular_allocator) 386 * group 387 * cr0/cr1 388 */ 389 static struct kmem_cache *ext4_pspace_cachep; 390 static struct kmem_cache *ext4_ac_cachep; 391 static struct kmem_cache *ext4_free_data_cachep; 392 393 /* We create slab caches for groupinfo data structures based on the 394 * superblock block size. There will be one per mounted filesystem for 395 * each unique s_blocksize_bits */ 396 #define NR_GRPINFO_CACHES 8 397 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 398 399 static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { 400 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", 401 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", 402 "ext4_groupinfo_64k", "ext4_groupinfo_128k" 403 }; 404 405 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 406 ext4_group_t group); 407 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 408 ext4_group_t group); 409 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 410 411 static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 412 ext4_group_t group, int cr); 413 414 static int ext4_try_to_trim_range(struct super_block *sb, 415 struct ext4_buddy *e4b, ext4_grpblk_t start, 416 ext4_grpblk_t max, ext4_grpblk_t minblocks); 417 418 /* 419 * The algorithm using this percpu seq counter goes below: 420 * 1. We sample the percpu discard_pa_seq counter before trying for block 421 * allocation in ext4_mb_new_blocks(). 422 * 2. We increment this percpu discard_pa_seq counter when we either allocate 423 * or free these blocks i.e. while marking those blocks as used/free in 424 * mb_mark_used()/mb_free_blocks(). 425 * 3. We also increment this percpu seq counter when we successfully identify 426 * that the bb_prealloc_list is not empty and hence proceed for discarding 427 * of those PAs inside ext4_mb_discard_group_preallocations(). 
428 * 429 * Now to make sure that the regular fast path of block allocation is not 430 * affected, as a small optimization we only sample the percpu seq counter 431 * on that cpu. Only when the block allocation fails and when freed blocks 432 * found were 0, that is when we sample percpu seq counter for all cpus using 433 * below function ext4_get_discard_pa_seq_sum(). This happens after making 434 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. 435 */ 436 static DEFINE_PER_CPU(u64, discard_pa_seq); 437 static inline u64 ext4_get_discard_pa_seq_sum(void) 438 { 439 int __cpu; 440 u64 __seq = 0; 441 442 for_each_possible_cpu(__cpu) 443 __seq += per_cpu(discard_pa_seq, __cpu); 444 return __seq; 445 } 446 447 static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 448 { 449 #if BITS_PER_LONG == 64 450 *bit += ((unsigned long) addr & 7UL) << 3; 451 addr = (void *) ((unsigned long) addr & ~7UL); 452 #elif BITS_PER_LONG == 32 453 *bit += ((unsigned long) addr & 3UL) << 3; 454 addr = (void *) ((unsigned long) addr & ~3UL); 455 #else 456 #error "how many bits you are?!" 457 #endif 458 return addr; 459 } 460 461 static inline int mb_test_bit(int bit, void *addr) 462 { 463 /* 464 * ext4_test_bit on architecture like powerpc 465 * needs unsigned long aligned address 466 */ 467 addr = mb_correct_addr_and_bit(&bit, addr); 468 return ext4_test_bit(bit, addr); 469 } 470 471 static inline void mb_set_bit(int bit, void *addr) 472 { 473 addr = mb_correct_addr_and_bit(&bit, addr); 474 ext4_set_bit(bit, addr); 475 } 476 477 static inline void mb_clear_bit(int bit, void *addr) 478 { 479 addr = mb_correct_addr_and_bit(&bit, addr); 480 ext4_clear_bit(bit, addr); 481 } 482 483 static inline int mb_test_and_clear_bit(int bit, void *addr) 484 { 485 addr = mb_correct_addr_and_bit(&bit, addr); 486 return ext4_test_and_clear_bit(bit, addr); 487 } 488 489 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 490 { 491 int fix = 0, ret, tmpmax; 492 addr = mb_correct_addr_and_bit(&fix, addr); 493 tmpmax = max + fix; 494 start += fix; 495 496 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 497 if (ret > max) 498 return max; 499 return ret; 500 } 501 502 static inline int mb_find_next_bit(void *addr, int max, int start) 503 { 504 int fix = 0, ret, tmpmax; 505 addr = mb_correct_addr_and_bit(&fix, addr); 506 tmpmax = max + fix; 507 start += fix; 508 509 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 510 if (ret > max) 511 return max; 512 return ret; 513 } 514 515 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 516 { 517 char *bb; 518 519 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 520 BUG_ON(max == NULL); 521 522 if (order > e4b->bd_blkbits + 1) { 523 *max = 0; 524 return NULL; 525 } 526 527 /* at order 0 we see each particular block */ 528 if (order == 0) { 529 *max = 1 << (e4b->bd_blkbits + 3); 530 return e4b->bd_bitmap; 531 } 532 533 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 534 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 535 536 return bb; 537 } 538 539 #ifdef DOUBLE_CHECK 540 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 541 int first, int count) 542 { 543 int i; 544 struct super_block *sb = e4b->bd_sb; 545 546 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 547 return; 548 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 549 for (i = 0; i < count; i++) { 550 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 551 ext4_fsblk_t blocknr; 552 553 blocknr = 
ext4_group_first_block_no(sb, e4b->bd_group); 554 blocknr += EXT4_C2B(EXT4_SB(sb), first + i); 555 ext4_grp_locked_error(sb, e4b->bd_group, 556 inode ? inode->i_ino : 0, 557 blocknr, 558 "freeing block already freed " 559 "(bit %u)", 560 first + i); 561 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 562 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 563 } 564 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 565 } 566 } 567 568 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 569 { 570 int i; 571 572 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 573 return; 574 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 575 for (i = 0; i < count; i++) { 576 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 577 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 578 } 579 } 580 581 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 582 { 583 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 584 return; 585 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 586 unsigned char *b1, *b2; 587 int i; 588 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 589 b2 = (unsigned char *) bitmap; 590 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 591 if (b1[i] != b2[i]) { 592 ext4_msg(e4b->bd_sb, KERN_ERR, 593 "corruption in group %u " 594 "at byte %u(%u): %x in copy != %x " 595 "on disk/prealloc", 596 e4b->bd_group, i, i * 8, b1[i], b2[i]); 597 BUG(); 598 } 599 } 600 } 601 } 602 603 static void mb_group_bb_bitmap_alloc(struct super_block *sb, 604 struct ext4_group_info *grp, ext4_group_t group) 605 { 606 struct buffer_head *bh; 607 608 grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); 609 if (!grp->bb_bitmap) 610 return; 611 612 bh = ext4_read_block_bitmap(sb, group); 613 if (IS_ERR_OR_NULL(bh)) { 614 kfree(grp->bb_bitmap); 615 grp->bb_bitmap = NULL; 616 return; 617 } 618 619 memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); 620 put_bh(bh); 621 } 622 623 static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 624 { 625 kfree(grp->bb_bitmap); 626 } 627 628 #else 629 static inline void mb_free_blocks_double(struct inode *inode, 630 struct ext4_buddy *e4b, int first, int count) 631 { 632 return; 633 } 634 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 635 int first, int count) 636 { 637 return; 638 } 639 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 640 { 641 return; 642 } 643 644 static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, 645 struct ext4_group_info *grp, ext4_group_t group) 646 { 647 return; 648 } 649 650 static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 651 { 652 return; 653 } 654 #endif 655 656 #ifdef AGGRESSIVE_CHECK 657 658 #define MB_CHECK_ASSERT(assert) \ 659 do { \ 660 if (!(assert)) { \ 661 printk(KERN_EMERG \ 662 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 663 function, file, line, # assert); \ 664 BUG(); \ 665 } \ 666 } while (0) 667 668 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 669 const char *function, int line) 670 { 671 struct super_block *sb = e4b->bd_sb; 672 int order = e4b->bd_blkbits + 1; 673 int max; 674 int max2; 675 int i; 676 int j; 677 int k; 678 int count; 679 struct ext4_group_info *grp; 680 int fragments = 0; 681 int fstart; 682 struct list_head *cur; 683 void *buddy; 684 void *buddy2; 685 686 if (e4b->bd_info->bb_check_counter++ % 10) 687 return 0; 688 689 while (order > 1) { 690 buddy = mb_find_buddy(e4b, order, &max); 691 MB_CHECK_ASSERT(buddy); 692 buddy2 = mb_find_buddy(e4b, order - 
1, &max2); 693 MB_CHECK_ASSERT(buddy2); 694 MB_CHECK_ASSERT(buddy != buddy2); 695 MB_CHECK_ASSERT(max * 2 == max2); 696 697 count = 0; 698 for (i = 0; i < max; i++) { 699 700 if (mb_test_bit(i, buddy)) { 701 /* only single bit in buddy2 may be 0 */ 702 if (!mb_test_bit(i << 1, buddy2)) { 703 MB_CHECK_ASSERT( 704 mb_test_bit((i<<1)+1, buddy2)); 705 } 706 continue; 707 } 708 709 /* both bits in buddy2 must be 1 */ 710 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 711 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 712 713 for (j = 0; j < (1 << order); j++) { 714 k = (i * (1 << order)) + j; 715 MB_CHECK_ASSERT( 716 !mb_test_bit(k, e4b->bd_bitmap)); 717 } 718 count++; 719 } 720 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 721 order--; 722 } 723 724 fstart = -1; 725 buddy = mb_find_buddy(e4b, 0, &max); 726 for (i = 0; i < max; i++) { 727 if (!mb_test_bit(i, buddy)) { 728 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 729 if (fstart == -1) { 730 fragments++; 731 fstart = i; 732 } 733 continue; 734 } 735 fstart = -1; 736 /* check used bits only */ 737 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 738 buddy2 = mb_find_buddy(e4b, j, &max2); 739 k = i >> j; 740 MB_CHECK_ASSERT(k < max2); 741 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 742 } 743 } 744 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 745 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 746 747 grp = ext4_get_group_info(sb, e4b->bd_group); 748 if (!grp) 749 return NULL; 750 list_for_each(cur, &grp->bb_prealloc_list) { 751 ext4_group_t groupnr; 752 struct ext4_prealloc_space *pa; 753 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 754 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 755 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 756 for (i = 0; i < pa->pa_len; i++) 757 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 758 } 759 return 0; 760 } 761 #undef MB_CHECK_ASSERT 762 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 763 __FILE__, __func__, __LINE__) 764 #else 765 #define mb_check_buddy(e4b) 766 #endif 767 768 /* 769 * Divide blocks started from @first with length @len into 770 * smaller chunks with power of 2 blocks. 771 * Clear the bits in bitmap which the blocks of the chunk(s) covered, 772 * then increase bb_counters[] for corresponded chunk size. 773 */ 774 static void ext4_mb_mark_free_simple(struct super_block *sb, 775 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 776 struct ext4_group_info *grp) 777 { 778 struct ext4_sb_info *sbi = EXT4_SB(sb); 779 ext4_grpblk_t min; 780 ext4_grpblk_t max; 781 ext4_grpblk_t chunk; 782 unsigned int border; 783 784 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 785 786 border = 2 << sb->s_blocksize_bits; 787 788 while (len > 0) { 789 /* find how many blocks can be covered since this position */ 790 max = ffs(first | border) - 1; 791 792 /* find how many blocks of power 2 we need to mark */ 793 min = fls(len) - 1; 794 795 if (max < min) 796 min = max; 797 chunk = 1 << min; 798 799 /* mark multiblock chunks only */ 800 grp->bb_counters[min]++; 801 if (min > 0) 802 mb_clear_bit(first >> min, 803 buddy + sbi->s_mb_offsets[min]); 804 805 len -= chunk; 806 first += chunk; 807 } 808 } 809 810 static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) 811 { 812 int order; 813 814 /* 815 * We don't bother with a special lists groups with only 1 block free 816 * extents and for completely empty groups. 
817 */ 818 order = fls(len) - 2; 819 if (order < 0) 820 return 0; 821 if (order == MB_NUM_ORDERS(sb)) 822 order--; 823 return order; 824 } 825 826 /* Move group to appropriate avg_fragment_size list */ 827 static void 828 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 829 { 830 struct ext4_sb_info *sbi = EXT4_SB(sb); 831 int new_order; 832 833 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 834 return; 835 836 new_order = mb_avg_fragment_size_order(sb, 837 grp->bb_free / grp->bb_fragments); 838 if (new_order == grp->bb_avg_fragment_size_order) 839 return; 840 841 if (grp->bb_avg_fragment_size_order != -1) { 842 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 843 grp->bb_avg_fragment_size_order]); 844 list_del(&grp->bb_avg_fragment_size_node); 845 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 846 grp->bb_avg_fragment_size_order]); 847 } 848 grp->bb_avg_fragment_size_order = new_order; 849 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 850 grp->bb_avg_fragment_size_order]); 851 list_add_tail(&grp->bb_avg_fragment_size_node, 852 &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); 853 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 854 grp->bb_avg_fragment_size_order]); 855 } 856 857 /* 858 * Choose next group by traversing largest_free_order lists. Updates *new_cr if 859 * cr level needs an update. 860 */ 861 static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, 862 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 863 { 864 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 865 struct ext4_group_info *iter, *grp; 866 int i; 867 868 if (ac->ac_status == AC_STATUS_FOUND) 869 return; 870 871 if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) 872 atomic_inc(&sbi->s_bal_cr0_bad_suggestions); 873 874 grp = NULL; 875 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 876 if (list_empty(&sbi->s_mb_largest_free_orders[i])) 877 continue; 878 read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 879 if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 880 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 881 continue; 882 } 883 grp = NULL; 884 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 885 bb_largest_free_order_node) { 886 if (sbi->s_mb_stats) 887 atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); 888 if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { 889 grp = iter; 890 break; 891 } 892 } 893 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 894 if (grp) 895 break; 896 } 897 898 if (!grp) { 899 /* Increment cr and search again */ 900 *new_cr = 1; 901 } else { 902 *group = grp->bb_group; 903 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 904 } 905 } 906 907 /* 908 * Choose next group by traversing average fragment size list of suitable 909 * order. Updates *new_cr if cr level needs an update. 
910 */ 911 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 912 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 913 { 914 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 915 struct ext4_group_info *grp = NULL, *iter; 916 int i; 917 918 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 919 if (sbi->s_mb_stats) 920 atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 921 } 922 923 for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 924 i < MB_NUM_ORDERS(ac->ac_sb); i++) { 925 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) 926 continue; 927 read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); 928 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { 929 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 930 continue; 931 } 932 list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], 933 bb_avg_fragment_size_node) { 934 if (sbi->s_mb_stats) 935 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 936 if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { 937 grp = iter; 938 break; 939 } 940 } 941 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 942 if (grp) 943 break; 944 } 945 946 if (grp) { 947 *group = grp->bb_group; 948 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 949 } else { 950 *new_cr = 2; 951 } 952 } 953 954 static inline int should_optimize_scan(struct ext4_allocation_context *ac) 955 { 956 if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) 957 return 0; 958 if (ac->ac_criteria >= 2) 959 return 0; 960 if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) 961 return 0; 962 return 1; 963 } 964 965 /* 966 * Return next linear group for allocation. If linear traversal should not be 967 * performed, this function just returns the same group 968 */ 969 static int 970 next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) 971 { 972 if (!should_optimize_scan(ac)) 973 goto inc_and_return; 974 975 if (ac->ac_groups_linear_remaining) { 976 ac->ac_groups_linear_remaining--; 977 goto inc_and_return; 978 } 979 980 return group; 981 inc_and_return: 982 /* 983 * Artificially restricted ngroups for non-extent 984 * files makes group > ngroups possible on first loop. 985 */ 986 return group + 1 >= ngroups ? 0 : group + 1; 987 } 988 989 /* 990 * ext4_mb_choose_next_group: choose next group for allocation. 991 * 992 * @ac Allocation Context 993 * @new_cr This is an output parameter. If the there is no good group 994 * available at current CR level, this field is updated to indicate 995 * the new cr level that should be used. 996 * @group This is an input / output parameter. As an input it indicates the 997 * next group that the allocator intends to use for allocation. As 998 * output, this field indicates the next group that should be used as 999 * determined by the optimization functions. 1000 * @ngroups Total number of groups 1001 */ 1002 static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 1003 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 1004 { 1005 *new_cr = ac->ac_criteria; 1006 1007 if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { 1008 *group = next_linear_group(ac, *group, ngroups); 1009 return; 1010 } 1011 1012 if (*new_cr == 0) { 1013 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 1014 } else if (*new_cr == 1) { 1015 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1016 } else { 1017 /* 1018 * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1019 * bb_free. But until that happens, we should never come here. 
1020 */ 1021 WARN_ON(1); 1022 } 1023 } 1024 1025 /* 1026 * Cache the order of the largest free extent we have available in this block 1027 * group. 1028 */ 1029 static void 1030 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1031 { 1032 struct ext4_sb_info *sbi = EXT4_SB(sb); 1033 int i; 1034 1035 for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) 1036 if (grp->bb_counters[i] > 0) 1037 break; 1038 /* No need to move between order lists? */ 1039 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || 1040 i == grp->bb_largest_free_order) { 1041 grp->bb_largest_free_order = i; 1042 return; 1043 } 1044 1045 if (grp->bb_largest_free_order >= 0) { 1046 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1047 grp->bb_largest_free_order]); 1048 list_del_init(&grp->bb_largest_free_order_node); 1049 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1050 grp->bb_largest_free_order]); 1051 } 1052 grp->bb_largest_free_order = i; 1053 if (grp->bb_largest_free_order >= 0 && grp->bb_free) { 1054 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1055 grp->bb_largest_free_order]); 1056 list_add_tail(&grp->bb_largest_free_order_node, 1057 &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 1058 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1059 grp->bb_largest_free_order]); 1060 } 1061 } 1062 1063 static noinline_for_stack 1064 void ext4_mb_generate_buddy(struct super_block *sb, 1065 void *buddy, void *bitmap, ext4_group_t group, 1066 struct ext4_group_info *grp) 1067 { 1068 struct ext4_sb_info *sbi = EXT4_SB(sb); 1069 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 1070 ext4_grpblk_t i = 0; 1071 ext4_grpblk_t first; 1072 ext4_grpblk_t len; 1073 unsigned free = 0; 1074 unsigned fragments = 0; 1075 unsigned long long period = get_cycles(); 1076 1077 /* initialize buddy from bitmap which is aggregation 1078 * of on-disk bitmap and preallocations */ 1079 i = mb_find_next_zero_bit(bitmap, max, 0); 1080 grp->bb_first_free = i; 1081 while (i < max) { 1082 fragments++; 1083 first = i; 1084 i = mb_find_next_bit(bitmap, max, i); 1085 len = i - first; 1086 free += len; 1087 if (len > 1) 1088 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 1089 else 1090 grp->bb_counters[0]++; 1091 if (i < max) 1092 i = mb_find_next_zero_bit(bitmap, max, i); 1093 } 1094 grp->bb_fragments = fragments; 1095 1096 if (free != grp->bb_free) { 1097 ext4_grp_locked_error(sb, group, 0, 0, 1098 "block bitmap and bg descriptor " 1099 "inconsistent: %u vs %u free clusters", 1100 free, grp->bb_free); 1101 /* 1102 * If we intend to continue, we consider group descriptor 1103 * corrupt and update bb_free using bitmap value 1104 */ 1105 grp->bb_free = free; 1106 ext4_mark_group_bitmap_corrupted(sb, group, 1107 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1108 } 1109 mb_set_largest_free_order(sb, grp); 1110 mb_update_avg_fragment_size(sb, grp); 1111 1112 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1113 1114 period = get_cycles() - period; 1115 atomic_inc(&sbi->s_mb_buddies_generated); 1116 atomic64_add(period, &sbi->s_mb_generation_time); 1117 } 1118 1119 /* The buddy information is attached the buddy cache inode 1120 * for convenience. The information regarding each group 1121 * is loaded via ext4_mb_load_buddy. The information involve 1122 * block bitmap and buddy information. The information are 1123 * stored in the inode as 1124 * 1125 * { page } 1126 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 1127 * 1128 * 1129 * one block each for bitmap and buddy information. 
1130 * So for each group we take up 2 blocks. A page can 1131 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. 1132 * So it can have information regarding groups_per_page which 1133 * is blocks_per_page/2 1134 * 1135 * Locking note: This routine takes the block group lock of all groups 1136 * for this page; do not hold this lock when calling this routine! 1137 */ 1138 1139 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) 1140 { 1141 ext4_group_t ngroups; 1142 int blocksize; 1143 int blocks_per_page; 1144 int groups_per_page; 1145 int err = 0; 1146 int i; 1147 ext4_group_t first_group, group; 1148 int first_block; 1149 struct super_block *sb; 1150 struct buffer_head *bhs; 1151 struct buffer_head **bh = NULL; 1152 struct inode *inode; 1153 char *data; 1154 char *bitmap; 1155 struct ext4_group_info *grinfo; 1156 1157 inode = page->mapping->host; 1158 sb = inode->i_sb; 1159 ngroups = ext4_get_groups_count(sb); 1160 blocksize = i_blocksize(inode); 1161 blocks_per_page = PAGE_SIZE / blocksize; 1162 1163 mb_debug(sb, "init page %lu\n", page->index); 1164 1165 groups_per_page = blocks_per_page >> 1; 1166 if (groups_per_page == 0) 1167 groups_per_page = 1; 1168 1169 /* allocate buffer_heads to read bitmaps */ 1170 if (groups_per_page > 1) { 1171 i = sizeof(struct buffer_head *) * groups_per_page; 1172 bh = kzalloc(i, gfp); 1173 if (bh == NULL) 1174 return -ENOMEM; 1175 } else 1176 bh = &bhs; 1177 1178 first_group = page->index * blocks_per_page / 2; 1179 1180 /* read all groups the page covers into the cache */ 1181 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1182 if (group >= ngroups) 1183 break; 1184 1185 grinfo = ext4_get_group_info(sb, group); 1186 if (!grinfo) 1187 continue; 1188 /* 1189 * If page is uptodate then we came here after online resize 1190 * which added some new uninitialized group info structs, so 1191 * we must skip all initialized uptodate buddies on the page, 1192 * which may be currently in use by an allocating task. 
1193 */ 1194 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { 1195 bh[i] = NULL; 1196 continue; 1197 } 1198 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); 1199 if (IS_ERR(bh[i])) { 1200 err = PTR_ERR(bh[i]); 1201 bh[i] = NULL; 1202 goto out; 1203 } 1204 mb_debug(sb, "read bitmap for group %u\n", group); 1205 } 1206 1207 /* wait for I/O completion */ 1208 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1209 int err2; 1210 1211 if (!bh[i]) 1212 continue; 1213 err2 = ext4_wait_block_bitmap(sb, group, bh[i]); 1214 if (!err) 1215 err = err2; 1216 } 1217 1218 first_block = page->index * blocks_per_page; 1219 for (i = 0; i < blocks_per_page; i++) { 1220 group = (first_block + i) >> 1; 1221 if (group >= ngroups) 1222 break; 1223 1224 if (!bh[group - first_group]) 1225 /* skip initialized uptodate buddy */ 1226 continue; 1227 1228 if (!buffer_verified(bh[group - first_group])) 1229 /* Skip faulty bitmaps */ 1230 continue; 1231 err = 0; 1232 1233 /* 1234 * data carry information regarding this 1235 * particular group in the format specified 1236 * above 1237 * 1238 */ 1239 data = page_address(page) + (i * blocksize); 1240 bitmap = bh[group - first_group]->b_data; 1241 1242 /* 1243 * We place the buddy block and bitmap block 1244 * close together 1245 */ 1246 if ((first_block + i) & 1) { 1247 /* this is block of buddy */ 1248 BUG_ON(incore == NULL); 1249 mb_debug(sb, "put buddy for group %u in page %lu/%x\n", 1250 group, page->index, i * blocksize); 1251 trace_ext4_mb_buddy_bitmap_load(sb, group); 1252 grinfo = ext4_get_group_info(sb, group); 1253 if (!grinfo) { 1254 err = -EFSCORRUPTED; 1255 goto out; 1256 } 1257 grinfo->bb_fragments = 0; 1258 memset(grinfo->bb_counters, 0, 1259 sizeof(*grinfo->bb_counters) * 1260 (MB_NUM_ORDERS(sb))); 1261 /* 1262 * incore got set to the group block bitmap below 1263 */ 1264 ext4_lock_group(sb, group); 1265 /* init the buddy */ 1266 memset(data, 0xff, blocksize); 1267 ext4_mb_generate_buddy(sb, data, incore, group, grinfo); 1268 ext4_unlock_group(sb, group); 1269 incore = NULL; 1270 } else { 1271 /* this is block of bitmap */ 1272 BUG_ON(incore != NULL); 1273 mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", 1274 group, page->index, i * blocksize); 1275 trace_ext4_mb_bitmap_load(sb, group); 1276 1277 /* see comments in ext4_mb_put_pa() */ 1278 ext4_lock_group(sb, group); 1279 memcpy(data, bitmap, blocksize); 1280 1281 /* mark all preallocated blks used in in-core bitmap */ 1282 ext4_mb_generate_from_pa(sb, data, group); 1283 ext4_mb_generate_from_freelist(sb, data, group); 1284 ext4_unlock_group(sb, group); 1285 1286 /* set incore so that the buddy information can be 1287 * generated using this 1288 */ 1289 incore = data; 1290 } 1291 } 1292 SetPageUptodate(page); 1293 1294 out: 1295 if (bh) { 1296 for (i = 0; i < groups_per_page; i++) 1297 brelse(bh[i]); 1298 if (bh != &bhs) 1299 kfree(bh); 1300 } 1301 return err; 1302 } 1303 1304 /* 1305 * Lock the buddy and bitmap pages. This make sure other parallel init_group 1306 * on the same buddy page doesn't happen whild holding the buddy page lock. 1307 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 1308 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 
1309 */ 1310 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1311 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) 1312 { 1313 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1314 int block, pnum, poff; 1315 int blocks_per_page; 1316 struct page *page; 1317 1318 e4b->bd_buddy_page = NULL; 1319 e4b->bd_bitmap_page = NULL; 1320 1321 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1322 /* 1323 * the buddy cache inode stores the block bitmap 1324 * and buddy information in consecutive blocks. 1325 * So for each group we need two blocks. 1326 */ 1327 block = group * 2; 1328 pnum = block / blocks_per_page; 1329 poff = block % blocks_per_page; 1330 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1331 if (!page) 1332 return -ENOMEM; 1333 BUG_ON(page->mapping != inode->i_mapping); 1334 e4b->bd_bitmap_page = page; 1335 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1336 1337 if (blocks_per_page >= 2) { 1338 /* buddy and bitmap are on the same page */ 1339 return 0; 1340 } 1341 1342 block++; 1343 pnum = block / blocks_per_page; 1344 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1345 if (!page) 1346 return -ENOMEM; 1347 BUG_ON(page->mapping != inode->i_mapping); 1348 e4b->bd_buddy_page = page; 1349 return 0; 1350 } 1351 1352 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1353 { 1354 if (e4b->bd_bitmap_page) { 1355 unlock_page(e4b->bd_bitmap_page); 1356 put_page(e4b->bd_bitmap_page); 1357 } 1358 if (e4b->bd_buddy_page) { 1359 unlock_page(e4b->bd_buddy_page); 1360 put_page(e4b->bd_buddy_page); 1361 } 1362 } 1363 1364 /* 1365 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1366 * block group lock of all groups for this page; do not hold the BG lock when 1367 * calling this routine! 1368 */ 1369 static noinline_for_stack 1370 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) 1371 { 1372 1373 struct ext4_group_info *this_grp; 1374 struct ext4_buddy e4b; 1375 struct page *page; 1376 int ret = 0; 1377 1378 might_sleep(); 1379 mb_debug(sb, "init group %u\n", group); 1380 this_grp = ext4_get_group_info(sb, group); 1381 if (!this_grp) 1382 return -EFSCORRUPTED; 1383 1384 /* 1385 * This ensures that we don't reinit the buddy cache 1386 * page which map to the group from which we are already 1387 * allocating. If we are looking at the buddy cache we would 1388 * have taken a reference using ext4_mb_load_buddy and that 1389 * would have pinned buddy page to page cache. 1390 * The call to ext4_mb_get_buddy_page_lock will mark the 1391 * page accessed. 
1392 */ 1393 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); 1394 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1395 /* 1396 * somebody initialized the group 1397 * return without doing anything 1398 */ 1399 goto err; 1400 } 1401 1402 page = e4b.bd_bitmap_page; 1403 ret = ext4_mb_init_cache(page, NULL, gfp); 1404 if (ret) 1405 goto err; 1406 if (!PageUptodate(page)) { 1407 ret = -EIO; 1408 goto err; 1409 } 1410 1411 if (e4b.bd_buddy_page == NULL) { 1412 /* 1413 * If both the bitmap and buddy are in 1414 * the same page we don't need to force 1415 * init the buddy 1416 */ 1417 ret = 0; 1418 goto err; 1419 } 1420 /* init buddy cache */ 1421 page = e4b.bd_buddy_page; 1422 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); 1423 if (ret) 1424 goto err; 1425 if (!PageUptodate(page)) { 1426 ret = -EIO; 1427 goto err; 1428 } 1429 err: 1430 ext4_mb_put_buddy_page_lock(&e4b); 1431 return ret; 1432 } 1433 1434 /* 1435 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1436 * block group lock of all groups for this page; do not hold the BG lock when 1437 * calling this routine! 1438 */ 1439 static noinline_for_stack int 1440 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, 1441 struct ext4_buddy *e4b, gfp_t gfp) 1442 { 1443 int blocks_per_page; 1444 int block; 1445 int pnum; 1446 int poff; 1447 struct page *page; 1448 int ret; 1449 struct ext4_group_info *grp; 1450 struct ext4_sb_info *sbi = EXT4_SB(sb); 1451 struct inode *inode = sbi->s_buddy_cache; 1452 1453 might_sleep(); 1454 mb_debug(sb, "load group %u\n", group); 1455 1456 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1457 grp = ext4_get_group_info(sb, group); 1458 if (!grp) 1459 return -EFSCORRUPTED; 1460 1461 e4b->bd_blkbits = sb->s_blocksize_bits; 1462 e4b->bd_info = grp; 1463 e4b->bd_sb = sb; 1464 e4b->bd_group = group; 1465 e4b->bd_buddy_page = NULL; 1466 e4b->bd_bitmap_page = NULL; 1467 1468 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1469 /* 1470 * we need full data about the group 1471 * to make a good selection 1472 */ 1473 ret = ext4_mb_init_group(sb, group, gfp); 1474 if (ret) 1475 return ret; 1476 } 1477 1478 /* 1479 * the buddy cache inode stores the block bitmap 1480 * and buddy information in consecutive blocks. 1481 * So for each group we need two blocks. 1482 */ 1483 block = group * 2; 1484 pnum = block / blocks_per_page; 1485 poff = block % blocks_per_page; 1486 1487 /* we could use find_or_create_page(), but it locks page 1488 * what we'd like to avoid in fast path ... */ 1489 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1490 if (page == NULL || !PageUptodate(page)) { 1491 if (page) 1492 /* 1493 * drop the page reference and try 1494 * to get the page with lock. If we 1495 * are not uptodate that implies 1496 * somebody just created the page but 1497 * is yet to initialize the same. So 1498 * wait for it to initialize. 
1499 */ 1500 put_page(page); 1501 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1502 if (page) { 1503 if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1504 "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { 1505 /* should never happen */ 1506 unlock_page(page); 1507 ret = -EINVAL; 1508 goto err; 1509 } 1510 if (!PageUptodate(page)) { 1511 ret = ext4_mb_init_cache(page, NULL, gfp); 1512 if (ret) { 1513 unlock_page(page); 1514 goto err; 1515 } 1516 mb_cmp_bitmaps(e4b, page_address(page) + 1517 (poff * sb->s_blocksize)); 1518 } 1519 unlock_page(page); 1520 } 1521 } 1522 if (page == NULL) { 1523 ret = -ENOMEM; 1524 goto err; 1525 } 1526 if (!PageUptodate(page)) { 1527 ret = -EIO; 1528 goto err; 1529 } 1530 1531 /* Pages marked accessed already */ 1532 e4b->bd_bitmap_page = page; 1533 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1534 1535 block++; 1536 pnum = block / blocks_per_page; 1537 poff = block % blocks_per_page; 1538 1539 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1540 if (page == NULL || !PageUptodate(page)) { 1541 if (page) 1542 put_page(page); 1543 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1544 if (page) { 1545 if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1546 "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { 1547 /* should never happen */ 1548 unlock_page(page); 1549 ret = -EINVAL; 1550 goto err; 1551 } 1552 if (!PageUptodate(page)) { 1553 ret = ext4_mb_init_cache(page, e4b->bd_bitmap, 1554 gfp); 1555 if (ret) { 1556 unlock_page(page); 1557 goto err; 1558 } 1559 } 1560 unlock_page(page); 1561 } 1562 } 1563 if (page == NULL) { 1564 ret = -ENOMEM; 1565 goto err; 1566 } 1567 if (!PageUptodate(page)) { 1568 ret = -EIO; 1569 goto err; 1570 } 1571 1572 /* Pages marked accessed already */ 1573 e4b->bd_buddy_page = page; 1574 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1575 1576 return 0; 1577 1578 err: 1579 if (page) 1580 put_page(page); 1581 if (e4b->bd_bitmap_page) 1582 put_page(e4b->bd_bitmap_page); 1583 1584 e4b->bd_buddy = NULL; 1585 e4b->bd_bitmap = NULL; 1586 return ret; 1587 } 1588 1589 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1590 struct ext4_buddy *e4b) 1591 { 1592 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); 1593 } 1594 1595 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1596 { 1597 if (e4b->bd_bitmap_page) 1598 put_page(e4b->bd_bitmap_page); 1599 if (e4b->bd_buddy_page) 1600 put_page(e4b->bd_buddy_page); 1601 } 1602 1603 1604 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1605 { 1606 int order = 1, max; 1607 void *bb; 1608 1609 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1610 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1611 1612 while (order <= e4b->bd_blkbits + 1) { 1613 bb = mb_find_buddy(e4b, order, &max); 1614 if (!mb_test_bit(block >> order, bb)) { 1615 /* this block is part of buddy of order 'order' */ 1616 return order; 1617 } 1618 order++; 1619 } 1620 return 0; 1621 } 1622 1623 static void mb_clear_bits(void *bm, int cur, int len) 1624 { 1625 __u32 *addr; 1626 1627 len = cur + len; 1628 while (cur < len) { 1629 if ((cur & 31) == 0 && (len - cur) >= 32) { 1630 /* fast path: clear whole word at once */ 1631 addr = bm + (cur >> 3); 1632 *addr = 0; 1633 cur += 32; 1634 continue; 1635 } 1636 mb_clear_bit(cur, bm); 1637 cur++; 1638 } 1639 } 1640 1641 /* clear bits in given range 1642 * will return first found zero bit if any, -1 otherwise 1643 */ 1644 static int 
mb_test_and_clear_bits(void *bm, int cur, int len) 1645 { 1646 __u32 *addr; 1647 int zero_bit = -1; 1648 1649 len = cur + len; 1650 while (cur < len) { 1651 if ((cur & 31) == 0 && (len - cur) >= 32) { 1652 /* fast path: clear whole word at once */ 1653 addr = bm + (cur >> 3); 1654 if (*addr != (__u32)(-1) && zero_bit == -1) 1655 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 1656 *addr = 0; 1657 cur += 32; 1658 continue; 1659 } 1660 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) 1661 zero_bit = cur; 1662 cur++; 1663 } 1664 1665 return zero_bit; 1666 } 1667 1668 void mb_set_bits(void *bm, int cur, int len) 1669 { 1670 __u32 *addr; 1671 1672 len = cur + len; 1673 while (cur < len) { 1674 if ((cur & 31) == 0 && (len - cur) >= 32) { 1675 /* fast path: set whole word at once */ 1676 addr = bm + (cur >> 3); 1677 *addr = 0xffffffff; 1678 cur += 32; 1679 continue; 1680 } 1681 mb_set_bit(cur, bm); 1682 cur++; 1683 } 1684 } 1685 1686 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) 1687 { 1688 if (mb_test_bit(*bit + side, bitmap)) { 1689 mb_clear_bit(*bit, bitmap); 1690 (*bit) -= side; 1691 return 1; 1692 } 1693 else { 1694 (*bit) += side; 1695 mb_set_bit(*bit, bitmap); 1696 return -1; 1697 } 1698 } 1699 1700 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) 1701 { 1702 int max; 1703 int order = 1; 1704 void *buddy = mb_find_buddy(e4b, order, &max); 1705 1706 while (buddy) { 1707 void *buddy2; 1708 1709 /* Bits in range [first; last] are known to be set since 1710 * corresponding blocks were allocated. Bits in range 1711 * (first; last) will stay set because they form buddies on 1712 * upper layer. We just deal with borders if they don't 1713 * align with upper layer and then go up. 1714 * Releasing entire group is all about clearing 1715 * single bit of highest order buddy. 1716 */ 1717 1718 /* Example: 1719 * --------------------------------- 1720 * | 1 | 1 | 1 | 1 | 1721 * --------------------------------- 1722 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1723 * --------------------------------- 1724 * 0 1 2 3 4 5 6 7 1725 * \_____________________/ 1726 * 1727 * Neither [1] nor [6] is aligned to above layer. 1728 * Left neighbour [0] is free, so mark it busy, 1729 * decrease bb_counters and extend range to 1730 * [0; 6] 1731 * Right neighbour [7] is busy. It can't be coaleasced with [6], so 1732 * mark [6] free, increase bb_counters and shrink range to 1733 * [0; 5]. 1734 * Then shift range to [0; 2], go up and do the same. 1735 */ 1736 1737 1738 if (first & 1) 1739 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 1740 if (!(last & 1)) 1741 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 1742 if (first > last) 1743 break; 1744 order++; 1745 1746 buddy2 = mb_find_buddy(e4b, order, &max); 1747 if (!buddy2) { 1748 mb_clear_bits(buddy, first, last - first + 1); 1749 e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1750 break; 1751 } 1752 first >>= 1; 1753 last >>= 1; 1754 buddy = buddy2; 1755 } 1756 } 1757 1758 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1759 int first, int count) 1760 { 1761 int left_is_free = 0; 1762 int right_is_free = 0; 1763 int block; 1764 int last = first + count - 1; 1765 struct super_block *sb = e4b->bd_sb; 1766 1767 if (WARN_ON(count == 0)) 1768 return; 1769 BUG_ON(last >= (sb->s_blocksize << 3)); 1770 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1771 /* Don't bother if the block group is corrupt. 
*/ 1772 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) 1773 return; 1774 1775 mb_check_buddy(e4b); 1776 mb_free_blocks_double(inode, e4b, first, count); 1777 1778 this_cpu_inc(discard_pa_seq); 1779 e4b->bd_info->bb_free += count; 1780 if (first < e4b->bd_info->bb_first_free) 1781 e4b->bd_info->bb_first_free = first; 1782 1783 /* access memory sequentially: check left neighbour, 1784 * clear range and then check right neighbour 1785 */ 1786 if (first != 0) 1787 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 1788 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 1789 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 1790 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1791 1792 if (unlikely(block != -1)) { 1793 struct ext4_sb_info *sbi = EXT4_SB(sb); 1794 ext4_fsblk_t blocknr; 1795 1796 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1797 blocknr += EXT4_C2B(sbi, block); 1798 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { 1799 ext4_grp_locked_error(sb, e4b->bd_group, 1800 inode ? inode->i_ino : 0, 1801 blocknr, 1802 "freeing already freed block (bit %u); block bitmap corrupt.", 1803 block); 1804 ext4_mark_group_bitmap_corrupted( 1805 sb, e4b->bd_group, 1806 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1807 } 1808 goto done; 1809 } 1810 1811 /* let's maintain fragments counter */ 1812 if (left_is_free && right_is_free) 1813 e4b->bd_info->bb_fragments--; 1814 else if (!left_is_free && !right_is_free) 1815 e4b->bd_info->bb_fragments++; 1816 1817 /* buddy[0] == bd_bitmap is a special case, so handle 1818 * it right away and let mb_buddy_mark_free stay free of 1819 * zero order checks. 1820 * Check if neighbours are to be coaleasced, 1821 * adjust bitmap bb_counters and borders appropriately. 1822 */ 1823 if (first & 1) { 1824 first += !left_is_free; 1825 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; 1826 } 1827 if (!(last & 1)) { 1828 last -= !right_is_free; 1829 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; 1830 } 1831 1832 if (first <= last) 1833 mb_buddy_mark_free(e4b, first >> 1, last >> 1); 1834 1835 done: 1836 mb_set_largest_free_order(sb, e4b->bd_info); 1837 mb_update_avg_fragment_size(sb, e4b->bd_info); 1838 mb_check_buddy(e4b); 1839 } 1840 1841 static int mb_find_extent(struct ext4_buddy *e4b, int block, 1842 int needed, struct ext4_free_extent *ex) 1843 { 1844 int next = block; 1845 int max, order; 1846 void *buddy; 1847 1848 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1849 BUG_ON(ex == NULL); 1850 1851 buddy = mb_find_buddy(e4b, 0, &max); 1852 BUG_ON(buddy == NULL); 1853 BUG_ON(block >= max); 1854 if (mb_test_bit(block, buddy)) { 1855 ex->fe_len = 0; 1856 ex->fe_start = 0; 1857 ex->fe_group = 0; 1858 return 0; 1859 } 1860 1861 /* find actual order */ 1862 order = mb_find_order_for_block(e4b, block); 1863 block = block >> order; 1864 1865 ex->fe_len = 1 << order; 1866 ex->fe_start = block << order; 1867 ex->fe_group = e4b->bd_group; 1868 1869 /* calc difference from given start */ 1870 next = next - ex->fe_start; 1871 ex->fe_len -= next; 1872 ex->fe_start += next; 1873 1874 while (needed > ex->fe_len && 1875 mb_find_buddy(e4b, order, &max)) { 1876 1877 if (block + 1 >= max) 1878 break; 1879 1880 next = (block + 1) * (1 << order); 1881 if (mb_test_bit(next, e4b->bd_bitmap)) 1882 break; 1883 1884 order = mb_find_order_for_block(e4b, next); 1885 1886 block = next >> order; 1887 ex->fe_len += 1 << order; 1888 } 1889 1890 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { 1891 /* Should never happen! 
(but apparently sometimes does?!?) */ 1892 WARN_ON(1); 1893 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, 1894 "corruption or bug in mb_find_extent " 1895 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", 1896 block, order, needed, ex->fe_group, ex->fe_start, 1897 ex->fe_len, ex->fe_logical); 1898 ex->fe_len = 0; 1899 ex->fe_start = 0; 1900 ex->fe_group = 0; 1901 } 1902 return ex->fe_len; 1903 } 1904 1905 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1906 { 1907 int ord; 1908 int mlen = 0; 1909 int max = 0; 1910 int cur; 1911 int start = ex->fe_start; 1912 int len = ex->fe_len; 1913 unsigned ret = 0; 1914 int len0 = len; 1915 void *buddy; 1916 bool split = false; 1917 1918 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1919 BUG_ON(e4b->bd_group != ex->fe_group); 1920 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1921 mb_check_buddy(e4b); 1922 mb_mark_used_double(e4b, start, len); 1923 1924 this_cpu_inc(discard_pa_seq); 1925 e4b->bd_info->bb_free -= len; 1926 if (e4b->bd_info->bb_first_free == start) 1927 e4b->bd_info->bb_first_free += len; 1928 1929 /* let's maintain fragments counter */ 1930 if (start != 0) 1931 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 1932 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1933 max = !mb_test_bit(start + len, e4b->bd_bitmap); 1934 if (mlen && max) 1935 e4b->bd_info->bb_fragments++; 1936 else if (!mlen && !max) 1937 e4b->bd_info->bb_fragments--; 1938 1939 /* let's maintain buddy itself */ 1940 while (len) { 1941 if (!split) 1942 ord = mb_find_order_for_block(e4b, start); 1943 1944 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1945 /* the whole chunk may be allocated at once! */ 1946 mlen = 1 << ord; 1947 if (!split) 1948 buddy = mb_find_buddy(e4b, ord, &max); 1949 else 1950 split = false; 1951 BUG_ON((start >> ord) >= max); 1952 mb_set_bit(start >> ord, buddy); 1953 e4b->bd_info->bb_counters[ord]--; 1954 start += mlen; 1955 len -= mlen; 1956 BUG_ON(len < 0); 1957 continue; 1958 } 1959 1960 /* store for history */ 1961 if (ret == 0) 1962 ret = len | (ord << 16); 1963 1964 /* we have to split large buddy */ 1965 BUG_ON(ord <= 0); 1966 buddy = mb_find_buddy(e4b, ord, &max); 1967 mb_set_bit(start >> ord, buddy); 1968 e4b->bd_info->bb_counters[ord]--; 1969 1970 ord--; 1971 cur = (start >> ord) & ~1U; 1972 buddy = mb_find_buddy(e4b, ord, &max); 1973 mb_clear_bit(cur, buddy); 1974 mb_clear_bit(cur + 1, buddy); 1975 e4b->bd_info->bb_counters[ord]++; 1976 e4b->bd_info->bb_counters[ord]++; 1977 split = true; 1978 } 1979 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1980 1981 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 1982 mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1983 mb_check_buddy(e4b); 1984 1985 return ret; 1986 } 1987 1988 /* 1989 * Must be called under group lock! 
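 *
 * For illustration, the value returned by mb_mark_used() above packs two
 * fields that are unpacked a few lines below:
 *
 *	ret = mb_mark_used(e4b, &ac->ac_b_ex);
 *	ac->ac_tail  = ret & 0xffff;	(remaining length at the first split)
 *	ac->ac_buddy = ret >> 16;	(buddy order that had to be split)
 *
 * A return of 0 means the extent was carved out of whole buddies with
 * no split at all.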
1990 */ 1991 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1992 struct ext4_buddy *e4b) 1993 { 1994 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1995 int ret; 1996 1997 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1998 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1999 2000 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 2001 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 2002 ret = mb_mark_used(e4b, &ac->ac_b_ex); 2003 2004 /* preallocation can change ac_b_ex, thus we store actually 2005 * allocated blocks for history */ 2006 ac->ac_f_ex = ac->ac_b_ex; 2007 2008 ac->ac_status = AC_STATUS_FOUND; 2009 ac->ac_tail = ret & 0xffff; 2010 ac->ac_buddy = ret >> 16; 2011 2012 /* 2013 * take the page reference. We want the page to be pinned 2014 * so that we don't get a ext4_mb_init_cache_call for this 2015 * group until we update the bitmap. That would mean we 2016 * double allocate blocks. The reference is dropped 2017 * in ext4_mb_release_context 2018 */ 2019 ac->ac_bitmap_page = e4b->bd_bitmap_page; 2020 get_page(ac->ac_bitmap_page); 2021 ac->ac_buddy_page = e4b->bd_buddy_page; 2022 get_page(ac->ac_buddy_page); 2023 /* store last allocated for subsequent stream allocation */ 2024 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2025 spin_lock(&sbi->s_md_lock); 2026 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 2027 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 2028 spin_unlock(&sbi->s_md_lock); 2029 } 2030 /* 2031 * As we've just preallocated more space than 2032 * user requested originally, we store allocated 2033 * space in a special descriptor. 2034 */ 2035 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 2036 ext4_mb_new_preallocation(ac); 2037 2038 } 2039 2040 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 2041 struct ext4_buddy *e4b, 2042 int finish_group) 2043 { 2044 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2045 struct ext4_free_extent *bex = &ac->ac_b_ex; 2046 struct ext4_free_extent *gex = &ac->ac_g_ex; 2047 2048 if (ac->ac_status == AC_STATUS_FOUND) 2049 return; 2050 /* 2051 * We don't want to scan for a whole year 2052 */ 2053 if (ac->ac_found > sbi->s_mb_max_to_scan && 2054 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2055 ac->ac_status = AC_STATUS_BREAK; 2056 return; 2057 } 2058 2059 /* 2060 * Haven't found good chunk so far, let's continue 2061 */ 2062 if (bex->fe_len < gex->fe_len) 2063 return; 2064 2065 if (finish_group) 2066 ext4_mb_use_best_found(ac, e4b); 2067 } 2068 2069 /* 2070 * The routine checks whether found extent is good enough. If it is, 2071 * then the extent gets marked used and flag is set to the context 2072 * to stop scanning. Otherwise, the extent is compared with the 2073 * previous found extent and if new one is better, then it's stored 2074 * in the context. Later, the best found extent will be used, if 2075 * mballoc can't find good enough extent. 2076 * 2077 * FIXME: real allocation policy is to be designed yet! 
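 *
 * A worked example of the policy below, assuming a goal (gex) of 8
 * clusters: an extent of exactly 8 is taken on the spot; while the best
 * extent so far is shorter than 8, any longer extent replaces it (a 5
 * beats a 3); once the best extent already covers the goal, a shorter
 * extent that still covers it is preferred (a 9 replaces a 10).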
2078 */ 2079 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 2080 struct ext4_free_extent *ex, 2081 struct ext4_buddy *e4b) 2082 { 2083 struct ext4_free_extent *bex = &ac->ac_b_ex; 2084 struct ext4_free_extent *gex = &ac->ac_g_ex; 2085 2086 BUG_ON(ex->fe_len <= 0); 2087 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2088 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2089 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 2090 2091 ac->ac_found++; 2092 2093 /* 2094 * The special case - take what you catch first 2095 */ 2096 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2097 *bex = *ex; 2098 ext4_mb_use_best_found(ac, e4b); 2099 return; 2100 } 2101 2102 /* 2103 * Let's check whether the chuck is good enough 2104 */ 2105 if (ex->fe_len == gex->fe_len) { 2106 *bex = *ex; 2107 ext4_mb_use_best_found(ac, e4b); 2108 return; 2109 } 2110 2111 /* 2112 * If this is first found extent, just store it in the context 2113 */ 2114 if (bex->fe_len == 0) { 2115 *bex = *ex; 2116 return; 2117 } 2118 2119 /* 2120 * If new found extent is better, store it in the context 2121 */ 2122 if (bex->fe_len < gex->fe_len) { 2123 /* if the request isn't satisfied, any found extent 2124 * larger than previous best one is better */ 2125 if (ex->fe_len > bex->fe_len) 2126 *bex = *ex; 2127 } else if (ex->fe_len > gex->fe_len) { 2128 /* if the request is satisfied, then we try to find 2129 * an extent that still satisfy the request, but is 2130 * smaller than previous one */ 2131 if (ex->fe_len < bex->fe_len) 2132 *bex = *ex; 2133 } 2134 2135 ext4_mb_check_limits(ac, e4b, 0); 2136 } 2137 2138 static noinline_for_stack 2139 void ext4_mb_try_best_found(struct ext4_allocation_context *ac, 2140 struct ext4_buddy *e4b) 2141 { 2142 struct ext4_free_extent ex = ac->ac_b_ex; 2143 ext4_group_t group = ex.fe_group; 2144 int max; 2145 int err; 2146 2147 BUG_ON(ex.fe_len <= 0); 2148 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2149 if (err) 2150 return; 2151 2152 ext4_lock_group(ac->ac_sb, group); 2153 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); 2154 2155 if (max > 0) { 2156 ac->ac_b_ex = ex; 2157 ext4_mb_use_best_found(ac, e4b); 2158 } 2159 2160 ext4_unlock_group(ac->ac_sb, group); 2161 ext4_mb_unload_buddy(e4b); 2162 } 2163 2164 static noinline_for_stack 2165 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 2166 struct ext4_buddy *e4b) 2167 { 2168 ext4_group_t group = ac->ac_g_ex.fe_group; 2169 int max; 2170 int err; 2171 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2172 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2173 struct ext4_free_extent ex; 2174 2175 if (!grp) 2176 return -EFSCORRUPTED; 2177 if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) 2178 return 0; 2179 if (grp->bb_free == 0) 2180 return 0; 2181 2182 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2183 if (err) 2184 return err; 2185 2186 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { 2187 ext4_mb_unload_buddy(e4b); 2188 return 0; 2189 } 2190 2191 ext4_lock_group(ac->ac_sb, group); 2192 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 2193 ac->ac_g_ex.fe_len, &ex); 2194 ex.fe_logical = 0xDEADFA11; /* debug value */ 2195 2196 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 2197 ext4_fsblk_t start; 2198 2199 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + 2200 ex.fe_start; 2201 /* use do_div to get remainder (would be 64-bit modulo) */ 2202 if (do_div(start, sbi->s_stripe) == 0) { 2203 ac->ac_found++; 2204 
ac->ac_b_ex = ex; 2205 ext4_mb_use_best_found(ac, e4b); 2206 } 2207 } else if (max >= ac->ac_g_ex.fe_len) { 2208 BUG_ON(ex.fe_len <= 0); 2209 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2210 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2211 ac->ac_found++; 2212 ac->ac_b_ex = ex; 2213 ext4_mb_use_best_found(ac, e4b); 2214 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 2215 /* Sometimes, caller may want to merge even small 2216 * number of blocks to an existing extent */ 2217 BUG_ON(ex.fe_len <= 0); 2218 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2219 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2220 ac->ac_found++; 2221 ac->ac_b_ex = ex; 2222 ext4_mb_use_best_found(ac, e4b); 2223 } 2224 ext4_unlock_group(ac->ac_sb, group); 2225 ext4_mb_unload_buddy(e4b); 2226 2227 return 0; 2228 } 2229 2230 /* 2231 * The routine scans buddy structures (not bitmap!) from given order 2232 * to max order and tries to find big enough chunk to satisfy the req 2233 */ 2234 static noinline_for_stack 2235 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 2236 struct ext4_buddy *e4b) 2237 { 2238 struct super_block *sb = ac->ac_sb; 2239 struct ext4_group_info *grp = e4b->bd_info; 2240 void *buddy; 2241 int i; 2242 int k; 2243 int max; 2244 2245 BUG_ON(ac->ac_2order <= 0); 2246 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { 2247 if (grp->bb_counters[i] == 0) 2248 continue; 2249 2250 buddy = mb_find_buddy(e4b, i, &max); 2251 if (WARN_RATELIMIT(buddy == NULL, 2252 "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) 2253 continue; 2254 2255 k = mb_find_next_zero_bit(buddy, max, 0); 2256 if (k >= max) { 2257 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, 2258 "%d free clusters of order %d. But found 0", 2259 grp->bb_counters[i], i); 2260 ext4_mark_group_bitmap_corrupted(ac->ac_sb, 2261 e4b->bd_group, 2262 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2263 break; 2264 } 2265 ac->ac_found++; 2266 2267 ac->ac_b_ex.fe_len = 1 << i; 2268 ac->ac_b_ex.fe_start = k << i; 2269 ac->ac_b_ex.fe_group = e4b->bd_group; 2270 2271 ext4_mb_use_best_found(ac, e4b); 2272 2273 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); 2274 2275 if (EXT4_SB(sb)->s_mb_stats) 2276 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 2277 2278 break; 2279 } 2280 } 2281 2282 /* 2283 * The routine scans the group and measures all found extents. 2284 * In order to optimize scanning, caller must pass number of 2285 * free blocks in the group, so the routine can know upper limit. 2286 */ 2287 static noinline_for_stack 2288 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 2289 struct ext4_buddy *e4b) 2290 { 2291 struct super_block *sb = ac->ac_sb; 2292 void *bitmap = e4b->bd_bitmap; 2293 struct ext4_free_extent ex; 2294 int i; 2295 int free; 2296 2297 free = e4b->bd_info->bb_free; 2298 if (WARN_ON(free <= 0)) 2299 return; 2300 2301 i = e4b->bd_info->bb_first_free; 2302 2303 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 2304 i = mb_find_next_zero_bit(bitmap, 2305 EXT4_CLUSTERS_PER_GROUP(sb), i); 2306 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { 2307 /* 2308 * IF we have corrupt bitmap, we won't find any 2309 * free blocks even though group info says we 2310 * have free blocks 2311 */ 2312 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2313 "%d free clusters as per " 2314 "group info. 
But bitmap says 0", 2315 free); 2316 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2317 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2318 break; 2319 } 2320 2321 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); 2322 if (WARN_ON(ex.fe_len <= 0)) 2323 break; 2324 if (free < ex.fe_len) { 2325 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2326 "%d free clusters as per " 2327 "group info. But got %d blocks", 2328 free, ex.fe_len); 2329 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2330 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2331 /* 2332 * The number of free blocks differs. This mostly 2333 * indicate that the bitmap is corrupt. So exit 2334 * without claiming the space. 2335 */ 2336 break; 2337 } 2338 ex.fe_logical = 0xDEADC0DE; /* debug value */ 2339 ext4_mb_measure_extent(ac, &ex, e4b); 2340 2341 i += ex.fe_len; 2342 free -= ex.fe_len; 2343 } 2344 2345 ext4_mb_check_limits(ac, e4b, 1); 2346 } 2347 2348 /* 2349 * This is a special case for storages like raid5 2350 * we try to find stripe-aligned chunks for stripe-size-multiple requests 2351 */ 2352 static noinline_for_stack 2353 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 2354 struct ext4_buddy *e4b) 2355 { 2356 struct super_block *sb = ac->ac_sb; 2357 struct ext4_sb_info *sbi = EXT4_SB(sb); 2358 void *bitmap = e4b->bd_bitmap; 2359 struct ext4_free_extent ex; 2360 ext4_fsblk_t first_group_block; 2361 ext4_fsblk_t a; 2362 ext4_grpblk_t i; 2363 int max; 2364 2365 BUG_ON(sbi->s_stripe == 0); 2366 2367 /* find first stripe-aligned block in group */ 2368 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); 2369 2370 a = first_group_block + sbi->s_stripe - 1; 2371 do_div(a, sbi->s_stripe); 2372 i = (a * sbi->s_stripe) - first_group_block; 2373 2374 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 2375 if (!mb_test_bit(i, bitmap)) { 2376 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 2377 if (max >= sbi->s_stripe) { 2378 ac->ac_found++; 2379 ex.fe_logical = 0xDEADF00D; /* debug value */ 2380 ac->ac_b_ex = ex; 2381 ext4_mb_use_best_found(ac, e4b); 2382 break; 2383 } 2384 } 2385 i += sbi->s_stripe; 2386 } 2387 } 2388 2389 /* 2390 * This is also called BEFORE we load the buddy bitmap. 2391 * Returns either 1 or 0 indicating that the group is either suitable 2392 * for the allocation or not. 
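 *
 * In short, per allocation criteria as implemented below: cr=0 needs a
 * free buddy of at least the requested order, cr=1 needs the average
 * fragment (bb_free / bb_fragments) to cover the request, cr=2 only
 * needs enough free clusters in total, and cr=3 accepts any group with
 * free space. For example, a group with bb_free = 100 and
 * bb_fragments = 10 passes cr=1 for requests of up to 10 clusters.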
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
				ext4_group_t group, int cr)
{
	ext4_grpblk_t free, fragments;
	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

	BUG_ON(cr < 0 || cr >= 4);

	if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		return false;

	free = grp->bb_free;
	if (free == 0)
		return false;

	fragments = grp->bb_fragments;
	if (fragments == 0)
		return false;

	switch (cr) {
	case 0:
		BUG_ON(ac->ac_2order == 0);

		/* Avoid using the first bg of a flexgroup for data files */
		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
		    ((group % flex_size) == 0))
			return false;

		if (free < ac->ac_g_ex.fe_len)
			return false;

		if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
			return true;

		if (grp->bb_largest_free_order < ac->ac_2order)
			return false;

		return true;
	case 1:
		if ((free / fragments) >= ac->ac_g_ex.fe_len)
			return true;
		break;
	case 2:
		if (free >= ac->ac_g_ex.fe_len)
			return true;
		break;
	case 3:
		return true;
	default:
		BUG();
	}

	return false;
}

/*
 * This could return a negative error code if something goes wrong
 * during ext4_mb_init_group(). This should not be called with
 * ext4_lock_group() held.
 *
 * Note: because we are conditionally operating with the group lock in
 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
 * function using __acquire and __release.  This means we need to be
 * super careful before messing with the error path handling via "goto
 * out"!
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
				     ext4_group_t group, int cr)
{
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
	ext4_grpblk_t free;
	int ret = 0;

	if (!grp)
		return -EFSCORRUPTED;
	if (sbi->s_mb_stats)
		atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
	if (should_lock) {
		ext4_lock_group(sb, group);
		__release(ext4_group_lock_ptr(sb, group));
	}
	free = grp->bb_free;
	if (free == 0)
		goto out;
	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
		goto out;
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		goto out;
	if (should_lock) {
		__acquire(ext4_group_lock_ptr(sb, group));
		ext4_unlock_group(sb, group);
	}

	/* We only do this if the grp has never been initialized */
	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		struct ext4_group_desc *gdp =
			ext4_get_group_desc(sb, group, NULL);
		int ret;

		/* cr=0/1 is a very optimistic search to find large
		 * good chunks almost for free.  If buddy data is not
		 * ready, then this optimization makes no sense.  But
		 * we never skip the first block group in a flex_bg,
		 * since this gets used for metadata block allocation,
		 * and we want to make sure we locate metadata blocks
		 * in the first block group in the flex_bg if possible.
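		 *
		 * The test below picks those first groups out by masking.
		 * With, say, s_log_groups_per_flex = 4 (16 groups per
		 * flex_bg), the expression (group & ((1 << 4) - 1)) == 0
		 * holds for groups 0, 16, 32, ..., so at cr=0/1 those groups
		 * are still initialized here, as are groups whose descriptor
		 * is marked BLOCK_UNINIT (their init needs no I/O).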
		 */
		if (cr < 2 &&
		    (!sbi->s_log_groups_per_flex ||
		     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
		    !(ext4_has_group_desc_csum(sb) &&
		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
			return 0;
		ret = ext4_mb_init_group(sb, group, GFP_NOFS);
		if (ret)
			return ret;
	}

	if (should_lock) {
		ext4_lock_group(sb, group);
		__release(ext4_group_lock_ptr(sb, group));
	}
	ret = ext4_mb_good_group(ac, group, cr);
out:
	if (should_lock) {
		__acquire(ext4_group_lock_ptr(sb, group));
		ext4_unlock_group(sb, group);
	}
	return ret;
}

/*
 * Start prefetching @nr block bitmaps starting at @group.
 * Return the next group which needs to be prefetched.
 */
ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
			      unsigned int nr, int *cnt)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	struct buffer_head *bh;
	struct blk_plug plug;

	blk_start_plug(&plug);
	while (nr-- > 0) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
								  NULL);
		struct ext4_group_info *grp = ext4_get_group_info(sb, group);

		/*
		 * Prefetch block groups with free blocks; but don't
		 * bother if it is marked uninitialized on disk, since
		 * it won't require I/O to read.  Also only try to
		 * prefetch once, so we avoid a getblk() call, which can
		 * be expensive.
		 */
		if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
		    EXT4_MB_GRP_NEED_INIT(grp) &&
		    ext4_free_group_clusters(sb, gdp) > 0 &&
		    !(ext4_has_group_desc_csum(sb) &&
		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
			bh = ext4_read_block_bitmap_nowait(sb, group, true);
			if (bh && !IS_ERR(bh)) {
				if (!buffer_uptodate(bh) && cnt)
					(*cnt)++;
				brelse(bh);
			}
		}
		if (++group >= ngroups)
			group = 0;
	}
	blk_finish_plug(&plug);
	return group;
}

/*
 * Prefetching reads the block bitmap into the buffer cache; but we
 * need to make sure that the buddy bitmap in the page cache has been
 * initialized.  Note that ext4_mb_init_group() will block if the I/O
 * is not yet completed, or indeed if ext4_mb_prefetch did not start
 * the I/O at all.
 *
 * TODO: We should actually kick off the buddy bitmap setup in a work
 * queue when the buffer I/O is completed, so that we don't block
 * waiting for the block allocation bitmap read to finish when
 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
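 *
 * The expected calling pattern, as used by ext4_mb_regular_allocator()
 * below (simplified sketch):
 *
 *	prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
 *	... scan candidate groups ...
 *	if (nr)
 *		ext4_mb_prefetch_fini(sb, prefetch_grp, nr);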
2584 */ 2585 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, 2586 unsigned int nr) 2587 { 2588 struct ext4_group_desc *gdp; 2589 struct ext4_group_info *grp; 2590 2591 while (nr-- > 0) { 2592 if (!group) 2593 group = ext4_get_groups_count(sb); 2594 group--; 2595 gdp = ext4_get_group_desc(sb, group, NULL); 2596 grp = ext4_get_group_info(sb, group); 2597 2598 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && 2599 ext4_free_group_clusters(sb, gdp) > 0 && 2600 !(ext4_has_group_desc_csum(sb) && 2601 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { 2602 if (ext4_mb_init_group(sb, group, GFP_NOFS)) 2603 break; 2604 } 2605 } 2606 } 2607 2608 static noinline_for_stack int 2609 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2610 { 2611 ext4_group_t prefetch_grp = 0, ngroups, group, i; 2612 int cr = -1, new_cr; 2613 int err = 0, first_err = 0; 2614 unsigned int nr = 0, prefetch_ios = 0; 2615 struct ext4_sb_info *sbi; 2616 struct super_block *sb; 2617 struct ext4_buddy e4b; 2618 int lost; 2619 2620 sb = ac->ac_sb; 2621 sbi = EXT4_SB(sb); 2622 ngroups = ext4_get_groups_count(sb); 2623 /* non-extent files are limited to low blocks/groups */ 2624 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 2625 ngroups = sbi->s_blockfile_groups; 2626 2627 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2628 2629 /* first, try the goal */ 2630 err = ext4_mb_find_by_goal(ac, &e4b); 2631 if (err || ac->ac_status == AC_STATUS_FOUND) 2632 goto out; 2633 2634 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2635 goto out; 2636 2637 /* 2638 * ac->ac_2order is set only if the fe_len is a power of 2 2639 * if ac->ac_2order is set we also set criteria to 0 so that we 2640 * try exact allocation using buddy. 2641 */ 2642 i = fls(ac->ac_g_ex.fe_len); 2643 ac->ac_2order = 0; 2644 /* 2645 * We search using buddy data only if the order of the request 2646 * is greater than equal to the sbi_s_mb_order2_reqs 2647 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 2648 * We also support searching for power-of-two requests only for 2649 * requests upto maximum buddy size we have constructed. 2650 */ 2651 if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { 2652 /* 2653 * This should tell if fe_len is exactly power of 2 2654 */ 2655 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2656 ac->ac_2order = array_index_nospec(i - 1, 2657 MB_NUM_ORDERS(sb)); 2658 } 2659 2660 /* if stream allocation is enabled, use global goal */ 2661 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2662 /* TBD: may be hot point */ 2663 spin_lock(&sbi->s_md_lock); 2664 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2665 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2666 spin_unlock(&sbi->s_md_lock); 2667 } 2668 2669 /* Let's just scan groups to find more-less suitable blocks */ 2670 cr = ac->ac_2order ? 
0 : 1; 2671 /* 2672 * cr == 0 try to get exact allocation, 2673 * cr == 3 try to get anything 2674 */ 2675 repeat: 2676 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2677 ac->ac_criteria = cr; 2678 /* 2679 * searching for the right group start 2680 * from the goal value specified 2681 */ 2682 group = ac->ac_g_ex.fe_group; 2683 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2684 prefetch_grp = group; 2685 2686 for (i = 0, new_cr = cr; i < ngroups; i++, 2687 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 2688 int ret = 0; 2689 2690 cond_resched(); 2691 if (new_cr != cr) { 2692 cr = new_cr; 2693 goto repeat; 2694 } 2695 2696 /* 2697 * Batch reads of the block allocation bitmaps 2698 * to get multiple READs in flight; limit 2699 * prefetching at cr=0/1, otherwise mballoc can 2700 * spend a lot of time loading imperfect groups 2701 */ 2702 if ((prefetch_grp == group) && 2703 (cr > 1 || 2704 prefetch_ios < sbi->s_mb_prefetch_limit)) { 2705 unsigned int curr_ios = prefetch_ios; 2706 2707 nr = sbi->s_mb_prefetch; 2708 if (ext4_has_feature_flex_bg(sb)) { 2709 nr = 1 << sbi->s_log_groups_per_flex; 2710 nr -= group & (nr - 1); 2711 nr = min(nr, sbi->s_mb_prefetch); 2712 } 2713 prefetch_grp = ext4_mb_prefetch(sb, group, 2714 nr, &prefetch_ios); 2715 if (prefetch_ios == curr_ios) 2716 nr = 0; 2717 } 2718 2719 /* This now checks without needing the buddy page */ 2720 ret = ext4_mb_good_group_nolock(ac, group, cr); 2721 if (ret <= 0) { 2722 if (!first_err) 2723 first_err = ret; 2724 continue; 2725 } 2726 2727 err = ext4_mb_load_buddy(sb, group, &e4b); 2728 if (err) 2729 goto out; 2730 2731 ext4_lock_group(sb, group); 2732 2733 /* 2734 * We need to check again after locking the 2735 * block group 2736 */ 2737 ret = ext4_mb_good_group(ac, group, cr); 2738 if (ret == 0) { 2739 ext4_unlock_group(sb, group); 2740 ext4_mb_unload_buddy(&e4b); 2741 continue; 2742 } 2743 2744 ac->ac_groups_scanned++; 2745 if (cr == 0) 2746 ext4_mb_simple_scan_group(ac, &e4b); 2747 else if (cr == 1 && sbi->s_stripe && 2748 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2749 ext4_mb_scan_aligned(ac, &e4b); 2750 else 2751 ext4_mb_complex_scan_group(ac, &e4b); 2752 2753 ext4_unlock_group(sb, group); 2754 ext4_mb_unload_buddy(&e4b); 2755 2756 if (ac->ac_status != AC_STATUS_CONTINUE) 2757 break; 2758 } 2759 /* Processed all groups and haven't found blocks */ 2760 if (sbi->s_mb_stats && i == ngroups) 2761 atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2762 } 2763 2764 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2765 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2766 /* 2767 * We've been searching too long. Let's try to allocate 2768 * the best chunk we've found so far 2769 */ 2770 ext4_mb_try_best_found(ac, &e4b); 2771 if (ac->ac_status != AC_STATUS_FOUND) { 2772 /* 2773 * Someone more lucky has already allocated it. 
2774 * The only thing we can do is just take first 2775 * found block(s) 2776 */ 2777 lost = atomic_inc_return(&sbi->s_mb_lost_chunks); 2778 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", 2779 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, 2780 ac->ac_b_ex.fe_len, lost); 2781 2782 ac->ac_b_ex.fe_group = 0; 2783 ac->ac_b_ex.fe_start = 0; 2784 ac->ac_b_ex.fe_len = 0; 2785 ac->ac_status = AC_STATUS_CONTINUE; 2786 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2787 cr = 3; 2788 goto repeat; 2789 } 2790 } 2791 2792 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2793 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2794 out: 2795 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 2796 err = first_err; 2797 2798 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", 2799 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, 2800 ac->ac_flags, cr, err); 2801 2802 if (nr) 2803 ext4_mb_prefetch_fini(sb, prefetch_grp, nr); 2804 2805 return err; 2806 } 2807 2808 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2809 { 2810 struct super_block *sb = pde_data(file_inode(seq->file)); 2811 ext4_group_t group; 2812 2813 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2814 return NULL; 2815 group = *pos + 1; 2816 return (void *) ((unsigned long) group); 2817 } 2818 2819 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2820 { 2821 struct super_block *sb = pde_data(file_inode(seq->file)); 2822 ext4_group_t group; 2823 2824 ++*pos; 2825 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2826 return NULL; 2827 group = *pos + 1; 2828 return (void *) ((unsigned long) group); 2829 } 2830 2831 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2832 { 2833 struct super_block *sb = pde_data(file_inode(seq->file)); 2834 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2835 int i; 2836 int err, buddy_loaded = 0; 2837 struct ext4_buddy e4b; 2838 struct ext4_group_info *grinfo; 2839 unsigned char blocksize_bits = min_t(unsigned char, 2840 sb->s_blocksize_bits, 2841 EXT4_MAX_BLOCK_LOG_SIZE); 2842 struct sg { 2843 struct ext4_group_info info; 2844 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; 2845 } sg; 2846 2847 group--; 2848 if (group == 0) 2849 seq_puts(seq, "#group: free frags first [" 2850 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " 2851 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); 2852 2853 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2854 sizeof(struct ext4_group_info); 2855 2856 grinfo = ext4_get_group_info(sb, group); 2857 if (!grinfo) 2858 return 0; 2859 /* Load the group info in memory only if not already loaded. */ 2860 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { 2861 err = ext4_mb_load_buddy(sb, group, &e4b); 2862 if (err) { 2863 seq_printf(seq, "#%-5u: I/O error\n", group); 2864 return 0; 2865 } 2866 buddy_loaded = 1; 2867 } 2868 2869 memcpy(&sg, grinfo, i); 2870 2871 if (buddy_loaded) 2872 ext4_mb_unload_buddy(&e4b); 2873 2874 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2875 sg.info.bb_fragments, sg.info.bb_first_free); 2876 for (i = 0; i <= 13; i++) 2877 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
2878 sg.info.bb_counters[i] : 0); 2879 seq_puts(seq, " ]\n"); 2880 2881 return 0; 2882 } 2883 2884 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2885 { 2886 } 2887 2888 const struct seq_operations ext4_mb_seq_groups_ops = { 2889 .start = ext4_mb_seq_groups_start, 2890 .next = ext4_mb_seq_groups_next, 2891 .stop = ext4_mb_seq_groups_stop, 2892 .show = ext4_mb_seq_groups_show, 2893 }; 2894 2895 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 2896 { 2897 struct super_block *sb = seq->private; 2898 struct ext4_sb_info *sbi = EXT4_SB(sb); 2899 2900 seq_puts(seq, "mballoc:\n"); 2901 if (!sbi->s_mb_stats) { 2902 seq_puts(seq, "\tmb stats collection turned off.\n"); 2903 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 2904 return 0; 2905 } 2906 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 2907 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 2908 2909 seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); 2910 2911 seq_puts(seq, "\tcr0_stats:\n"); 2912 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); 2913 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2914 atomic64_read(&sbi->s_bal_cX_groups_considered[0])); 2915 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2916 atomic64_read(&sbi->s_bal_cX_failed[0])); 2917 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2918 atomic_read(&sbi->s_bal_cr0_bad_suggestions)); 2919 2920 seq_puts(seq, "\tcr1_stats:\n"); 2921 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); 2922 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2923 atomic64_read(&sbi->s_bal_cX_groups_considered[1])); 2924 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2925 atomic64_read(&sbi->s_bal_cX_failed[1])); 2926 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2927 atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2928 2929 seq_puts(seq, "\tcr2_stats:\n"); 2930 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); 2931 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2932 atomic64_read(&sbi->s_bal_cX_groups_considered[2])); 2933 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2934 atomic64_read(&sbi->s_bal_cX_failed[2])); 2935 2936 seq_puts(seq, "\tcr3_stats:\n"); 2937 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); 2938 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2939 atomic64_read(&sbi->s_bal_cX_groups_considered[3])); 2940 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2941 atomic64_read(&sbi->s_bal_cX_failed[3])); 2942 seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); 2943 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 2944 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 2945 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 2946 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 2947 2948 seq_printf(seq, "\tbuddies_generated: %u/%u\n", 2949 atomic_read(&sbi->s_mb_buddies_generated), 2950 ext4_get_groups_count(sb)); 2951 seq_printf(seq, "\tbuddies_time_used: %llu\n", 2952 atomic64_read(&sbi->s_mb_generation_time)); 2953 seq_printf(seq, "\tpreallocated: %u\n", 2954 atomic_read(&sbi->s_mb_preallocated)); 2955 seq_printf(seq, "\tdiscarded: %u\n", 2956 atomic_read(&sbi->s_mb_discarded)); 2957 return 0; 2958 } 2959 2960 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 2961 __acquires(&EXT4_SB(sb)->s_mb_rb_lock) 2962 { 2963 
struct super_block *sb = pde_data(file_inode(seq->file)); 2964 unsigned long position; 2965 2966 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2967 return NULL; 2968 position = *pos + 1; 2969 return (void *) ((unsigned long) position); 2970 } 2971 2972 static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 2973 { 2974 struct super_block *sb = pde_data(file_inode(seq->file)); 2975 unsigned long position; 2976 2977 ++*pos; 2978 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2979 return NULL; 2980 position = *pos + 1; 2981 return (void *) ((unsigned long) position); 2982 } 2983 2984 static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 2985 { 2986 struct super_block *sb = pde_data(file_inode(seq->file)); 2987 struct ext4_sb_info *sbi = EXT4_SB(sb); 2988 unsigned long position = ((unsigned long) v); 2989 struct ext4_group_info *grp; 2990 unsigned int count; 2991 2992 position--; 2993 if (position >= MB_NUM_ORDERS(sb)) { 2994 position -= MB_NUM_ORDERS(sb); 2995 if (position == 0) 2996 seq_puts(seq, "avg_fragment_size_lists:\n"); 2997 2998 count = 0; 2999 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 3000 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 3001 bb_avg_fragment_size_node) 3002 count++; 3003 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 3004 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3005 (unsigned int)position, count); 3006 return 0; 3007 } 3008 3009 if (position == 0) { 3010 seq_printf(seq, "optimize_scan: %d\n", 3011 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); 3012 seq_puts(seq, "max_free_order_lists:\n"); 3013 } 3014 count = 0; 3015 read_lock(&sbi->s_mb_largest_free_orders_locks[position]); 3016 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 3017 bb_largest_free_order_node) 3018 count++; 3019 read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); 3020 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3021 (unsigned int)position, count); 3022 3023 return 0; 3024 } 3025 3026 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 3027 { 3028 } 3029 3030 const struct seq_operations ext4_mb_seq_structs_summary_ops = { 3031 .start = ext4_mb_seq_structs_summary_start, 3032 .next = ext4_mb_seq_structs_summary_next, 3033 .stop = ext4_mb_seq_structs_summary_stop, 3034 .show = ext4_mb_seq_structs_summary_show, 3035 }; 3036 3037 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 3038 { 3039 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3040 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 3041 3042 BUG_ON(!cachep); 3043 return cachep; 3044 } 3045 3046 /* 3047 * Allocate the top-level s_group_info array for the specified number 3048 * of groups 3049 */ 3050 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) 3051 { 3052 struct ext4_sb_info *sbi = EXT4_SB(sb); 3053 unsigned size; 3054 struct ext4_group_info ***old_groupinfo, ***new_groupinfo; 3055 3056 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> 3057 EXT4_DESC_PER_BLOCK_BITS(sb); 3058 if (size <= sbi->s_group_info_size) 3059 return 0; 3060 3061 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); 3062 new_groupinfo = kvzalloc(size, GFP_KERNEL); 3063 if (!new_groupinfo) { 3064 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 3065 return -ENOMEM; 3066 } 3067 rcu_read_lock(); 3068 old_groupinfo = rcu_dereference(sbi->s_group_info); 3069 if (old_groupinfo) 3070 memcpy(new_groupinfo, old_groupinfo, 3071 
sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 3072 rcu_read_unlock(); 3073 rcu_assign_pointer(sbi->s_group_info, new_groupinfo); 3074 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 3075 if (old_groupinfo) 3076 ext4_kvfree_array_rcu(old_groupinfo); 3077 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 3078 sbi->s_group_info_size); 3079 return 0; 3080 } 3081 3082 /* Create and initialize ext4_group_info data for the given group. */ 3083 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 3084 struct ext4_group_desc *desc) 3085 { 3086 int i; 3087 int metalen = 0; 3088 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); 3089 struct ext4_sb_info *sbi = EXT4_SB(sb); 3090 struct ext4_group_info **meta_group_info; 3091 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3092 3093 /* 3094 * First check if this group is the first of a reserved block. 3095 * If it's true, we have to allocate a new table of pointers 3096 * to ext4_group_info structures 3097 */ 3098 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3099 metalen = sizeof(*meta_group_info) << 3100 EXT4_DESC_PER_BLOCK_BITS(sb); 3101 meta_group_info = kmalloc(metalen, GFP_NOFS); 3102 if (meta_group_info == NULL) { 3103 ext4_msg(sb, KERN_ERR, "can't allocate mem " 3104 "for a buddy group"); 3105 return -ENOMEM; 3106 } 3107 rcu_read_lock(); 3108 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; 3109 rcu_read_unlock(); 3110 } 3111 3112 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); 3113 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 3114 3115 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); 3116 if (meta_group_info[i] == NULL) { 3117 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 3118 goto exit_group_info; 3119 } 3120 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 3121 &(meta_group_info[i]->bb_state)); 3122 3123 /* 3124 * initialize bb_free to be able to skip 3125 * empty groups without initialization 3126 */ 3127 if (ext4_has_group_desc_csum(sb) && 3128 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3129 meta_group_info[i]->bb_free = 3130 ext4_free_clusters_after_init(sb, group, desc); 3131 } else { 3132 meta_group_info[i]->bb_free = 3133 ext4_free_group_clusters(sb, desc); 3134 } 3135 3136 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3137 init_rwsem(&meta_group_info[i]->alloc_sem); 3138 meta_group_info[i]->bb_free_root = RB_ROOT; 3139 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 3140 INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); 3141 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 3142 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ 3143 meta_group_info[i]->bb_group = group; 3144 3145 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); 3146 return 0; 3147 3148 exit_group_info: 3149 /* If a meta_group_info table has been allocated, release it now */ 3150 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3151 struct ext4_group_info ***group_info; 3152 3153 rcu_read_lock(); 3154 group_info = rcu_dereference(sbi->s_group_info); 3155 kfree(group_info[idx]); 3156 group_info[idx] = NULL; 3157 rcu_read_unlock(); 3158 } 3159 return -ENOMEM; 3160 } /* ext4_mb_add_groupinfo */ 3161 3162 static int ext4_mb_init_backend(struct super_block *sb) 3163 { 3164 ext4_group_t ngroups = ext4_get_groups_count(sb); 3165 ext4_group_t i; 3166 struct ext4_sb_info *sbi = EXT4_SB(sb); 3167 int err; 3168 struct ext4_group_desc *desc; 3169 struct ext4_group_info ***group_info; 3170 struct kmem_cache 
*cachep;

	err = ext4_mb_alloc_groupinfo(sb, ngroups);
	if (err)
		return err;

	sbi->s_buddy_cache = new_inode(sb);
	if (sbi->s_buddy_cache == NULL) {
		ext4_msg(sb, KERN_ERR, "can't get new inode");
		goto err_freesgi;
	}
	/* To avoid potentially colliding with a valid on-disk inode number,
	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
	 * not in the inode hash, so it should never be found by iget(), but
	 * this will avoid confusion if it ever shows up during debugging. */
	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
	for (i = 0; i < ngroups; i++) {
		cond_resched();
		desc = ext4_get_group_desc(sb, i, NULL);
		if (desc == NULL) {
			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
			goto err_freebuddy;
		}
		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
			goto err_freebuddy;
	}

	if (ext4_has_feature_flex_bg(sb)) {
		/* a single flex group is supposed to be read by a single IO.
		 * 1 << s_log_groups_per_flex must not overflow s_mb_prefetch,
		 * which is an unsigned int, so shifts of 32 or more are
		 * rejected.
		 */
		if (sbi->s_es->s_log_groups_per_flex >= 32) {
			ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
			goto err_freebuddy;
		}
		sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
					   BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
	} else {
		sbi->s_mb_prefetch = 32;
	}
	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
	/* how many real IOs to prefetch within a single allocation at cr=0;
	 * since cr=0 is a CPU-related optimization we shouldn't try to
	 * load too many groups, at some point we should start to use what
	 * we've got in memory.
3219 * with an average random access time 5ms, it'd take a second to get 3220 * 200 groups (* N with flex_bg), so let's make this limit 4 3221 */ 3222 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; 3223 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) 3224 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); 3225 3226 return 0; 3227 3228 err_freebuddy: 3229 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3230 while (i-- > 0) { 3231 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3232 3233 if (grp) 3234 kmem_cache_free(cachep, grp); 3235 } 3236 i = sbi->s_group_info_size; 3237 rcu_read_lock(); 3238 group_info = rcu_dereference(sbi->s_group_info); 3239 while (i-- > 0) 3240 kfree(group_info[i]); 3241 rcu_read_unlock(); 3242 iput(sbi->s_buddy_cache); 3243 err_freesgi: 3244 rcu_read_lock(); 3245 kvfree(rcu_dereference(sbi->s_group_info)); 3246 rcu_read_unlock(); 3247 return -ENOMEM; 3248 } 3249 3250 static void ext4_groupinfo_destroy_slabs(void) 3251 { 3252 int i; 3253 3254 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 3255 kmem_cache_destroy(ext4_groupinfo_caches[i]); 3256 ext4_groupinfo_caches[i] = NULL; 3257 } 3258 } 3259 3260 static int ext4_groupinfo_create_slab(size_t size) 3261 { 3262 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 3263 int slab_size; 3264 int blocksize_bits = order_base_2(size); 3265 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3266 struct kmem_cache *cachep; 3267 3268 if (cache_index >= NR_GRPINFO_CACHES) 3269 return -EINVAL; 3270 3271 if (unlikely(cache_index < 0)) 3272 cache_index = 0; 3273 3274 mutex_lock(&ext4_grpinfo_slab_create_mutex); 3275 if (ext4_groupinfo_caches[cache_index]) { 3276 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3277 return 0; /* Already created */ 3278 } 3279 3280 slab_size = offsetof(struct ext4_group_info, 3281 bb_counters[blocksize_bits + 2]); 3282 3283 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 3284 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 3285 NULL); 3286 3287 ext4_groupinfo_caches[cache_index] = cachep; 3288 3289 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3290 if (!cachep) { 3291 printk(KERN_EMERG 3292 "EXT4-fs: no memory for groupinfo slab cache\n"); 3293 return -ENOMEM; 3294 } 3295 3296 return 0; 3297 } 3298 3299 static void ext4_discard_work(struct work_struct *work) 3300 { 3301 struct ext4_sb_info *sbi = container_of(work, 3302 struct ext4_sb_info, s_discard_work); 3303 struct super_block *sb = sbi->s_sb; 3304 struct ext4_free_data *fd, *nfd; 3305 struct ext4_buddy e4b; 3306 struct list_head discard_list; 3307 ext4_group_t grp, load_grp; 3308 int err = 0; 3309 3310 INIT_LIST_HEAD(&discard_list); 3311 spin_lock(&sbi->s_md_lock); 3312 list_splice_init(&sbi->s_discard_list, &discard_list); 3313 spin_unlock(&sbi->s_md_lock); 3314 3315 load_grp = UINT_MAX; 3316 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { 3317 /* 3318 * If filesystem is umounting or no memory or suffering 3319 * from no space, give up the discard 3320 */ 3321 if ((sb->s_flags & SB_ACTIVE) && !err && 3322 !atomic_read(&sbi->s_retry_alloc_pending)) { 3323 grp = fd->efd_group; 3324 if (grp != load_grp) { 3325 if (load_grp != UINT_MAX) 3326 ext4_mb_unload_buddy(&e4b); 3327 3328 err = ext4_mb_load_buddy(sb, grp, &e4b); 3329 if (err) { 3330 kmem_cache_free(ext4_free_data_cachep, fd); 3331 load_grp = UINT_MAX; 3332 continue; 3333 } else { 3334 load_grp = grp; 3335 } 3336 } 3337 3338 ext4_lock_group(sb, grp); 3339 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, 3340 fd->efd_start_cluster 
+ fd->efd_count - 1, 1); 3341 ext4_unlock_group(sb, grp); 3342 } 3343 kmem_cache_free(ext4_free_data_cachep, fd); 3344 } 3345 3346 if (load_grp != UINT_MAX) 3347 ext4_mb_unload_buddy(&e4b); 3348 } 3349 3350 int ext4_mb_init(struct super_block *sb) 3351 { 3352 struct ext4_sb_info *sbi = EXT4_SB(sb); 3353 unsigned i, j; 3354 unsigned offset, offset_incr; 3355 unsigned max; 3356 int ret; 3357 3358 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); 3359 3360 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 3361 if (sbi->s_mb_offsets == NULL) { 3362 ret = -ENOMEM; 3363 goto out; 3364 } 3365 3366 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); 3367 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 3368 if (sbi->s_mb_maxs == NULL) { 3369 ret = -ENOMEM; 3370 goto out; 3371 } 3372 3373 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 3374 if (ret < 0) 3375 goto out; 3376 3377 /* order 0 is regular bitmap */ 3378 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 3379 sbi->s_mb_offsets[0] = 0; 3380 3381 i = 1; 3382 offset = 0; 3383 offset_incr = 1 << (sb->s_blocksize_bits - 1); 3384 max = sb->s_blocksize << 2; 3385 do { 3386 sbi->s_mb_offsets[i] = offset; 3387 sbi->s_mb_maxs[i] = max; 3388 offset += offset_incr; 3389 offset_incr = offset_incr >> 1; 3390 max = max >> 1; 3391 i++; 3392 } while (i < MB_NUM_ORDERS(sb)); 3393 3394 sbi->s_mb_avg_fragment_size = 3395 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3396 GFP_KERNEL); 3397 if (!sbi->s_mb_avg_fragment_size) { 3398 ret = -ENOMEM; 3399 goto out; 3400 } 3401 sbi->s_mb_avg_fragment_size_locks = 3402 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3403 GFP_KERNEL); 3404 if (!sbi->s_mb_avg_fragment_size_locks) { 3405 ret = -ENOMEM; 3406 goto out; 3407 } 3408 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3409 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 3410 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); 3411 } 3412 sbi->s_mb_largest_free_orders = 3413 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3414 GFP_KERNEL); 3415 if (!sbi->s_mb_largest_free_orders) { 3416 ret = -ENOMEM; 3417 goto out; 3418 } 3419 sbi->s_mb_largest_free_orders_locks = 3420 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3421 GFP_KERNEL); 3422 if (!sbi->s_mb_largest_free_orders_locks) { 3423 ret = -ENOMEM; 3424 goto out; 3425 } 3426 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3427 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 3428 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 3429 } 3430 3431 spin_lock_init(&sbi->s_md_lock); 3432 sbi->s_mb_free_pending = 0; 3433 INIT_LIST_HEAD(&sbi->s_freed_data_list); 3434 INIT_LIST_HEAD(&sbi->s_discard_list); 3435 INIT_WORK(&sbi->s_discard_work, ext4_discard_work); 3436 atomic_set(&sbi->s_retry_alloc_pending, 0); 3437 3438 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 3439 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 3440 sbi->s_mb_stats = MB_DEFAULT_STATS; 3441 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 3442 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 3443 /* 3444 * The default group preallocation is 512, which for 4k block 3445 * sizes translates to 2 megabytes. However for bigalloc file 3446 * systems, this is probably too big (i.e, if the cluster size 3447 * is 1 megabyte, then group preallocation size becomes half a 3448 * gigabyte!). As a default, we will keep a two megabyte 3449 * group pralloc size for cluster sizes up to 64k, and after 3450 * that, we will force a minimum group preallocation size of 3451 * 32 clusters. 
This translates to 8 megs when the cluster 3452 * size is 256k, and 32 megs when the cluster size is 1 meg, 3453 * which seems reasonable as a default. 3454 */ 3455 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> 3456 sbi->s_cluster_bits, 32); 3457 /* 3458 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 3459 * to the lowest multiple of s_stripe which is bigger than 3460 * the s_mb_group_prealloc as determined above. We want 3461 * the preallocation size to be an exact multiple of the 3462 * RAID stripe size so that preallocations don't fragment 3463 * the stripes. 3464 */ 3465 if (sbi->s_stripe > 1) { 3466 sbi->s_mb_group_prealloc = roundup( 3467 sbi->s_mb_group_prealloc, sbi->s_stripe); 3468 } 3469 3470 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 3471 if (sbi->s_locality_groups == NULL) { 3472 ret = -ENOMEM; 3473 goto out; 3474 } 3475 for_each_possible_cpu(i) { 3476 struct ext4_locality_group *lg; 3477 lg = per_cpu_ptr(sbi->s_locality_groups, i); 3478 mutex_init(&lg->lg_mutex); 3479 for (j = 0; j < PREALLOC_TB_SIZE; j++) 3480 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 3481 spin_lock_init(&lg->lg_prealloc_lock); 3482 } 3483 3484 if (bdev_nonrot(sb->s_bdev)) 3485 sbi->s_mb_max_linear_groups = 0; 3486 else 3487 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; 3488 /* init file for buddy data */ 3489 ret = ext4_mb_init_backend(sb); 3490 if (ret != 0) 3491 goto out_free_locality_groups; 3492 3493 return 0; 3494 3495 out_free_locality_groups: 3496 free_percpu(sbi->s_locality_groups); 3497 sbi->s_locality_groups = NULL; 3498 out: 3499 kfree(sbi->s_mb_avg_fragment_size); 3500 kfree(sbi->s_mb_avg_fragment_size_locks); 3501 kfree(sbi->s_mb_largest_free_orders); 3502 kfree(sbi->s_mb_largest_free_orders_locks); 3503 kfree(sbi->s_mb_offsets); 3504 sbi->s_mb_offsets = NULL; 3505 kfree(sbi->s_mb_maxs); 3506 sbi->s_mb_maxs = NULL; 3507 return ret; 3508 } 3509 3510 /* need to called with the ext4 group lock held */ 3511 static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) 3512 { 3513 struct ext4_prealloc_space *pa; 3514 struct list_head *cur, *tmp; 3515 int count = 0; 3516 3517 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 3518 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3519 list_del(&pa->pa_group_list); 3520 count++; 3521 kmem_cache_free(ext4_pspace_cachep, pa); 3522 } 3523 return count; 3524 } 3525 3526 int ext4_mb_release(struct super_block *sb) 3527 { 3528 ext4_group_t ngroups = ext4_get_groups_count(sb); 3529 ext4_group_t i; 3530 int num_meta_group_infos; 3531 struct ext4_group_info *grinfo, ***group_info; 3532 struct ext4_sb_info *sbi = EXT4_SB(sb); 3533 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3534 int count; 3535 3536 if (test_opt(sb, DISCARD)) { 3537 /* 3538 * wait the discard work to drain all of ext4_free_data 3539 */ 3540 flush_work(&sbi->s_discard_work); 3541 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); 3542 } 3543 3544 if (sbi->s_group_info) { 3545 for (i = 0; i < ngroups; i++) { 3546 cond_resched(); 3547 grinfo = ext4_get_group_info(sb, i); 3548 if (!grinfo) 3549 continue; 3550 mb_group_bb_bitmap_free(grinfo); 3551 ext4_lock_group(sb, i); 3552 count = ext4_mb_cleanup_pa(grinfo); 3553 if (count) 3554 mb_debug(sb, "mballoc: %d PAs left\n", 3555 count); 3556 ext4_unlock_group(sb, i); 3557 kmem_cache_free(cachep, grinfo); 3558 } 3559 num_meta_group_infos = (ngroups + 3560 EXT4_DESC_PER_BLOCK(sb) - 1) >> 3561 EXT4_DESC_PER_BLOCK_BITS(sb); 3562 rcu_read_lock(); 3563 
group_info = rcu_dereference(sbi->s_group_info); 3564 for (i = 0; i < num_meta_group_infos; i++) 3565 kfree(group_info[i]); 3566 kvfree(group_info); 3567 rcu_read_unlock(); 3568 } 3569 kfree(sbi->s_mb_avg_fragment_size); 3570 kfree(sbi->s_mb_avg_fragment_size_locks); 3571 kfree(sbi->s_mb_largest_free_orders); 3572 kfree(sbi->s_mb_largest_free_orders_locks); 3573 kfree(sbi->s_mb_offsets); 3574 kfree(sbi->s_mb_maxs); 3575 iput(sbi->s_buddy_cache); 3576 if (sbi->s_mb_stats) { 3577 ext4_msg(sb, KERN_INFO, 3578 "mballoc: %u blocks %u reqs (%u success)", 3579 atomic_read(&sbi->s_bal_allocated), 3580 atomic_read(&sbi->s_bal_reqs), 3581 atomic_read(&sbi->s_bal_success)); 3582 ext4_msg(sb, KERN_INFO, 3583 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " 3584 "%u 2^N hits, %u breaks, %u lost", 3585 atomic_read(&sbi->s_bal_ex_scanned), 3586 atomic_read(&sbi->s_bal_groups_scanned), 3587 atomic_read(&sbi->s_bal_goals), 3588 atomic_read(&sbi->s_bal_2orders), 3589 atomic_read(&sbi->s_bal_breaks), 3590 atomic_read(&sbi->s_mb_lost_chunks)); 3591 ext4_msg(sb, KERN_INFO, 3592 "mballoc: %u generated and it took %llu", 3593 atomic_read(&sbi->s_mb_buddies_generated), 3594 atomic64_read(&sbi->s_mb_generation_time)); 3595 ext4_msg(sb, KERN_INFO, 3596 "mballoc: %u preallocated, %u discarded", 3597 atomic_read(&sbi->s_mb_preallocated), 3598 atomic_read(&sbi->s_mb_discarded)); 3599 } 3600 3601 free_percpu(sbi->s_locality_groups); 3602 3603 return 0; 3604 } 3605 3606 static inline int ext4_issue_discard(struct super_block *sb, 3607 ext4_group_t block_group, ext4_grpblk_t cluster, int count, 3608 struct bio **biop) 3609 { 3610 ext4_fsblk_t discard_block; 3611 3612 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + 3613 ext4_group_first_block_no(sb, block_group)); 3614 count = EXT4_C2B(EXT4_SB(sb), count); 3615 trace_ext4_discard_blocks(sb, 3616 (unsigned long long) discard_block, count); 3617 if (biop) { 3618 return __blkdev_issue_discard(sb->s_bdev, 3619 (sector_t)discard_block << (sb->s_blocksize_bits - 9), 3620 (sector_t)count << (sb->s_blocksize_bits - 9), 3621 GFP_NOFS, biop); 3622 } else 3623 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 3624 } 3625 3626 static void ext4_free_data_in_buddy(struct super_block *sb, 3627 struct ext4_free_data *entry) 3628 { 3629 struct ext4_buddy e4b; 3630 struct ext4_group_info *db; 3631 int err, count = 0; 3632 3633 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", 3634 entry->efd_count, entry->efd_group, entry); 3635 3636 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 3637 /* we expect to find existing buddy because it's pinned */ 3638 BUG_ON(err != 0); 3639 3640 spin_lock(&EXT4_SB(sb)->s_md_lock); 3641 EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; 3642 spin_unlock(&EXT4_SB(sb)->s_md_lock); 3643 3644 db = e4b.bd_info; 3645 /* there are blocks to put in buddy to make them really free */ 3646 count += entry->efd_count; 3647 ext4_lock_group(sb, entry->efd_group); 3648 /* Take it out of per group rb tree */ 3649 rb_erase(&entry->efd_node, &(db->bb_free_root)); 3650 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); 3651 3652 /* 3653 * Clear the trimmed flag for the group so that the next 3654 * ext4_trim_fs can trim it. 3655 * If the volume is mounted with -o discard, online discard 3656 * is supported and the free blocks will be trimmed online. 
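	 *
	 * With "mount -o discard" the trims are not issued here: the freed
	 * entries are spliced onto sbi->s_discard_list by
	 * ext4_process_freed_data() and handled in the background by
	 * sbi->s_discard_work (see ext4_discard_work() above), so this
	 * function only has to keep the buddy and the counters in sync.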
3657 */ 3658 if (!test_opt(sb, DISCARD)) 3659 EXT4_MB_GRP_CLEAR_TRIMMED(db); 3660 3661 if (!db->bb_free_root.rb_node) { 3662 /* No more items in the per group rb tree 3663 * balance refcounts from ext4_mb_free_metadata() 3664 */ 3665 put_page(e4b.bd_buddy_page); 3666 put_page(e4b.bd_bitmap_page); 3667 } 3668 ext4_unlock_group(sb, entry->efd_group); 3669 ext4_mb_unload_buddy(&e4b); 3670 3671 mb_debug(sb, "freed %d blocks in 1 structures\n", count); 3672 } 3673 3674 /* 3675 * This function is called by the jbd2 layer once the commit has finished, 3676 * so we know we can free the blocks that were released with that commit. 3677 */ 3678 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) 3679 { 3680 struct ext4_sb_info *sbi = EXT4_SB(sb); 3681 struct ext4_free_data *entry, *tmp; 3682 struct list_head freed_data_list; 3683 struct list_head *cut_pos = NULL; 3684 bool wake; 3685 3686 INIT_LIST_HEAD(&freed_data_list); 3687 3688 spin_lock(&sbi->s_md_lock); 3689 list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { 3690 if (entry->efd_tid != commit_tid) 3691 break; 3692 cut_pos = &entry->efd_list; 3693 } 3694 if (cut_pos) 3695 list_cut_position(&freed_data_list, &sbi->s_freed_data_list, 3696 cut_pos); 3697 spin_unlock(&sbi->s_md_lock); 3698 3699 list_for_each_entry(entry, &freed_data_list, efd_list) 3700 ext4_free_data_in_buddy(sb, entry); 3701 3702 if (test_opt(sb, DISCARD)) { 3703 spin_lock(&sbi->s_md_lock); 3704 wake = list_empty(&sbi->s_discard_list); 3705 list_splice_tail(&freed_data_list, &sbi->s_discard_list); 3706 spin_unlock(&sbi->s_md_lock); 3707 if (wake) 3708 queue_work(system_unbound_wq, &sbi->s_discard_work); 3709 } else { 3710 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) 3711 kmem_cache_free(ext4_free_data_cachep, entry); 3712 } 3713 } 3714 3715 int __init ext4_init_mballoc(void) 3716 { 3717 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 3718 SLAB_RECLAIM_ACCOUNT); 3719 if (ext4_pspace_cachep == NULL) 3720 goto out; 3721 3722 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 3723 SLAB_RECLAIM_ACCOUNT); 3724 if (ext4_ac_cachep == NULL) 3725 goto out_pa_free; 3726 3727 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, 3728 SLAB_RECLAIM_ACCOUNT); 3729 if (ext4_free_data_cachep == NULL) 3730 goto out_ac_free; 3731 3732 return 0; 3733 3734 out_ac_free: 3735 kmem_cache_destroy(ext4_ac_cachep); 3736 out_pa_free: 3737 kmem_cache_destroy(ext4_pspace_cachep); 3738 out: 3739 return -ENOMEM; 3740 } 3741 3742 void ext4_exit_mballoc(void) 3743 { 3744 /* 3745 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 3746 * before destroying the slab cache. 
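 *
 * The general pattern here (a minimal, generic sketch rather than
 * ext4-specific code) is:
 *
 *	call_rcu(&obj->rcu, cb);	(cb does kmem_cache_free(cachep, obj))
 *	...
 *	rcu_barrier();			(wait for all queued callbacks to run)
 *	kmem_cache_destroy(cachep);	(nothing can touch the cache anymore)
 *
 * Plain synchronize_rcu() would only wait for readers, not for
 * already-queued callbacks, hence rcu_barrier() below.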
3747 */ 3748 rcu_barrier(); 3749 kmem_cache_destroy(ext4_pspace_cachep); 3750 kmem_cache_destroy(ext4_ac_cachep); 3751 kmem_cache_destroy(ext4_free_data_cachep); 3752 ext4_groupinfo_destroy_slabs(); 3753 } 3754 3755 3756 /* 3757 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 3758 * Returns 0 if success or error code 3759 */ 3760 static noinline_for_stack int 3761 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 3762 handle_t *handle, unsigned int reserv_clstrs) 3763 { 3764 struct buffer_head *bitmap_bh = NULL; 3765 struct ext4_group_desc *gdp; 3766 struct buffer_head *gdp_bh; 3767 struct ext4_sb_info *sbi; 3768 struct super_block *sb; 3769 ext4_fsblk_t block; 3770 int err, len; 3771 3772 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3773 BUG_ON(ac->ac_b_ex.fe_len <= 0); 3774 3775 sb = ac->ac_sb; 3776 sbi = EXT4_SB(sb); 3777 3778 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 3779 if (IS_ERR(bitmap_bh)) { 3780 return PTR_ERR(bitmap_bh); 3781 } 3782 3783 BUFFER_TRACE(bitmap_bh, "getting write access"); 3784 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 3785 EXT4_JTR_NONE); 3786 if (err) 3787 goto out_err; 3788 3789 err = -EIO; 3790 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 3791 if (!gdp) 3792 goto out_err; 3793 3794 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3795 ext4_free_group_clusters(sb, gdp)); 3796 3797 BUFFER_TRACE(gdp_bh, "get_write_access"); 3798 err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); 3799 if (err) 3800 goto out_err; 3801 3802 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3803 3804 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 3805 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { 3806 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 3807 "fs metadata", block, block+len); 3808 /* File system mounted not to panic on error 3809 * Fix the bitmap and return EFSCORRUPTED 3810 * We leak some of the blocks here. 3811 */ 3812 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3813 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3814 ac->ac_b_ex.fe_len); 3815 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3816 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3817 if (!err) 3818 err = -EFSCORRUPTED; 3819 goto out_err; 3820 } 3821 3822 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3823 #ifdef AGGRESSIVE_CHECK 3824 { 3825 int i; 3826 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 3827 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 3828 bitmap_bh->b_data)); 3829 } 3830 } 3831 #endif 3832 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3833 ac->ac_b_ex.fe_len); 3834 if (ext4_has_group_desc_csum(sb) && 3835 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3836 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3837 ext4_free_group_clusters_set(sb, gdp, 3838 ext4_free_clusters_after_init(sb, 3839 ac->ac_b_ex.fe_group, gdp)); 3840 } 3841 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 3842 ext4_free_group_clusters_set(sb, gdp, len); 3843 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3844 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 3845 3846 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3847 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 3848 /* 3849 * Now reduce the dirty block count also. 
Should not go negative 3850 */ 3851 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 3852 /* release all the reserved blocks if non delalloc */ 3853 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 3854 reserv_clstrs); 3855 3856 if (sbi->s_log_groups_per_flex) { 3857 ext4_group_t flex_group = ext4_flex_group(sbi, 3858 ac->ac_b_ex.fe_group); 3859 atomic64_sub(ac->ac_b_ex.fe_len, 3860 &sbi_array_rcu_deref(sbi, s_flex_groups, 3861 flex_group)->free_clusters); 3862 } 3863 3864 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3865 if (err) 3866 goto out_err; 3867 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 3868 3869 out_err: 3870 brelse(bitmap_bh); 3871 return err; 3872 } 3873 3874 /* 3875 * Idempotent helper for Ext4 fast commit replay path to set the state of 3876 * blocks in bitmaps and update counters. 3877 */ 3878 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, 3879 int len, int state) 3880 { 3881 struct buffer_head *bitmap_bh = NULL; 3882 struct ext4_group_desc *gdp; 3883 struct buffer_head *gdp_bh; 3884 struct ext4_sb_info *sbi = EXT4_SB(sb); 3885 ext4_group_t group; 3886 ext4_grpblk_t blkoff; 3887 int i, err; 3888 int already; 3889 unsigned int clen, clen_changed, thisgrp_len; 3890 3891 while (len > 0) { 3892 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 3893 3894 /* 3895 * Check to see if we are freeing blocks across a group 3896 * boundary. 3897 * In case of flex_bg, this can happen that (block, len) may 3898 * span across more than one group. In that case we need to 3899 * get the corresponding group metadata to work with. 3900 * For this we have goto again loop. 3901 */ 3902 thisgrp_len = min_t(unsigned int, (unsigned int)len, 3903 EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); 3904 clen = EXT4_NUM_B2C(sbi, thisgrp_len); 3905 3906 if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { 3907 ext4_error(sb, "Marking blocks in system zone - " 3908 "Block = %llu, len = %u", 3909 block, thisgrp_len); 3910 bitmap_bh = NULL; 3911 break; 3912 } 3913 3914 bitmap_bh = ext4_read_block_bitmap(sb, group); 3915 if (IS_ERR(bitmap_bh)) { 3916 err = PTR_ERR(bitmap_bh); 3917 bitmap_bh = NULL; 3918 break; 3919 } 3920 3921 err = -EIO; 3922 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 3923 if (!gdp) 3924 break; 3925 3926 ext4_lock_group(sb, group); 3927 already = 0; 3928 for (i = 0; i < clen; i++) 3929 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == 3930 !state) 3931 already++; 3932 3933 clen_changed = clen - already; 3934 if (state) 3935 mb_set_bits(bitmap_bh->b_data, blkoff, clen); 3936 else 3937 mb_clear_bits(bitmap_bh->b_data, blkoff, clen); 3938 if (ext4_has_group_desc_csum(sb) && 3939 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3940 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3941 ext4_free_group_clusters_set(sb, gdp, 3942 ext4_free_clusters_after_init(sb, group, gdp)); 3943 } 3944 if (state) 3945 clen = ext4_free_group_clusters(sb, gdp) - clen_changed; 3946 else 3947 clen = ext4_free_group_clusters(sb, gdp) + clen_changed; 3948 3949 ext4_free_group_clusters_set(sb, gdp, clen); 3950 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3951 ext4_group_desc_csum_set(sb, group, gdp); 3952 3953 ext4_unlock_group(sb, group); 3954 3955 if (sbi->s_log_groups_per_flex) { 3956 ext4_group_t flex_group = ext4_flex_group(sbi, group); 3957 struct flex_groups *fg = sbi_array_rcu_deref(sbi, 3958 s_flex_groups, flex_group); 3959 3960 if (state) 3961 atomic64_sub(clen_changed, &fg->free_clusters); 3962 else 3963 
atomic64_add(clen_changed, &fg->free_clusters); 3964 3965 } 3966 3967 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 3968 if (err) 3969 break; 3970 sync_dirty_buffer(bitmap_bh); 3971 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 3972 sync_dirty_buffer(gdp_bh); 3973 if (err) 3974 break; 3975 3976 block += thisgrp_len; 3977 len -= thisgrp_len; 3978 brelse(bitmap_bh); 3979 BUG_ON(len < 0); 3980 } 3981 3982 if (err) 3983 brelse(bitmap_bh); 3984 } 3985 3986 /* 3987 * here we normalize the request for the locality group. 3988 * Group requests are normalized to s_mb_group_prealloc, which is 3989 * rounded up to a multiple of s_stripe if a stripe size is set (e.g. via the stripe= mount option). 3990 * s_mb_group_prealloc can be configured via 3991 * /sys/fs/ext4/<partition>/mb_group_prealloc 3992 * 3993 * XXX: should we try to preallocate more than the group has now? 3994 */ 3995 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 3996 { 3997 struct super_block *sb = ac->ac_sb; 3998 struct ext4_locality_group *lg = ac->ac_lg; 3999 4000 BUG_ON(lg == NULL); 4001 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 4002 mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); 4003 } 4004 4005 /* 4006 * This function returns the next element to look at during an inode 4007 * PA rbtree walk. We assume that the caller holds the inode PA rbtree lock 4008 * (ei->i_prealloc_lock). 4009 * 4010 * new_start The start of the range we want to compare 4011 * cur_start The existing start that we are comparing against 4012 * node The node of the rb_tree 4013 */ 4014 static inline struct rb_node* 4015 ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) 4016 { 4017 if (new_start < cur_start) 4018 return node->rb_left; 4019 else 4020 return node->rb_right; 4021 } 4022 4023 static inline void 4024 ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, 4025 ext4_lblk_t start, ext4_lblk_t end) 4026 { 4027 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4028 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4029 struct ext4_prealloc_space *tmp_pa; 4030 ext4_lblk_t tmp_pa_start, tmp_pa_end; 4031 struct rb_node *iter; 4032 4033 read_lock(&ei->i_prealloc_lock); 4034 for (iter = ei->i_prealloc_node.rb_node; iter; 4035 iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { 4036 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4037 pa_node.inode_node); 4038 tmp_pa_start = tmp_pa->pa_lstart; 4039 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4040 4041 spin_lock(&tmp_pa->pa_lock); 4042 if (tmp_pa->pa_deleted == 0) 4043 BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); 4044 spin_unlock(&tmp_pa->pa_lock); 4045 } 4046 read_unlock(&ei->i_prealloc_lock); 4047 } 4048 4049 /* 4050 * Given an allocation context "ac" and a range "start", "end", check 4051 * and adjust boundaries if the range overlaps with any of the existing 4052 * preallocations stored in the corresponding inode of the allocation context.
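 *
 * For example (illustrative numbers): if the normalized range is
 * [100, 300) and one existing PA covers logical blocks [80, 150) while
 * another covers [260, 400), the range is trimmed to [150, 260) so that
 * it no longer overlaps either of them.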
4053 * 4054 * Parameters: 4055 * ac allocation context 4056 * start start of the new range 4057 * end end of the new range 4058 */ 4059 static inline void 4060 ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, 4061 ext4_lblk_t *start, ext4_lblk_t *end) 4062 { 4063 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4064 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4065 struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; 4066 struct rb_node *iter; 4067 ext4_lblk_t new_start, new_end; 4068 ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; 4069 4070 new_start = *start; 4071 new_end = *end; 4072 4073 /* 4074 * Adjust the normalized range so that it doesn't overlap with any 4075 * existing preallocated blocks(PAs). Make sure to hold the rbtree lock 4076 * so it doesn't change underneath us. 4077 */ 4078 read_lock(&ei->i_prealloc_lock); 4079 4080 /* Step 1: find any one immediate neighboring PA of the normalized range */ 4081 for (iter = ei->i_prealloc_node.rb_node; iter; 4082 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4083 tmp_pa_start, iter)) { 4084 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4085 pa_node.inode_node); 4086 tmp_pa_start = tmp_pa->pa_lstart; 4087 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4088 4089 /* PA must not overlap original request */ 4090 spin_lock(&tmp_pa->pa_lock); 4091 if (tmp_pa->pa_deleted == 0) 4092 BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || 4093 ac->ac_o_ex.fe_logical < tmp_pa_start)); 4094 spin_unlock(&tmp_pa->pa_lock); 4095 } 4096 4097 /* 4098 * Step 2: check if the found PA is left or right neighbor and 4099 * get the other neighbor 4100 */ 4101 if (tmp_pa) { 4102 if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { 4103 struct rb_node *tmp; 4104 4105 left_pa = tmp_pa; 4106 tmp = rb_next(&left_pa->pa_node.inode_node); 4107 if (tmp) { 4108 right_pa = rb_entry(tmp, 4109 struct ext4_prealloc_space, 4110 pa_node.inode_node); 4111 } 4112 } else { 4113 struct rb_node *tmp; 4114 4115 right_pa = tmp_pa; 4116 tmp = rb_prev(&right_pa->pa_node.inode_node); 4117 if (tmp) { 4118 left_pa = rb_entry(tmp, 4119 struct ext4_prealloc_space, 4120 pa_node.inode_node); 4121 } 4122 } 4123 } 4124 4125 /* Step 3: get the non deleted neighbors */ 4126 if (left_pa) { 4127 for (iter = &left_pa->pa_node.inode_node;; 4128 iter = rb_prev(iter)) { 4129 if (!iter) { 4130 left_pa = NULL; 4131 break; 4132 } 4133 4134 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4135 pa_node.inode_node); 4136 left_pa = tmp_pa; 4137 spin_lock(&tmp_pa->pa_lock); 4138 if (tmp_pa->pa_deleted == 0) { 4139 spin_unlock(&tmp_pa->pa_lock); 4140 break; 4141 } 4142 spin_unlock(&tmp_pa->pa_lock); 4143 } 4144 } 4145 4146 if (right_pa) { 4147 for (iter = &right_pa->pa_node.inode_node;; 4148 iter = rb_next(iter)) { 4149 if (!iter) { 4150 right_pa = NULL; 4151 break; 4152 } 4153 4154 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4155 pa_node.inode_node); 4156 right_pa = tmp_pa; 4157 spin_lock(&tmp_pa->pa_lock); 4158 if (tmp_pa->pa_deleted == 0) { 4159 spin_unlock(&tmp_pa->pa_lock); 4160 break; 4161 } 4162 spin_unlock(&tmp_pa->pa_lock); 4163 } 4164 } 4165 4166 if (left_pa) { 4167 left_pa_end = 4168 left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); 4169 BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); 4170 } 4171 4172 if (right_pa) { 4173 right_pa_start = right_pa->pa_lstart; 4174 BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); 4175 } 4176 4177 /* Step 4: trim our normalized range to not overlap with the 
neighbors */ 4178 if (left_pa) { 4179 if (left_pa_end > new_start) 4180 new_start = left_pa_end; 4181 } 4182 4183 if (right_pa) { 4184 if (right_pa_start < new_end) 4185 new_end = right_pa_start; 4186 } 4187 read_unlock(&ei->i_prealloc_lock); 4188 4189 /* XXX: extra loop to check we really don't overlap preallocations */ 4190 ext4_mb_pa_assert_overlap(ac, new_start, new_end); 4191 4192 *start = new_start; 4193 *end = new_end; 4194 } 4195 4196 /* 4197 * Normalization means making request better in terms of 4198 * size and alignment 4199 */ 4200 static noinline_for_stack void 4201 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 4202 struct ext4_allocation_request *ar) 4203 { 4204 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4205 struct ext4_super_block *es = sbi->s_es; 4206 int bsbits, max; 4207 ext4_lblk_t end; 4208 loff_t size, start_off; 4209 loff_t orig_size __maybe_unused; 4210 ext4_lblk_t start; 4211 4212 /* do normalize only data requests, metadata requests 4213 do not need preallocation */ 4214 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4215 return; 4216 4217 /* sometime caller may want exact blocks */ 4218 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4219 return; 4220 4221 /* caller may indicate that preallocation isn't 4222 * required (it's a tail, for example) */ 4223 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 4224 return; 4225 4226 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 4227 ext4_mb_normalize_group_request(ac); 4228 return ; 4229 } 4230 4231 bsbits = ac->ac_sb->s_blocksize_bits; 4232 4233 /* first, let's learn actual file size 4234 * given current request is allocated */ 4235 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 4236 size = size << bsbits; 4237 if (size < i_size_read(ac->ac_inode)) 4238 size = i_size_read(ac->ac_inode); 4239 orig_size = size; 4240 4241 /* max size of free chunks */ 4242 max = 2 << bsbits; 4243 4244 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 4245 (req <= (size) || max <= (chunk_size)) 4246 4247 /* first, try to predict filesize */ 4248 /* XXX: should this table be tunable? */ 4249 start_off = 0; 4250 if (size <= 16 * 1024) { 4251 size = 16 * 1024; 4252 } else if (size <= 32 * 1024) { 4253 size = 32 * 1024; 4254 } else if (size <= 64 * 1024) { 4255 size = 64 * 1024; 4256 } else if (size <= 128 * 1024) { 4257 size = 128 * 1024; 4258 } else if (size <= 256 * 1024) { 4259 size = 256 * 1024; 4260 } else if (size <= 512 * 1024) { 4261 size = 512 * 1024; 4262 } else if (size <= 1024 * 1024) { 4263 size = 1024 * 1024; 4264 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 4265 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4266 (21 - bsbits)) << 21; 4267 size = 2 * 1024 * 1024; 4268 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 4269 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4270 (22 - bsbits)) << 22; 4271 size = 4 * 1024 * 1024; 4272 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 4273 (8<<20)>>bsbits, max, 8 * 1024)) { 4274 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4275 (23 - bsbits)) << 23; 4276 size = 8 * 1024 * 1024; 4277 } else { 4278 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; 4279 size = (loff_t) EXT4_C2B(sbi, 4280 ac->ac_o_ex.fe_len) << bsbits; 4281 } 4282 size = size >> bsbits; 4283 start = start_off >> bsbits; 4284 4285 /* 4286 * For tiny groups (smaller than 8MB) the chosen allocation 4287 * alignment may be larger than group size. 
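 * (This can happen, for example, on a filesystem created with a small
 * blocks-per-group value such as "mke2fs -g 256", where a group is far
 * smaller than the multi-megabyte alignments chosen above.)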
Make sure the 4288 * alignment does not move allocation to a different group which 4289 * makes mballoc fail assertions later. 4290 */ 4291 start = max(start, rounddown(ac->ac_o_ex.fe_logical, 4292 (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); 4293 4294 /* don't cover already allocated blocks in selected range */ 4295 if (ar->pleft && start <= ar->lleft) { 4296 size -= ar->lleft + 1 - start; 4297 start = ar->lleft + 1; 4298 } 4299 if (ar->pright && start + size - 1 >= ar->lright) 4300 size -= start + size - ar->lright; 4301 4302 /* 4303 * Trim allocation request for filesystems with artificially small 4304 * groups. 4305 */ 4306 if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) 4307 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); 4308 4309 end = start + size; 4310 4311 ext4_mb_pa_adjust_overlap(ac, &start, &end); 4312 4313 size = end - start; 4314 4315 /* 4316 * In this function "start" and "size" are normalized for better 4317 * alignment and length such that we could preallocate more blocks. 4318 * This normalization is done such that original request of 4319 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and 4320 * "size" boundaries. 4321 * (Note fe_len can be relaxed since FS block allocation API does not 4322 * provide gurantee on number of contiguous blocks allocation since that 4323 * depends upon free space left, etc). 4324 * In case of inode pa, later we use the allocated blocks 4325 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated 4326 * range of goal/best blocks [start, size] to put it at the 4327 * ac_o_ex.fe_logical extent of this inode. 4328 * (See ext4_mb_use_inode_pa() for more details) 4329 */ 4330 if (start + size <= ac->ac_o_ex.fe_logical || 4331 start > ac->ac_o_ex.fe_logical) { 4332 ext4_msg(ac->ac_sb, KERN_ERR, 4333 "start %lu, size %lu, fe_logical %lu", 4334 (unsigned long) start, (unsigned long) size, 4335 (unsigned long) ac->ac_o_ex.fe_logical); 4336 BUG(); 4337 } 4338 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 4339 4340 /* now prepare goal request */ 4341 4342 /* XXX: is it better to align blocks WRT to logical 4343 * placement or satisfy big request as is */ 4344 ac->ac_g_ex.fe_logical = start; 4345 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 4346 4347 /* define goal start in order to merge */ 4348 if (ar->pright && (ar->lright == (start + size)) && 4349 ar->pright >= size && 4350 ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { 4351 /* merge to the right */ 4352 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 4353 &ac->ac_g_ex.fe_group, 4354 &ac->ac_g_ex.fe_start); 4355 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4356 } 4357 if (ar->pleft && (ar->lleft + 1 == start) && 4358 ar->pleft + 1 < ext4_blocks_count(es)) { 4359 /* merge to the left */ 4360 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 4361 &ac->ac_g_ex.fe_group, 4362 &ac->ac_g_ex.fe_start); 4363 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4364 } 4365 4366 mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, 4367 orig_size, start); 4368 } 4369 4370 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 4371 { 4372 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4373 4374 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { 4375 atomic_inc(&sbi->s_bal_reqs); 4376 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 4377 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 4378 atomic_inc(&sbi->s_bal_success); 4379 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 4380 atomic_add(ac->ac_groups_scanned, 
&sbi->s_bal_groups_scanned); 4381 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4382 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4383 atomic_inc(&sbi->s_bal_goals); 4384 if (ac->ac_found > sbi->s_mb_max_to_scan) 4385 atomic_inc(&sbi->s_bal_breaks); 4386 } 4387 4388 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 4389 trace_ext4_mballoc_alloc(ac); 4390 else 4391 trace_ext4_mballoc_prealloc(ac); 4392 } 4393 4394 /* 4395 * Called on failure; free up any blocks from the inode PA for this 4396 * context. We don't need this for MB_GROUP_PA because we only change 4397 * pa_free in ext4_mb_release_context(), but on failure, we've already 4398 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 4399 */ 4400 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 4401 { 4402 struct ext4_prealloc_space *pa = ac->ac_pa; 4403 struct ext4_buddy e4b; 4404 int err; 4405 4406 if (pa == NULL) { 4407 if (ac->ac_f_ex.fe_len == 0) 4408 return; 4409 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 4410 if (WARN_RATELIMIT(err, 4411 "ext4: mb_load_buddy failed (%d)", err)) 4412 /* 4413 * This should never happen since we pin the 4414 * pages in the ext4_allocation_context so 4415 * ext4_mb_load_buddy() should never fail. 4416 */ 4417 return; 4418 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4419 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 4420 ac->ac_f_ex.fe_len); 4421 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4422 ext4_mb_unload_buddy(&e4b); 4423 return; 4424 } 4425 if (pa->pa_type == MB_INODE_PA) { 4426 spin_lock(&pa->pa_lock); 4427 pa->pa_free += ac->ac_b_ex.fe_len; 4428 spin_unlock(&pa->pa_lock); 4429 } 4430 } 4431 4432 /* 4433 * use blocks preallocated to inode 4434 */ 4435 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 4436 struct ext4_prealloc_space *pa) 4437 { 4438 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4439 ext4_fsblk_t start; 4440 ext4_fsblk_t end; 4441 int len; 4442 4443 /* found preallocated blocks, use them */ 4444 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 4445 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), 4446 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); 4447 len = EXT4_NUM_B2C(sbi, end - start); 4448 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 4449 &ac->ac_b_ex.fe_start); 4450 ac->ac_b_ex.fe_len = len; 4451 ac->ac_status = AC_STATUS_FOUND; 4452 ac->ac_pa = pa; 4453 4454 BUG_ON(start < pa->pa_pstart); 4455 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 4456 BUG_ON(pa->pa_free < len); 4457 BUG_ON(ac->ac_b_ex.fe_len <= 0); 4458 pa->pa_free -= len; 4459 4460 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); 4461 } 4462 4463 /* 4464 * use blocks preallocated to locality group 4465 */ 4466 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 4467 struct ext4_prealloc_space *pa) 4468 { 4469 unsigned int len = ac->ac_o_ex.fe_len; 4470 4471 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 4472 &ac->ac_b_ex.fe_group, 4473 &ac->ac_b_ex.fe_start); 4474 ac->ac_b_ex.fe_len = len; 4475 ac->ac_status = AC_STATUS_FOUND; 4476 ac->ac_pa = pa; 4477 4478 /* we don't correct pa_pstart or pa_len here to avoid 4479 * possible race when the group is being loaded concurrently 4480 * instead we correct pa later, after blocks are marked 4481 * in on-disk bitmap -- see ext4_mb_release_context() 4482 * Other CPUs are prevented from allocating from this pa by lg_mutex 4483 */ 4484 mb_debug(ac->ac_sb, "use 
%u/%u from group pa %p\n", 4485 pa->pa_lstart, len, pa); 4486 } 4487 4488 /* 4489 * Return the prealloc space that have minimal distance 4490 * from the goal block. @cpa is the prealloc 4491 * space that is having currently known minimal distance 4492 * from the goal block. 4493 */ 4494 static struct ext4_prealloc_space * 4495 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 4496 struct ext4_prealloc_space *pa, 4497 struct ext4_prealloc_space *cpa) 4498 { 4499 ext4_fsblk_t cur_distance, new_distance; 4500 4501 if (cpa == NULL) { 4502 atomic_inc(&pa->pa_count); 4503 return pa; 4504 } 4505 cur_distance = abs(goal_block - cpa->pa_pstart); 4506 new_distance = abs(goal_block - pa->pa_pstart); 4507 4508 if (cur_distance <= new_distance) 4509 return cpa; 4510 4511 /* drop the previous reference */ 4512 atomic_dec(&cpa->pa_count); 4513 atomic_inc(&pa->pa_count); 4514 return pa; 4515 } 4516 4517 /* 4518 * search goal blocks in preallocated space 4519 */ 4520 static noinline_for_stack bool 4521 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 4522 { 4523 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4524 int order, i; 4525 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4526 struct ext4_locality_group *lg; 4527 struct ext4_prealloc_space *tmp_pa, *cpa = NULL; 4528 ext4_lblk_t tmp_pa_start, tmp_pa_end; 4529 struct rb_node *iter; 4530 ext4_fsblk_t goal_block; 4531 4532 /* only data can be preallocated */ 4533 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4534 return false; 4535 4536 /* first, try per-file preallocation */ 4537 read_lock(&ei->i_prealloc_lock); 4538 for (iter = ei->i_prealloc_node.rb_node; iter; 4539 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4540 tmp_pa_start, iter)) { 4541 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4542 pa_node.inode_node); 4543 4544 /* all fields in this condition don't change, 4545 * so we can skip locking for them */ 4546 tmp_pa_start = tmp_pa->pa_lstart; 4547 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4548 4549 /* original request start doesn't lie in this PA */ 4550 if (ac->ac_o_ex.fe_logical < tmp_pa_start || 4551 ac->ac_o_ex.fe_logical >= tmp_pa_end) 4552 continue; 4553 4554 /* non-extent files can't have physical blocks past 2^32 */ 4555 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 4556 (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > 4557 EXT4_MAX_BLOCK_FILE_PHYS)) { 4558 /* 4559 * Since PAs don't overlap, we won't find any 4560 * other PA to satisfy this. 4561 */ 4562 break; 4563 } 4564 4565 /* found preallocated blocks, use them */ 4566 spin_lock(&tmp_pa->pa_lock); 4567 if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { 4568 atomic_inc(&tmp_pa->pa_count); 4569 ext4_mb_use_inode_pa(ac, tmp_pa); 4570 spin_unlock(&tmp_pa->pa_lock); 4571 ac->ac_criteria = 10; 4572 read_unlock(&ei->i_prealloc_lock); 4573 return true; 4574 } 4575 spin_unlock(&tmp_pa->pa_lock); 4576 } 4577 read_unlock(&ei->i_prealloc_lock); 4578 4579 /* can we use group allocation? */ 4580 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 4581 return false; 4582 4583 /* inode may have no locality group for some reason */ 4584 lg = ac->ac_lg; 4585 if (lg == NULL) 4586 return false; 4587 order = fls(ac->ac_o_ex.fe_len) - 1; 4588 if (order > PREALLOC_TB_SIZE - 1) 4589 /* The max size of hash table is PREALLOC_TB_SIZE */ 4590 order = PREALLOC_TB_SIZE - 1; 4591 4592 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 4593 /* 4594 * search for the prealloc space that is having 4595 * minimal distance from the goal block. 
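 *
 * The locality group keeps its PAs in PREALLOC_TB_SIZE lists bucketed
 * by size order. As an illustration, a request for 100 clusters gives
 * order = fls(100) - 1 = 6, so the loop below scans bucket 6 and every
 * larger bucket after it.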
4596 */ 4597 for (i = order; i < PREALLOC_TB_SIZE; i++) { 4598 rcu_read_lock(); 4599 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], 4600 pa_node.lg_list) { 4601 spin_lock(&tmp_pa->pa_lock); 4602 if (tmp_pa->pa_deleted == 0 && 4603 tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { 4604 4605 cpa = ext4_mb_check_group_pa(goal_block, 4606 tmp_pa, cpa); 4607 } 4608 spin_unlock(&tmp_pa->pa_lock); 4609 } 4610 rcu_read_unlock(); 4611 } 4612 if (cpa) { 4613 ext4_mb_use_group_pa(ac, cpa); 4614 ac->ac_criteria = 20; 4615 return true; 4616 } 4617 return false; 4618 } 4619 4620 /* 4621 * the function goes through all blocks freed in the group 4622 * but not yet committed and marks them used in the in-core bitmap. 4623 * The buddy must be generated from this bitmap. 4624 * Needs to be called with the ext4 group lock held. 4625 */ 4626 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 4627 ext4_group_t group) 4628 { 4629 struct rb_node *n; 4630 struct ext4_group_info *grp; 4631 struct ext4_free_data *entry; 4632 4633 grp = ext4_get_group_info(sb, group); 4634 if (!grp) 4635 return; 4636 n = rb_first(&(grp->bb_free_root)); 4637 4638 while (n) { 4639 entry = rb_entry(n, struct ext4_free_data, efd_node); 4640 mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); 4641 n = rb_next(n); 4642 } 4643 return; 4644 } 4645 4646 /* 4647 * the function goes through all preallocations in this group and marks them 4648 * used in the in-core bitmap. The buddy must be generated from this bitmap. 4649 * Needs to be called with the ext4 group lock held. 4650 */ 4651 static noinline_for_stack 4652 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 4653 ext4_group_t group) 4654 { 4655 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 4656 struct ext4_prealloc_space *pa; 4657 struct list_head *cur; 4658 ext4_group_t groupnr; 4659 ext4_grpblk_t start; 4660 int preallocated = 0; 4661 int len; 4662 4663 if (!grp) 4664 return; 4665 4666 /* all forms of preallocation discard first load the group, 4667 * so the only competing code is preallocation use.
4668 * we don't need any locking here 4669 * notice we do NOT ignore preallocations with pa_deleted 4670 * otherwise we could leave used blocks available for 4671 * allocation in buddy when concurrent ext4_mb_put_pa() 4672 * is dropping preallocation 4673 */ 4674 list_for_each(cur, &grp->bb_prealloc_list) { 4675 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 4676 spin_lock(&pa->pa_lock); 4677 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4678 &groupnr, &start); 4679 len = pa->pa_len; 4680 spin_unlock(&pa->pa_lock); 4681 if (unlikely(len == 0)) 4682 continue; 4683 BUG_ON(groupnr != group); 4684 mb_set_bits(bitmap, start, len); 4685 preallocated += len; 4686 } 4687 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); 4688 } 4689 4690 static void ext4_mb_mark_pa_deleted(struct super_block *sb, 4691 struct ext4_prealloc_space *pa) 4692 { 4693 struct ext4_inode_info *ei; 4694 4695 if (pa->pa_deleted) { 4696 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", 4697 pa->pa_type, pa->pa_pstart, pa->pa_lstart, 4698 pa->pa_len); 4699 return; 4700 } 4701 4702 pa->pa_deleted = 1; 4703 4704 if (pa->pa_type == MB_INODE_PA) { 4705 ei = EXT4_I(pa->pa_inode); 4706 atomic_dec(&ei->i_prealloc_active); 4707 } 4708 } 4709 4710 static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) 4711 { 4712 BUG_ON(!pa); 4713 BUG_ON(atomic_read(&pa->pa_count)); 4714 BUG_ON(pa->pa_deleted == 0); 4715 kmem_cache_free(ext4_pspace_cachep, pa); 4716 } 4717 4718 static void ext4_mb_pa_callback(struct rcu_head *head) 4719 { 4720 struct ext4_prealloc_space *pa; 4721 4722 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 4723 ext4_mb_pa_free(pa); 4724 } 4725 4726 /* 4727 * drops a reference to preallocated space descriptor 4728 * if this was the last reference and the space is consumed 4729 */ 4730 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 4731 struct super_block *sb, struct ext4_prealloc_space *pa) 4732 { 4733 ext4_group_t grp; 4734 ext4_fsblk_t grp_blk; 4735 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4736 4737 /* in this short window concurrent discard can set pa_deleted */ 4738 spin_lock(&pa->pa_lock); 4739 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { 4740 spin_unlock(&pa->pa_lock); 4741 return; 4742 } 4743 4744 if (pa->pa_deleted == 1) { 4745 spin_unlock(&pa->pa_lock); 4746 return; 4747 } 4748 4749 ext4_mb_mark_pa_deleted(sb, pa); 4750 spin_unlock(&pa->pa_lock); 4751 4752 grp_blk = pa->pa_pstart; 4753 /* 4754 * If doing group-based preallocation, pa_pstart may be in the 4755 * next group when pa is used up 4756 */ 4757 if (pa->pa_type == MB_GROUP_PA) 4758 grp_blk--; 4759 4760 grp = ext4_get_group_number(sb, grp_blk); 4761 4762 /* 4763 * possible race: 4764 * 4765 * P1 (buddy init) P2 (regular allocation) 4766 * find block B in PA 4767 * copy on-disk bitmap to buddy 4768 * mark B in on-disk bitmap 4769 * drop PA from group 4770 * mark all PAs in buddy 4771 * 4772 * thus, P1 initializes buddy with B available. 
to prevent this 4773 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 4774 * against that pair 4775 */ 4776 ext4_lock_group(sb, grp); 4777 list_del(&pa->pa_group_list); 4778 ext4_unlock_group(sb, grp); 4779 4780 if (pa->pa_type == MB_INODE_PA) { 4781 write_lock(pa->pa_node_lock.inode_lock); 4782 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 4783 write_unlock(pa->pa_node_lock.inode_lock); 4784 ext4_mb_pa_free(pa); 4785 } else { 4786 spin_lock(pa->pa_node_lock.lg_lock); 4787 list_del_rcu(&pa->pa_node.lg_list); 4788 spin_unlock(pa->pa_node_lock.lg_lock); 4789 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4790 } 4791 } 4792 4793 static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) 4794 { 4795 struct rb_node **iter = &root->rb_node, *parent = NULL; 4796 struct ext4_prealloc_space *iter_pa, *new_pa; 4797 ext4_lblk_t iter_start, new_start; 4798 4799 while (*iter) { 4800 iter_pa = rb_entry(*iter, struct ext4_prealloc_space, 4801 pa_node.inode_node); 4802 new_pa = rb_entry(new, struct ext4_prealloc_space, 4803 pa_node.inode_node); 4804 iter_start = iter_pa->pa_lstart; 4805 new_start = new_pa->pa_lstart; 4806 4807 parent = *iter; 4808 if (new_start < iter_start) 4809 iter = &((*iter)->rb_left); 4810 else 4811 iter = &((*iter)->rb_right); 4812 } 4813 4814 rb_link_node(new, parent, iter); 4815 rb_insert_color(new, root); 4816 } 4817 4818 /* 4819 * creates new preallocated space for given inode 4820 */ 4821 static noinline_for_stack void 4822 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 4823 { 4824 struct super_block *sb = ac->ac_sb; 4825 struct ext4_sb_info *sbi = EXT4_SB(sb); 4826 struct ext4_prealloc_space *pa; 4827 struct ext4_group_info *grp; 4828 struct ext4_inode_info *ei; 4829 4830 /* preallocate only when found space is larger then requested */ 4831 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 4832 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4833 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 4834 BUG_ON(ac->ac_pa == NULL); 4835 4836 pa = ac->ac_pa; 4837 4838 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 4839 int new_bex_start; 4840 int new_bex_end; 4841 4842 /* we can't allocate as much as normalizer wants. 4843 * so, found space must get proper lstart 4844 * to cover original request */ 4845 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 4846 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 4847 4848 /* 4849 * Use the below logic for adjusting best extent as it keeps 4850 * fragmentation in check while ensuring logical range of best 4851 * extent doesn't overflow out of goal extent: 4852 * 4853 * 1. Check if best ex can be kept at end of goal and still 4854 * cover original start 4855 * 2. Else, check if best ex can be kept at start of goal and 4856 * still cover original start 4857 * 3. Else, keep the best ex at start of original request. 
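 *
 * A worked example (illustrative numbers): with a goal extent covering
 * logical blocks [0, 32), an original request starting at block 10 and
 * a best extent of 8 blocks, step 1 would place the best extent at
 * [24, 32), which misses block 10; step 2 would place it at [0, 8),
 * which also misses it; so step 3 places it at [10, 18).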
4858 */ 4859 new_bex_end = ac->ac_g_ex.fe_logical + 4860 EXT4_C2B(sbi, ac->ac_g_ex.fe_len); 4861 new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4862 if (ac->ac_o_ex.fe_logical >= new_bex_start) 4863 goto adjust_bex; 4864 4865 new_bex_start = ac->ac_g_ex.fe_logical; 4866 new_bex_end = 4867 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4868 if (ac->ac_o_ex.fe_logical < new_bex_end) 4869 goto adjust_bex; 4870 4871 new_bex_start = ac->ac_o_ex.fe_logical; 4872 new_bex_end = 4873 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4874 4875 adjust_bex: 4876 ac->ac_b_ex.fe_logical = new_bex_start; 4877 4878 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 4879 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 4880 BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + 4881 EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); 4882 } 4883 4884 pa->pa_lstart = ac->ac_b_ex.fe_logical; 4885 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4886 pa->pa_len = ac->ac_b_ex.fe_len; 4887 pa->pa_free = pa->pa_len; 4888 spin_lock_init(&pa->pa_lock); 4889 INIT_LIST_HEAD(&pa->pa_group_list); 4890 pa->pa_deleted = 0; 4891 pa->pa_type = MB_INODE_PA; 4892 4893 mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4894 pa->pa_len, pa->pa_lstart); 4895 trace_ext4_mb_new_inode_pa(ac, pa); 4896 4897 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 4898 ext4_mb_use_inode_pa(ac, pa); 4899 4900 ei = EXT4_I(ac->ac_inode); 4901 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4902 if (!grp) 4903 return; 4904 4905 pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; 4906 pa->pa_inode = ac->ac_inode; 4907 4908 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4909 4910 write_lock(pa->pa_node_lock.inode_lock); 4911 ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); 4912 write_unlock(pa->pa_node_lock.inode_lock); 4913 atomic_inc(&ei->i_prealloc_active); 4914 } 4915 4916 /* 4917 * creates new preallocated space for locality group inodes belongs to 4918 */ 4919 static noinline_for_stack void 4920 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 4921 { 4922 struct super_block *sb = ac->ac_sb; 4923 struct ext4_locality_group *lg; 4924 struct ext4_prealloc_space *pa; 4925 struct ext4_group_info *grp; 4926 4927 /* preallocate only when found space is larger then requested */ 4928 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 4929 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4930 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 4931 BUG_ON(ac->ac_pa == NULL); 4932 4933 pa = ac->ac_pa; 4934 4935 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4936 pa->pa_lstart = pa->pa_pstart; 4937 pa->pa_len = ac->ac_b_ex.fe_len; 4938 pa->pa_free = pa->pa_len; 4939 spin_lock_init(&pa->pa_lock); 4940 INIT_LIST_HEAD(&pa->pa_node.lg_list); 4941 INIT_LIST_HEAD(&pa->pa_group_list); 4942 pa->pa_deleted = 0; 4943 pa->pa_type = MB_GROUP_PA; 4944 4945 mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4946 pa->pa_len, pa->pa_lstart); 4947 trace_ext4_mb_new_group_pa(ac, pa); 4948 4949 ext4_mb_use_group_pa(ac, pa); 4950 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 4951 4952 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4953 if (!grp) 4954 return; 4955 lg = ac->ac_lg; 4956 BUG_ON(lg == NULL); 4957 4958 pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; 4959 pa->pa_inode = NULL; 4960 4961 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4962 4963 /* 4964 * We will later add the new pa to the right bucket 4965 * after updating the pa_free in 
ext4_mb_release_context 4966 */ 4967 } 4968 4969 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 4970 { 4971 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4972 ext4_mb_new_group_pa(ac); 4973 else 4974 ext4_mb_new_inode_pa(ac); 4975 } 4976 4977 /* 4978 * finds all unused blocks in on-disk bitmap, frees them in 4979 * in-core bitmap and buddy. 4980 * @pa must be unlinked from inode and group lists, so that 4981 * nobody else can find/use it. 4982 * the caller MUST hold group/inode locks. 4983 * TODO: optimize the case when there are no in-core structures yet 4984 */ 4985 static noinline_for_stack int 4986 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 4987 struct ext4_prealloc_space *pa) 4988 { 4989 struct super_block *sb = e4b->bd_sb; 4990 struct ext4_sb_info *sbi = EXT4_SB(sb); 4991 unsigned int end; 4992 unsigned int next; 4993 ext4_group_t group; 4994 ext4_grpblk_t bit; 4995 unsigned long long grp_blk_start; 4996 int free = 0; 4997 4998 BUG_ON(pa->pa_deleted == 0); 4999 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 5000 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 5001 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 5002 end = bit + pa->pa_len; 5003 5004 while (bit < end) { 5005 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 5006 if (bit >= end) 5007 break; 5008 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 5009 mb_debug(sb, "free preallocated %u/%u in group %u\n", 5010 (unsigned) ext4_group_first_block_no(sb, group) + bit, 5011 (unsigned) next - bit, (unsigned) group); 5012 free += next - bit; 5013 5014 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 5015 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + 5016 EXT4_C2B(sbi, bit)), 5017 next - bit); 5018 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 5019 bit = next + 1; 5020 } 5021 if (free != pa->pa_free) { 5022 ext4_msg(e4b->bd_sb, KERN_CRIT, 5023 "pa %p: logic %lu, phys. %lu, len %d", 5024 pa, (unsigned long) pa->pa_lstart, 5025 (unsigned long) pa->pa_pstart, 5026 pa->pa_len); 5027 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 5028 free, pa->pa_free); 5029 /* 5030 * pa is already deleted so we use the value obtained 5031 * from the bitmap and continue. 
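 * The on-disk bitmap is the authoritative source here; pa_free is only
 * the PA's own bookkeeping, so on a mismatch we report it and carry on
 * with the count derived from the bitmap.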
5032 */ 5033 } 5034 atomic_add(free, &sbi->s_mb_discarded); 5035 5036 return 0; 5037 } 5038 5039 static noinline_for_stack int 5040 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 5041 struct ext4_prealloc_space *pa) 5042 { 5043 struct super_block *sb = e4b->bd_sb; 5044 ext4_group_t group; 5045 ext4_grpblk_t bit; 5046 5047 trace_ext4_mb_release_group_pa(sb, pa); 5048 BUG_ON(pa->pa_deleted == 0); 5049 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 5050 if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { 5051 ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", 5052 e4b->bd_group, group, pa->pa_pstart); 5053 return 0; 5054 } 5055 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 5056 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 5057 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 5058 5059 return 0; 5060 } 5061 5062 /* 5063 * releases all preallocations in given group 5064 * 5065 * first, we need to decide discard policy: 5066 * - when do we discard 5067 * 1) ENOSPC 5068 * - how many do we discard 5069 * 1) how many requested 5070 */ 5071 static noinline_for_stack int 5072 ext4_mb_discard_group_preallocations(struct super_block *sb, 5073 ext4_group_t group, int *busy) 5074 { 5075 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 5076 struct buffer_head *bitmap_bh = NULL; 5077 struct ext4_prealloc_space *pa, *tmp; 5078 struct list_head list; 5079 struct ext4_buddy e4b; 5080 struct ext4_inode_info *ei; 5081 int err; 5082 int free = 0; 5083 5084 if (!grp) 5085 return 0; 5086 mb_debug(sb, "discard preallocation for group %u\n", group); 5087 if (list_empty(&grp->bb_prealloc_list)) 5088 goto out_dbg; 5089 5090 bitmap_bh = ext4_read_block_bitmap(sb, group); 5091 if (IS_ERR(bitmap_bh)) { 5092 err = PTR_ERR(bitmap_bh); 5093 ext4_error_err(sb, -err, 5094 "Error %d reading block bitmap for %u", 5095 err, group); 5096 goto out_dbg; 5097 } 5098 5099 err = ext4_mb_load_buddy(sb, group, &e4b); 5100 if (err) { 5101 ext4_warning(sb, "Error %d loading buddy information for %u", 5102 err, group); 5103 put_bh(bitmap_bh); 5104 goto out_dbg; 5105 } 5106 5107 INIT_LIST_HEAD(&list); 5108 ext4_lock_group(sb, group); 5109 list_for_each_entry_safe(pa, tmp, 5110 &grp->bb_prealloc_list, pa_group_list) { 5111 spin_lock(&pa->pa_lock); 5112 if (atomic_read(&pa->pa_count)) { 5113 spin_unlock(&pa->pa_lock); 5114 *busy = 1; 5115 continue; 5116 } 5117 if (pa->pa_deleted) { 5118 spin_unlock(&pa->pa_lock); 5119 continue; 5120 } 5121 5122 /* seems this one can be freed ... */ 5123 ext4_mb_mark_pa_deleted(sb, pa); 5124 5125 if (!free) 5126 this_cpu_inc(discard_pa_seq); 5127 5128 /* we can trust pa_free ... 
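		 * (pa_count is zero and pa_deleted has just been set under
		 * pa_lock, so no allocation can start using this PA and
		 * pa_free can no longer change)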
*/ 5129 free += pa->pa_free; 5130 5131 spin_unlock(&pa->pa_lock); 5132 5133 list_del(&pa->pa_group_list); 5134 list_add(&pa->u.pa_tmp_list, &list); 5135 } 5136 5137 /* now free all selected PAs */ 5138 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5139 5140 /* remove from object (inode or locality group) */ 5141 if (pa->pa_type == MB_GROUP_PA) { 5142 spin_lock(pa->pa_node_lock.lg_lock); 5143 list_del_rcu(&pa->pa_node.lg_list); 5144 spin_unlock(pa->pa_node_lock.lg_lock); 5145 } else { 5146 write_lock(pa->pa_node_lock.inode_lock); 5147 ei = EXT4_I(pa->pa_inode); 5148 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5149 write_unlock(pa->pa_node_lock.inode_lock); 5150 } 5151 5152 list_del(&pa->u.pa_tmp_list); 5153 5154 if (pa->pa_type == MB_GROUP_PA) { 5155 ext4_mb_release_group_pa(&e4b, pa); 5156 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5157 } else { 5158 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5159 ext4_mb_pa_free(pa); 5160 } 5161 } 5162 5163 ext4_unlock_group(sb, group); 5164 ext4_mb_unload_buddy(&e4b); 5165 put_bh(bitmap_bh); 5166 out_dbg: 5167 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", 5168 free, group, grp->bb_free); 5169 return free; 5170 } 5171 5172 /* 5173 * releases all non-used preallocated blocks for given inode 5174 * 5175 * It's important to discard preallocations under i_data_sem 5176 * We don't want another block to be served from the prealloc 5177 * space when we are discarding the inode prealloc space. 5178 * 5179 * FIXME!! Make sure it is valid at all the call sites 5180 */ 5181 void ext4_discard_preallocations(struct inode *inode, unsigned int needed) 5182 { 5183 struct ext4_inode_info *ei = EXT4_I(inode); 5184 struct super_block *sb = inode->i_sb; 5185 struct buffer_head *bitmap_bh = NULL; 5186 struct ext4_prealloc_space *pa, *tmp; 5187 ext4_group_t group = 0; 5188 struct list_head list; 5189 struct ext4_buddy e4b; 5190 struct rb_node *iter; 5191 int err; 5192 5193 if (!S_ISREG(inode->i_mode)) { 5194 return; 5195 } 5196 5197 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) 5198 return; 5199 5200 mb_debug(sb, "discard preallocation for inode %lu\n", 5201 inode->i_ino); 5202 trace_ext4_discard_preallocations(inode, 5203 atomic_read(&ei->i_prealloc_active), needed); 5204 5205 INIT_LIST_HEAD(&list); 5206 5207 if (needed == 0) 5208 needed = UINT_MAX; 5209 5210 repeat: 5211 /* first, collect all pa's in the inode */ 5212 write_lock(&ei->i_prealloc_lock); 5213 for (iter = rb_first(&ei->i_prealloc_node); iter && needed; 5214 iter = rb_next(iter)) { 5215 pa = rb_entry(iter, struct ext4_prealloc_space, 5216 pa_node.inode_node); 5217 BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); 5218 5219 spin_lock(&pa->pa_lock); 5220 if (atomic_read(&pa->pa_count)) { 5221 /* this shouldn't happen often - nobody should 5222 * use preallocation while we're discarding it */ 5223 spin_unlock(&pa->pa_lock); 5224 write_unlock(&ei->i_prealloc_lock); 5225 ext4_msg(sb, KERN_ERR, 5226 "uh-oh! 
used pa while discarding"); 5227 WARN_ON(1); 5228 schedule_timeout_uninterruptible(HZ); 5229 goto repeat; 5230 5231 } 5232 if (pa->pa_deleted == 0) { 5233 ext4_mb_mark_pa_deleted(sb, pa); 5234 spin_unlock(&pa->pa_lock); 5235 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5236 list_add(&pa->u.pa_tmp_list, &list); 5237 needed--; 5238 continue; 5239 } 5240 5241 /* someone is deleting pa right now */ 5242 spin_unlock(&pa->pa_lock); 5243 write_unlock(&ei->i_prealloc_lock); 5244 5245 /* we have to wait here because pa_deleted 5246 * doesn't mean pa is already unlinked from 5247 * the list. as we might be called from 5248 * ->clear_inode() the inode will get freed 5249 * and concurrent thread which is unlinking 5250 * pa from inode's list may access already 5251 * freed memory, bad-bad-bad */ 5252 5253 /* XXX: if this happens too often, we can 5254 * add a flag to force wait only in case 5255 * of ->clear_inode(), but not in case of 5256 * regular truncate */ 5257 schedule_timeout_uninterruptible(HZ); 5258 goto repeat; 5259 } 5260 write_unlock(&ei->i_prealloc_lock); 5261 5262 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5263 BUG_ON(pa->pa_type != MB_INODE_PA); 5264 group = ext4_get_group_number(sb, pa->pa_pstart); 5265 5266 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5267 GFP_NOFS|__GFP_NOFAIL); 5268 if (err) { 5269 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5270 err, group); 5271 continue; 5272 } 5273 5274 bitmap_bh = ext4_read_block_bitmap(sb, group); 5275 if (IS_ERR(bitmap_bh)) { 5276 err = PTR_ERR(bitmap_bh); 5277 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", 5278 err, group); 5279 ext4_mb_unload_buddy(&e4b); 5280 continue; 5281 } 5282 5283 ext4_lock_group(sb, group); 5284 list_del(&pa->pa_group_list); 5285 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5286 ext4_unlock_group(sb, group); 5287 5288 ext4_mb_unload_buddy(&e4b); 5289 put_bh(bitmap_bh); 5290 5291 list_del(&pa->u.pa_tmp_list); 5292 ext4_mb_pa_free(pa); 5293 } 5294 } 5295 5296 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) 5297 { 5298 struct ext4_prealloc_space *pa; 5299 5300 BUG_ON(ext4_pspace_cachep == NULL); 5301 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); 5302 if (!pa) 5303 return -ENOMEM; 5304 atomic_set(&pa->pa_count, 1); 5305 ac->ac_pa = pa; 5306 return 0; 5307 } 5308 5309 static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) 5310 { 5311 struct ext4_prealloc_space *pa = ac->ac_pa; 5312 5313 BUG_ON(!pa); 5314 ac->ac_pa = NULL; 5315 WARN_ON(!atomic_dec_and_test(&pa->pa_count)); 5316 /* 5317 * current function is only called due to an error or due to 5318 * len of found blocks < len of requested blocks hence the PA has not 5319 * been added to grp->bb_prealloc_list. 
So we don't need to lock it 5320 */ 5321 pa->pa_deleted = 1; 5322 ext4_mb_pa_free(pa); 5323 } 5324 5325 #ifdef CONFIG_EXT4_DEBUG 5326 static inline void ext4_mb_show_pa(struct super_block *sb) 5327 { 5328 ext4_group_t i, ngroups; 5329 5330 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5331 return; 5332 5333 ngroups = ext4_get_groups_count(sb); 5334 mb_debug(sb, "groups: "); 5335 for (i = 0; i < ngroups; i++) { 5336 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 5337 struct ext4_prealloc_space *pa; 5338 ext4_grpblk_t start; 5339 struct list_head *cur; 5340 5341 if (!grp) 5342 continue; 5343 ext4_lock_group(sb, i); 5344 list_for_each(cur, &grp->bb_prealloc_list) { 5345 pa = list_entry(cur, struct ext4_prealloc_space, 5346 pa_group_list); 5347 spin_lock(&pa->pa_lock); 5348 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 5349 NULL, &start); 5350 spin_unlock(&pa->pa_lock); 5351 mb_debug(sb, "PA:%u:%d:%d\n", i, start, 5352 pa->pa_len); 5353 } 5354 ext4_unlock_group(sb, i); 5355 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, 5356 grp->bb_fragments); 5357 } 5358 } 5359 5360 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5361 { 5362 struct super_block *sb = ac->ac_sb; 5363 5364 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5365 return; 5366 5367 mb_debug(sb, "Can't allocate:" 5368 " Allocation context details:"); 5369 mb_debug(sb, "status %u flags 0x%x", 5370 ac->ac_status, ac->ac_flags); 5371 mb_debug(sb, "orig %lu/%lu/%lu@%lu, " 5372 "goal %lu/%lu/%lu@%lu, " 5373 "best %lu/%lu/%lu@%lu cr %d", 5374 (unsigned long)ac->ac_o_ex.fe_group, 5375 (unsigned long)ac->ac_o_ex.fe_start, 5376 (unsigned long)ac->ac_o_ex.fe_len, 5377 (unsigned long)ac->ac_o_ex.fe_logical, 5378 (unsigned long)ac->ac_g_ex.fe_group, 5379 (unsigned long)ac->ac_g_ex.fe_start, 5380 (unsigned long)ac->ac_g_ex.fe_len, 5381 (unsigned long)ac->ac_g_ex.fe_logical, 5382 (unsigned long)ac->ac_b_ex.fe_group, 5383 (unsigned long)ac->ac_b_ex.fe_start, 5384 (unsigned long)ac->ac_b_ex.fe_len, 5385 (unsigned long)ac->ac_b_ex.fe_logical, 5386 (int)ac->ac_criteria); 5387 mb_debug(sb, "%u found", ac->ac_found); 5388 ext4_mb_show_pa(sb); 5389 } 5390 #else 5391 static inline void ext4_mb_show_pa(struct super_block *sb) 5392 { 5393 return; 5394 } 5395 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5396 { 5397 ext4_mb_show_pa(ac->ac_sb); 5398 return; 5399 } 5400 #endif 5401 5402 /* 5403 * We use locality group preallocation for small size file. 
The size of the 5404 * file is determined by the current size or the resulting size after 5405 * allocation which ever is larger 5406 * 5407 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 5408 */ 5409 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 5410 { 5411 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5412 int bsbits = ac->ac_sb->s_blocksize_bits; 5413 loff_t size, isize; 5414 bool inode_pa_eligible, group_pa_eligible; 5415 5416 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 5417 return; 5418 5419 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 5420 return; 5421 5422 group_pa_eligible = sbi->s_mb_group_prealloc > 0; 5423 inode_pa_eligible = true; 5424 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 5425 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 5426 >> bsbits; 5427 5428 /* No point in using inode preallocation for closed files */ 5429 if ((size == isize) && !ext4_fs_is_busy(sbi) && 5430 !inode_is_open_for_write(ac->ac_inode)) 5431 inode_pa_eligible = false; 5432 5433 size = max(size, isize); 5434 /* Don't use group allocation for large files */ 5435 if (size > sbi->s_mb_stream_request) 5436 group_pa_eligible = false; 5437 5438 if (!group_pa_eligible) { 5439 if (inode_pa_eligible) 5440 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5441 else 5442 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5443 return; 5444 } 5445 5446 BUG_ON(ac->ac_lg != NULL); 5447 /* 5448 * locality group prealloc space are per cpu. The reason for having 5449 * per cpu locality group is to reduce the contention between block 5450 * request from multiple CPUs. 5451 */ 5452 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); 5453 5454 /* we're going to use group allocation */ 5455 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 5456 5457 /* serialize all allocations in the group */ 5458 mutex_lock(&ac->ac_lg->lg_mutex); 5459 } 5460 5461 static noinline_for_stack void 5462 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 5463 struct ext4_allocation_request *ar) 5464 { 5465 struct super_block *sb = ar->inode->i_sb; 5466 struct ext4_sb_info *sbi = EXT4_SB(sb); 5467 struct ext4_super_block *es = sbi->s_es; 5468 ext4_group_t group; 5469 unsigned int len; 5470 ext4_fsblk_t goal; 5471 ext4_grpblk_t block; 5472 5473 /* we can't allocate > group size */ 5474 len = ar->len; 5475 5476 /* just a dirty hack to filter too big requests */ 5477 if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) 5478 len = EXT4_CLUSTERS_PER_GROUP(sb); 5479 5480 /* start searching from the goal */ 5481 goal = ar->goal; 5482 if (goal < le32_to_cpu(es->s_first_data_block) || 5483 goal >= ext4_blocks_count(es)) 5484 goal = le32_to_cpu(es->s_first_data_block); 5485 ext4_get_group_no_and_offset(sb, goal, &group, &block); 5486 5487 /* set up allocation goals */ 5488 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); 5489 ac->ac_status = AC_STATUS_CONTINUE; 5490 ac->ac_sb = sb; 5491 ac->ac_inode = ar->inode; 5492 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; 5493 ac->ac_o_ex.fe_group = group; 5494 ac->ac_o_ex.fe_start = block; 5495 ac->ac_o_ex.fe_len = len; 5496 ac->ac_g_ex = ac->ac_o_ex; 5497 ac->ac_flags = ar->flags; 5498 5499 /* we have to define context: we'll work with a file or 5500 * locality group. 
this is a policy, actually */
5501 ext4_mb_group_or_file(ac);
5502
5503 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
5504 "left: %u/%u, right %u/%u to %swritable\n",
5505 (unsigned) ar->len, (unsigned) ar->logical,
5506 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
5507 (unsigned) ar->lleft, (unsigned) ar->pleft,
5508 (unsigned) ar->lright, (unsigned) ar->pright,
5509 inode_is_open_for_write(ar->inode) ? "" : "non-");
5510 }
5511
5512 static noinline_for_stack void
5513 ext4_mb_discard_lg_preallocations(struct super_block *sb,
5514 struct ext4_locality_group *lg,
5515 int order, int total_entries)
5516 {
5517 ext4_group_t group = 0;
5518 struct ext4_buddy e4b;
5519 struct list_head discard_list;
5520 struct ext4_prealloc_space *pa, *tmp;
5521
5522 mb_debug(sb, "discard locality group preallocation\n");
5523
5524 INIT_LIST_HEAD(&discard_list);
5525
5526 spin_lock(&lg->lg_prealloc_lock);
5527 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5528 pa_node.lg_list,
5529 lockdep_is_held(&lg->lg_prealloc_lock)) {
5530 spin_lock(&pa->pa_lock);
5531 if (atomic_read(&pa->pa_count)) {
5532 /*
5533 * This is the pa that we just used
5534 * for block allocation. So don't
5535 * free it.
5536 */
5537 spin_unlock(&pa->pa_lock);
5538 continue;
5539 }
5540 if (pa->pa_deleted) {
5541 spin_unlock(&pa->pa_lock);
5542 continue;
5543 }
5544 /* only lg prealloc space */
5545 BUG_ON(pa->pa_type != MB_GROUP_PA);
5546
5547 /* seems this one can be freed ... */
5548 ext4_mb_mark_pa_deleted(sb, pa);
5549 spin_unlock(&pa->pa_lock);
5550
5551 list_del_rcu(&pa->pa_node.lg_list);
5552 list_add(&pa->u.pa_tmp_list, &discard_list);
5553
5554 total_entries--;
5555 if (total_entries <= 5) {
5556 /*
5557 * we want to keep only 5 entries
5558 * allowing it to grow to 8. This
5559 * makes sure we don't call discard
5560 * soon for this list.
5561 */
5562 break;
5563 }
5564 }
5565 spin_unlock(&lg->lg_prealloc_lock);
5566
5567 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5568 int err;
5569
5570 group = ext4_get_group_number(sb, pa->pa_pstart);
5571 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
5572 GFP_NOFS|__GFP_NOFAIL);
5573 if (err) {
5574 ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
5575 err, group);
5576 continue;
5577 }
5578 ext4_lock_group(sb, group);
5579 list_del(&pa->pa_group_list);
5580 ext4_mb_release_group_pa(&e4b, pa);
5581 ext4_unlock_group(sb, group);
5582
5583 ext4_mb_unload_buddy(&e4b);
5584 list_del(&pa->u.pa_tmp_list);
5585 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5586 }
5587 }
5588
5589 /*
5590 * We have incremented pa_count. So it cannot be freed at this
5591 * point. Also we hold lg_mutex. So no parallel allocation is
5592 * possible from this lg. That means pa_free cannot be updated.
5593 *
5594 * A parallel ext4_mb_discard_group_preallocations is possible,
5595 * which can cause the lg_prealloc_list to be updated.
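 *
 * (Illustrative note, not part of the original comment: the pa is filed
 *  under lg_prealloc_list[fls(pa_free) - 1], so a pa with, say, 300 free
 *  clusters lands in bucket 8 (fls(300) == 9); orders beyond
 *  PREALLOC_TB_SIZE - 1 are clamped to the last bucket by
 *  ext4_mb_add_n_trim() below.)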
5596 */ 5597 5598 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 5599 { 5600 int order, added = 0, lg_prealloc_count = 1; 5601 struct super_block *sb = ac->ac_sb; 5602 struct ext4_locality_group *lg = ac->ac_lg; 5603 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 5604 5605 order = fls(pa->pa_free) - 1; 5606 if (order > PREALLOC_TB_SIZE - 1) 5607 /* The max size of hash table is PREALLOC_TB_SIZE */ 5608 order = PREALLOC_TB_SIZE - 1; 5609 /* Add the prealloc space to lg */ 5610 spin_lock(&lg->lg_prealloc_lock); 5611 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 5612 pa_node.lg_list, 5613 lockdep_is_held(&lg->lg_prealloc_lock)) { 5614 spin_lock(&tmp_pa->pa_lock); 5615 if (tmp_pa->pa_deleted) { 5616 spin_unlock(&tmp_pa->pa_lock); 5617 continue; 5618 } 5619 if (!added && pa->pa_free < tmp_pa->pa_free) { 5620 /* Add to the tail of the previous entry */ 5621 list_add_tail_rcu(&pa->pa_node.lg_list, 5622 &tmp_pa->pa_node.lg_list); 5623 added = 1; 5624 /* 5625 * we want to count the total 5626 * number of entries in the list 5627 */ 5628 } 5629 spin_unlock(&tmp_pa->pa_lock); 5630 lg_prealloc_count++; 5631 } 5632 if (!added) 5633 list_add_tail_rcu(&pa->pa_node.lg_list, 5634 &lg->lg_prealloc_list[order]); 5635 spin_unlock(&lg->lg_prealloc_lock); 5636 5637 /* Now trim the list to be not more than 8 elements */ 5638 if (lg_prealloc_count > 8) { 5639 ext4_mb_discard_lg_preallocations(sb, lg, 5640 order, lg_prealloc_count); 5641 return; 5642 } 5643 return ; 5644 } 5645 5646 /* 5647 * release all resource we used in allocation 5648 */ 5649 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 5650 { 5651 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5652 struct ext4_prealloc_space *pa = ac->ac_pa; 5653 if (pa) { 5654 if (pa->pa_type == MB_GROUP_PA) { 5655 /* see comment in ext4_mb_use_group_pa() */ 5656 spin_lock(&pa->pa_lock); 5657 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5658 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5659 pa->pa_free -= ac->ac_b_ex.fe_len; 5660 pa->pa_len -= ac->ac_b_ex.fe_len; 5661 spin_unlock(&pa->pa_lock); 5662 5663 /* 5664 * We want to add the pa to the right bucket. 5665 * Remove it from the list and while adding 5666 * make sure the list to which we are adding 5667 * doesn't grow big. 
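 *
 * (In practice, ext4_mb_add_n_trim() re-inserts the pa into its
 *  lg_prealloc_list bucket ordered by pa_free and, once the bucket
 *  holds more than 8 entries, calls ext4_mb_discard_lg_preallocations()
 *  to shrink it back, per the 5/8 heuristic described above.)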
5668 */ 5669 if (likely(pa->pa_free)) { 5670 spin_lock(pa->pa_node_lock.lg_lock); 5671 list_del_rcu(&pa->pa_node.lg_list); 5672 spin_unlock(pa->pa_node_lock.lg_lock); 5673 ext4_mb_add_n_trim(ac); 5674 } 5675 } 5676 5677 ext4_mb_put_pa(ac, ac->ac_sb, pa); 5678 } 5679 if (ac->ac_bitmap_page) 5680 put_page(ac->ac_bitmap_page); 5681 if (ac->ac_buddy_page) 5682 put_page(ac->ac_buddy_page); 5683 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 5684 mutex_unlock(&ac->ac_lg->lg_mutex); 5685 ext4_mb_collect_stats(ac); 5686 return 0; 5687 } 5688 5689 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 5690 { 5691 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 5692 int ret; 5693 int freed = 0, busy = 0; 5694 int retry = 0; 5695 5696 trace_ext4_mb_discard_preallocations(sb, needed); 5697 5698 if (needed == 0) 5699 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 5700 repeat: 5701 for (i = 0; i < ngroups && needed > 0; i++) { 5702 ret = ext4_mb_discard_group_preallocations(sb, i, &busy); 5703 freed += ret; 5704 needed -= ret; 5705 cond_resched(); 5706 } 5707 5708 if (needed > 0 && busy && ++retry < 3) { 5709 busy = 0; 5710 goto repeat; 5711 } 5712 5713 return freed; 5714 } 5715 5716 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, 5717 struct ext4_allocation_context *ac, u64 *seq) 5718 { 5719 int freed; 5720 u64 seq_retry = 0; 5721 bool ret = false; 5722 5723 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 5724 if (freed) { 5725 ret = true; 5726 goto out_dbg; 5727 } 5728 seq_retry = ext4_get_discard_pa_seq_sum(); 5729 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { 5730 ac->ac_flags |= EXT4_MB_STRICT_CHECK; 5731 *seq = seq_retry; 5732 ret = true; 5733 } 5734 5735 out_dbg: 5736 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); 5737 return ret; 5738 } 5739 5740 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5741 struct ext4_allocation_request *ar, int *errp); 5742 5743 /* 5744 * Main entry point into mballoc to allocate blocks 5745 * it tries to use preallocation first, then falls back 5746 * to usual allocation 5747 */ 5748 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 5749 struct ext4_allocation_request *ar, int *errp) 5750 { 5751 struct ext4_allocation_context *ac = NULL; 5752 struct ext4_sb_info *sbi; 5753 struct super_block *sb; 5754 ext4_fsblk_t block = 0; 5755 unsigned int inquota = 0; 5756 unsigned int reserv_clstrs = 0; 5757 int retries = 0; 5758 u64 seq; 5759 5760 might_sleep(); 5761 sb = ar->inode->i_sb; 5762 sbi = EXT4_SB(sb); 5763 5764 trace_ext4_request_blocks(ar); 5765 if (sbi->s_mount_state & EXT4_FC_REPLAY) 5766 return ext4_mb_new_blocks_simple(handle, ar, errp); 5767 5768 /* Allow to use superuser reservation for quota file */ 5769 if (ext4_is_quota_file(ar->inode)) 5770 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 5771 5772 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { 5773 /* Without delayed allocation we need to verify 5774 * there is enough free blocks to do block allocation 5775 * and verify allocation doesn't exceed the quota limits. 
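 *
 * (If the cluster claim fails, the loop below simply halves ar->len and
 *  retries, so the caller may end up with fewer blocks than it
 *  originally asked for.)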
5776 */
5777 while (ar->len &&
5778 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
5779
5780 /* let others free the space */
5781 cond_resched();
5782 ar->len = ar->len >> 1;
5783 }
5784 if (!ar->len) {
5785 ext4_mb_show_pa(sb);
5786 *errp = -ENOSPC;
5787 return 0;
5788 }
5789 reserv_clstrs = ar->len;
5790 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
5791 dquot_alloc_block_nofail(ar->inode,
5792 EXT4_C2B(sbi, ar->len));
5793 } else {
5794 while (ar->len &&
5795 dquot_alloc_block(ar->inode,
5796 EXT4_C2B(sbi, ar->len))) {
5797
5798 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5799 ar->len--;
5800 }
5801 }
5802 inquota = ar->len;
5803 if (ar->len == 0) {
5804 *errp = -EDQUOT;
5805 goto out;
5806 }
5807 }
5808
5809 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
5810 if (!ac) {
5811 ar->len = 0;
5812 *errp = -ENOMEM;
5813 goto out;
5814 }
5815
5816 ext4_mb_initialize_context(ac, ar);
5817
5818 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5819 seq = this_cpu_read(discard_pa_seq);
5820 if (!ext4_mb_use_preallocated(ac)) {
5821 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5822 ext4_mb_normalize_request(ac, ar);
5823
5824 *errp = ext4_mb_pa_alloc(ac);
5825 if (*errp)
5826 goto errout;
5827 repeat:
5828 /* allocate space in core */
5829 *errp = ext4_mb_regular_allocator(ac);
5830 /*
5831 * The pa allocated above is added to grp->bb_prealloc_list only
5832 * when we were able to allocate some blocks, i.e. when
5833 * ac->ac_status == AC_STATUS_FOUND.
5834 * An error from above means ac->ac_status != AC_STATUS_FOUND,
5835 * so we have to free this pa here.
5836 */
5837 if (*errp) {
5838 ext4_mb_pa_put_free(ac);
5839 ext4_discard_allocated_blocks(ac);
5840 goto errout;
5841 }
5842 if (ac->ac_status == AC_STATUS_FOUND &&
5843 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
5844 ext4_mb_pa_put_free(ac);
5845 }
5846 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
5847 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
5848 if (*errp) {
5849 ext4_discard_allocated_blocks(ac);
5850 goto errout;
5851 } else {
5852 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5853 ar->len = ac->ac_b_ex.fe_len;
5854 }
5855 } else {
5856 if (++retries < 3 &&
5857 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
5858 goto repeat;
5859 /*
5860 * If block allocation fails then the pa allocated above
5861 * needs to be freed here as well.
5862 */
5863 ext4_mb_pa_put_free(ac);
5864 *errp = -ENOSPC;
5865 }
5866
5867 if (*errp) {
5868 errout:
5869 ac->ac_b_ex.fe_len = 0;
5870 ar->len = 0;
5871 ext4_mb_show_ac(ac);
5872 }
5873 ext4_mb_release_context(ac);
5874 kmem_cache_free(ext4_ac_cachep, ac);
5875 out:
5876 if (inquota && ar->len < inquota)
5877 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
5878 if (!ar->len) {
5879 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
5880 /* release all the reserved blocks if non-delalloc */
5881 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
5882 reserv_clstrs);
5883 }
5884
5885 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
5886
5887 return block;
5888 }
5889
5890 /*
5891 * We can merge two free data extents only if the physical blocks
5892 * are contiguous, AND the extents were freed by the same transaction,
5893 * AND the blocks are associated with the same group.
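 *
 * (Worked example, not part of the original comment: if clusters
 *  100-149 and 150-199 of the same group were freed in the same
 *  transaction, ext4_try_merge_freed_extent() below collapses them into
 *  a single 100-199 entry; a differing efd_tid or efd_group keeps them
 *  separate.)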
5894 */
5895 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
5896 struct ext4_free_data *entry,
5897 struct ext4_free_data *new_entry,
5898 struct rb_root *entry_rb_root)
5899 {
5900 if ((entry->efd_tid != new_entry->efd_tid) ||
5901 (entry->efd_group != new_entry->efd_group))
5902 return;
5903 if (entry->efd_start_cluster + entry->efd_count ==
5904 new_entry->efd_start_cluster) {
5905 new_entry->efd_start_cluster = entry->efd_start_cluster;
5906 new_entry->efd_count += entry->efd_count;
5907 } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
5908 entry->efd_start_cluster) {
5909 new_entry->efd_count += entry->efd_count;
5910 } else
5911 return;
5912 spin_lock(&sbi->s_md_lock);
5913 list_del(&entry->efd_list);
5914 spin_unlock(&sbi->s_md_lock);
5915 rb_erase(&entry->efd_node, entry_rb_root);
5916 kmem_cache_free(ext4_free_data_cachep, entry);
5917 }
5918
5919 static noinline_for_stack void
5920 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
5921 struct ext4_free_data *new_entry)
5922 {
5923 ext4_group_t group = e4b->bd_group;
5924 ext4_grpblk_t cluster;
5925 ext4_grpblk_t clusters = new_entry->efd_count;
5926 struct ext4_free_data *entry;
5927 struct ext4_group_info *db = e4b->bd_info;
5928 struct super_block *sb = e4b->bd_sb;
5929 struct ext4_sb_info *sbi = EXT4_SB(sb);
5930 struct rb_node **n = &db->bb_free_root.rb_node, *node;
5931 struct rb_node *parent = NULL, *new_node;
5932
5933 BUG_ON(!ext4_handle_valid(handle));
5934 BUG_ON(e4b->bd_bitmap_page == NULL);
5935 BUG_ON(e4b->bd_buddy_page == NULL);
5936
5937 new_node = &new_entry->efd_node;
5938 cluster = new_entry->efd_start_cluster;
5939
5940 if (!*n) {
5941 /* first free block extent. We need to
5942 * protect the buddy cache from being freed,
5943 * otherwise we'll refresh it from the
5944 * on-disk bitmap and lose not-yet-available
5945 * blocks */
5946 get_page(e4b->bd_buddy_page);
5947 get_page(e4b->bd_bitmap_page);
5948 }
5949 while (*n) {
5950 parent = *n;
5951 entry = rb_entry(parent, struct ext4_free_data, efd_node);
5952 if (cluster < entry->efd_start_cluster)
5953 n = &(*n)->rb_left;
5954 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
5955 n = &(*n)->rb_right;
5956 else {
5957 ext4_grp_locked_error(sb, group, 0,
5958 ext4_group_first_block_no(sb, group) +
5959 EXT4_C2B(sbi, cluster),
5960 "Block already on to-be-freed list");
5961 kmem_cache_free(ext4_free_data_cachep, new_entry);
5962 return;
5963 }
5964 }
5965
5966 rb_link_node(new_node, parent, n);
5967 rb_insert_color(new_node, &db->bb_free_root);
5968
5969 /* Now see if the extent can be merged with its left and right neighbors */
5970 node = rb_prev(new_node);
5971 if (node) {
5972 entry = rb_entry(node, struct ext4_free_data, efd_node);
5973 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5974 &(db->bb_free_root));
5975 }
5976
5977 node = rb_next(new_node);
5978 if (node) {
5979 entry = rb_entry(node, struct ext4_free_data, efd_node);
5980 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5981 &(db->bb_free_root));
5982 }
5983
5984 spin_lock(&sbi->s_md_lock);
5985 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
5986 sbi->s_mb_free_pending += clusters;
5987 spin_unlock(&sbi->s_md_lock);
5988 }
5989
5990 /*
5991 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5992 * linearly starting at the goal block and also excludes the blocks which
5993 * are going to be in use after fast commit replay.
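 *
 * (Note that this path hands back at most one cluster per call:
 *  ar->len is set to 1 below on success.)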
5994 */ 5995 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5996 struct ext4_allocation_request *ar, int *errp) 5997 { 5998 struct buffer_head *bitmap_bh; 5999 struct super_block *sb = ar->inode->i_sb; 6000 ext4_group_t group; 6001 ext4_grpblk_t blkoff; 6002 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 6003 ext4_grpblk_t i = 0; 6004 ext4_fsblk_t goal, block; 6005 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 6006 6007 goal = ar->goal; 6008 if (goal < le32_to_cpu(es->s_first_data_block) || 6009 goal >= ext4_blocks_count(es)) 6010 goal = le32_to_cpu(es->s_first_data_block); 6011 6012 ar->len = 0; 6013 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); 6014 for (; group < ext4_get_groups_count(sb); group++) { 6015 bitmap_bh = ext4_read_block_bitmap(sb, group); 6016 if (IS_ERR(bitmap_bh)) { 6017 *errp = PTR_ERR(bitmap_bh); 6018 pr_warn("Failed to read block bitmap\n"); 6019 return 0; 6020 } 6021 6022 while (1) { 6023 i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 6024 blkoff); 6025 if (i >= max) 6026 break; 6027 if (ext4_fc_replay_check_excluded(sb, 6028 ext4_group_first_block_no(sb, group) + i)) { 6029 blkoff = i + 1; 6030 } else 6031 break; 6032 } 6033 brelse(bitmap_bh); 6034 if (i < max) 6035 break; 6036 6037 blkoff = 0; 6038 } 6039 6040 if (group >= ext4_get_groups_count(sb) || i >= max) { 6041 *errp = -ENOSPC; 6042 return 0; 6043 } 6044 6045 block = ext4_group_first_block_no(sb, group) + i; 6046 ext4_mb_mark_bb(sb, block, 1, 1); 6047 ar->len = 1; 6048 6049 return block; 6050 } 6051 6052 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, 6053 unsigned long count) 6054 { 6055 struct buffer_head *bitmap_bh; 6056 struct super_block *sb = inode->i_sb; 6057 struct ext4_group_desc *gdp; 6058 struct buffer_head *gdp_bh; 6059 ext4_group_t group; 6060 ext4_grpblk_t blkoff; 6061 int already_freed = 0, err, i; 6062 6063 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 6064 bitmap_bh = ext4_read_block_bitmap(sb, group); 6065 if (IS_ERR(bitmap_bh)) { 6066 pr_warn("Failed to read block bitmap\n"); 6067 return; 6068 } 6069 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 6070 if (!gdp) 6071 goto err_out; 6072 6073 for (i = 0; i < count; i++) { 6074 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) 6075 already_freed++; 6076 } 6077 mb_clear_bits(bitmap_bh->b_data, blkoff, count); 6078 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 6079 if (err) 6080 goto err_out; 6081 ext4_free_group_clusters_set( 6082 sb, gdp, ext4_free_group_clusters(sb, gdp) + 6083 count - already_freed); 6084 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6085 ext4_group_desc_csum_set(sb, group, gdp); 6086 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 6087 sync_dirty_buffer(bitmap_bh); 6088 sync_dirty_buffer(gdp_bh); 6089 6090 err_out: 6091 brelse(bitmap_bh); 6092 } 6093 6094 /** 6095 * ext4_mb_clear_bb() -- helper function for freeing blocks. 
6096 * Used by ext4_free_blocks() 6097 * @handle: handle for this transaction 6098 * @inode: inode 6099 * @block: starting physical block to be freed 6100 * @count: number of blocks to be freed 6101 * @flags: flags used by ext4_free_blocks 6102 */ 6103 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, 6104 ext4_fsblk_t block, unsigned long count, 6105 int flags) 6106 { 6107 struct buffer_head *bitmap_bh = NULL; 6108 struct super_block *sb = inode->i_sb; 6109 struct ext4_group_desc *gdp; 6110 struct ext4_group_info *grp; 6111 unsigned int overflow; 6112 ext4_grpblk_t bit; 6113 struct buffer_head *gd_bh; 6114 ext4_group_t block_group; 6115 struct ext4_sb_info *sbi; 6116 struct ext4_buddy e4b; 6117 unsigned int count_clusters; 6118 int err = 0; 6119 int ret; 6120 6121 sbi = EXT4_SB(sb); 6122 6123 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6124 !ext4_inode_block_valid(inode, block, count)) { 6125 ext4_error(sb, "Freeing blocks in system zone - " 6126 "Block = %llu, count = %lu", block, count); 6127 /* err = 0. ext4_std_error should be a no op */ 6128 goto error_return; 6129 } 6130 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6131 6132 do_more: 6133 overflow = 0; 6134 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6135 6136 grp = ext4_get_group_info(sb, block_group); 6137 if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 6138 return; 6139 6140 /* 6141 * Check to see if we are freeing blocks across a group 6142 * boundary. 6143 */ 6144 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 6145 overflow = EXT4_C2B(sbi, bit) + count - 6146 EXT4_BLOCKS_PER_GROUP(sb); 6147 count -= overflow; 6148 /* The range changed so it's no longer validated */ 6149 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6150 } 6151 count_clusters = EXT4_NUM_B2C(sbi, count); 6152 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6153 if (IS_ERR(bitmap_bh)) { 6154 err = PTR_ERR(bitmap_bh); 6155 bitmap_bh = NULL; 6156 goto error_return; 6157 } 6158 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 6159 if (!gdp) { 6160 err = -EIO; 6161 goto error_return; 6162 } 6163 6164 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6165 !ext4_inode_block_valid(inode, block, count)) { 6166 ext4_error(sb, "Freeing blocks in system zone - " 6167 "Block = %llu, count = %lu", block, count); 6168 /* err = 0. ext4_std_error should be a no op */ 6169 goto error_return; 6170 } 6171 6172 BUFFER_TRACE(bitmap_bh, "getting write access"); 6173 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6174 EXT4_JTR_NONE); 6175 if (err) 6176 goto error_return; 6177 6178 /* 6179 * We are about to modify some metadata. Call the journal APIs 6180 * to unshare ->b_data if a currently-committing transaction is 6181 * using it 6182 */ 6183 BUFFER_TRACE(gd_bh, "get_write_access"); 6184 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6185 if (err) 6186 goto error_return; 6187 #ifdef AGGRESSIVE_CHECK 6188 { 6189 int i; 6190 for (i = 0; i < count_clusters; i++) 6191 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 6192 } 6193 #endif 6194 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 6195 6196 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 6197 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, 6198 GFP_NOFS|__GFP_NOFAIL); 6199 if (err) 6200 goto error_return; 6201 6202 /* 6203 * We need to make sure we don't reuse the freed block until after the 6204 * transaction is committed. 
We make an exception if the inode is to be
6205 * written in writeback mode since writeback mode has weak data
6206 * consistency guarantees.
6207 */
6208 if (ext4_handle_valid(handle) &&
6209 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
6210 !ext4_should_writeback_data(inode))) {
6211 struct ext4_free_data *new_entry;
6212 /*
6213 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
6214 * to fail.
6215 */
6216 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
6217 GFP_NOFS|__GFP_NOFAIL);
6218 new_entry->efd_start_cluster = bit;
6219 new_entry->efd_group = block_group;
6220 new_entry->efd_count = count_clusters;
6221 new_entry->efd_tid = handle->h_transaction->t_tid;
6222
6223 ext4_lock_group(sb, block_group);
6224 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6225 ext4_mb_free_metadata(handle, &e4b, new_entry);
6226 } else {
6227 /* We need to update group_info->bb_free and the bitmap
6228 * with the group lock held; ext4_mb_generate_buddy()
6229 * looks at them with the group lock held as well.
6230 */
6231 if (test_opt(sb, DISCARD)) {
6232 err = ext4_issue_discard(sb, block_group, bit, count,
6233 NULL);
6234 if (err && err != -EOPNOTSUPP)
6235 ext4_msg(sb, KERN_WARNING, "discard request in"
6236 " group:%u block:%d count:%lu failed"
6237 " with %d", block_group, bit, count,
6238 err);
6239 } else
6240 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
6241
6242 ext4_lock_group(sb, block_group);
6243 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
6244 mb_free_blocks(inode, &e4b, bit, count_clusters);
6245 }
6246
6247 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
6248 ext4_free_group_clusters_set(sb, gdp, ret);
6249 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
6250 ext4_group_desc_csum_set(sb, block_group, gdp);
6251 ext4_unlock_group(sb, block_group);
6252
6253 if (sbi->s_log_groups_per_flex) {
6254 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6255 atomic64_add(count_clusters,
6256 &sbi_array_rcu_deref(sbi, s_flex_groups,
6257 flex_group)->free_clusters);
6258 }
6259
6260 /*
6261 * on a bigalloc file system, defer the s_freeclusters_counter
6262 * update to the caller (ext4_remove_space and friends) so they
6263 * can determine if a cluster freed here should be re-reserved
6264 */
6265 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
6266 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
6267 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
6268 percpu_counter_add(&sbi->s_freeclusters_counter,
6269 count_clusters);
6270 }
6271
6272 ext4_mb_unload_buddy(&e4b);
6273
6274 /* We dirtied the bitmap block */
6275 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6276 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6277
6278 /* And the group descriptor block */
6279 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6280 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6281 if (!err)
6282 err = ret;
6283
6284 if (overflow && !err) {
6285 block += count;
6286 count = overflow;
6287 put_bh(bitmap_bh);
6288 /* The range changed so it's no longer validated */
6289 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
6290 goto do_more;
6291 }
6292 error_return:
6293 brelse(bitmap_bh);
6294 ext4_std_error(sb, err);
6295 return;
6296 }
6297
6298 /**
6299 * ext4_free_blocks() -- Free given blocks and update quota
6300 * @handle: handle for this transaction
6301 * @inode: inode
6302 * @bh: optional buffer of the block to be freed
6303 * @block: starting physical block to be freed
6304 * @count: number of blocks to be freed
6305 * @flags: flags used by ext4_free_blocks
6306 */
6307 void ext4_free_blocks(handle_t *handle, struct inode *inode, 6308 struct buffer_head *bh, ext4_fsblk_t block, 6309 unsigned long count, int flags) 6310 { 6311 struct super_block *sb = inode->i_sb; 6312 unsigned int overflow; 6313 struct ext4_sb_info *sbi; 6314 6315 sbi = EXT4_SB(sb); 6316 6317 if (sbi->s_mount_state & EXT4_FC_REPLAY) { 6318 ext4_free_blocks_simple(inode, block, count); 6319 return; 6320 } 6321 6322 might_sleep(); 6323 if (bh) { 6324 if (block) 6325 BUG_ON(block != bh->b_blocknr); 6326 else 6327 block = bh->b_blocknr; 6328 } 6329 6330 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6331 !ext4_inode_block_valid(inode, block, count)) { 6332 ext4_error(sb, "Freeing blocks not in datazone - " 6333 "block = %llu, count = %lu", block, count); 6334 return; 6335 } 6336 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6337 6338 ext4_debug("freeing block %llu\n", block); 6339 trace_ext4_free_blocks(inode, block, count, flags); 6340 6341 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6342 BUG_ON(count > 1); 6343 6344 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 6345 inode, bh, block); 6346 } 6347 6348 /* 6349 * If the extent to be freed does not begin on a cluster 6350 * boundary, we need to deal with partial clusters at the 6351 * beginning and end of the extent. Normally we will free 6352 * blocks at the beginning or the end unless we are explicitly 6353 * requested to avoid doing so. 6354 */ 6355 overflow = EXT4_PBLK_COFF(sbi, block); 6356 if (overflow) { 6357 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { 6358 overflow = sbi->s_cluster_ratio - overflow; 6359 block += overflow; 6360 if (count > overflow) 6361 count -= overflow; 6362 else 6363 return; 6364 } else { 6365 block -= overflow; 6366 count += overflow; 6367 } 6368 /* The range changed so it's no longer validated */ 6369 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6370 } 6371 overflow = EXT4_LBLK_COFF(sbi, count); 6372 if (overflow) { 6373 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { 6374 if (count > overflow) 6375 count -= overflow; 6376 else 6377 return; 6378 } else 6379 count += sbi->s_cluster_ratio - overflow; 6380 /* The range changed so it's no longer validated */ 6381 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6382 } 6383 6384 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6385 int i; 6386 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; 6387 6388 for (i = 0; i < count; i++) { 6389 cond_resched(); 6390 if (is_metadata) 6391 bh = sb_find_get_block(inode->i_sb, block + i); 6392 ext4_forget(handle, is_metadata, inode, bh, block + i); 6393 } 6394 } 6395 6396 ext4_mb_clear_bb(handle, inode, block, count, flags); 6397 return; 6398 } 6399 6400 /** 6401 * ext4_group_add_blocks() -- Add given blocks to an existing group 6402 * @handle: handle to this transaction 6403 * @sb: super block 6404 * @block: start physical block to add to the block group 6405 * @count: number of blocks to free 6406 * 6407 * This marks the blocks as free in the bitmap and buddy. 
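 *
 * (Worked example, illustrative only: with a bigalloc cluster ratio of
 *  4, adding blocks 1000-1007 maps via EXT4_B2C() to clusters 250-251,
 *  so cluster_count below is 2 and those two bits are cleared in the
 *  block bitmap.)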
6408 */ 6409 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 6410 ext4_fsblk_t block, unsigned long count) 6411 { 6412 struct buffer_head *bitmap_bh = NULL; 6413 struct buffer_head *gd_bh; 6414 ext4_group_t block_group; 6415 ext4_grpblk_t bit; 6416 unsigned int i; 6417 struct ext4_group_desc *desc; 6418 struct ext4_sb_info *sbi = EXT4_SB(sb); 6419 struct ext4_buddy e4b; 6420 int err = 0, ret, free_clusters_count; 6421 ext4_grpblk_t clusters_freed; 6422 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); 6423 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); 6424 unsigned long cluster_count = last_cluster - first_cluster + 1; 6425 6426 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 6427 6428 if (count == 0) 6429 return 0; 6430 6431 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6432 /* 6433 * Check to see if we are freeing blocks across a group 6434 * boundary. 6435 */ 6436 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { 6437 ext4_warning(sb, "too many blocks added to group %u", 6438 block_group); 6439 err = -EINVAL; 6440 goto error_return; 6441 } 6442 6443 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6444 if (IS_ERR(bitmap_bh)) { 6445 err = PTR_ERR(bitmap_bh); 6446 bitmap_bh = NULL; 6447 goto error_return; 6448 } 6449 6450 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 6451 if (!desc) { 6452 err = -EIO; 6453 goto error_return; 6454 } 6455 6456 if (!ext4_sb_block_valid(sb, NULL, block, count)) { 6457 ext4_error(sb, "Adding blocks in system zones - " 6458 "Block = %llu, count = %lu", 6459 block, count); 6460 err = -EINVAL; 6461 goto error_return; 6462 } 6463 6464 BUFFER_TRACE(bitmap_bh, "getting write access"); 6465 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6466 EXT4_JTR_NONE); 6467 if (err) 6468 goto error_return; 6469 6470 /* 6471 * We are about to modify some metadata. Call the journal APIs 6472 * to unshare ->b_data if a currently-committing transaction is 6473 * using it 6474 */ 6475 BUFFER_TRACE(gd_bh, "get_write_access"); 6476 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6477 if (err) 6478 goto error_return; 6479 6480 for (i = 0, clusters_freed = 0; i < cluster_count; i++) { 6481 BUFFER_TRACE(bitmap_bh, "clear bit"); 6482 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 6483 ext4_error(sb, "bit already cleared for block %llu", 6484 (ext4_fsblk_t)(block + i)); 6485 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 6486 } else { 6487 clusters_freed++; 6488 } 6489 } 6490 6491 err = ext4_mb_load_buddy(sb, block_group, &e4b); 6492 if (err) 6493 goto error_return; 6494 6495 /* 6496 * need to update group_info->bb_free and bitmap 6497 * with group lock held. 
generate_buddy look at 6498 * them with group lock_held 6499 */ 6500 ext4_lock_group(sb, block_group); 6501 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); 6502 mb_free_blocks(NULL, &e4b, bit, cluster_count); 6503 free_clusters_count = clusters_freed + 6504 ext4_free_group_clusters(sb, desc); 6505 ext4_free_group_clusters_set(sb, desc, free_clusters_count); 6506 ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); 6507 ext4_group_desc_csum_set(sb, block_group, desc); 6508 ext4_unlock_group(sb, block_group); 6509 percpu_counter_add(&sbi->s_freeclusters_counter, 6510 clusters_freed); 6511 6512 if (sbi->s_log_groups_per_flex) { 6513 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6514 atomic64_add(clusters_freed, 6515 &sbi_array_rcu_deref(sbi, s_flex_groups, 6516 flex_group)->free_clusters); 6517 } 6518 6519 ext4_mb_unload_buddy(&e4b); 6520 6521 /* We dirtied the bitmap block */ 6522 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6523 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6524 6525 /* And the group descriptor block */ 6526 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6527 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6528 if (!err) 6529 err = ret; 6530 6531 error_return: 6532 brelse(bitmap_bh); 6533 ext4_std_error(sb, err); 6534 return err; 6535 } 6536 6537 /** 6538 * ext4_trim_extent -- function to TRIM one single free extent in the group 6539 * @sb: super block for the file system 6540 * @start: starting block of the free extent in the alloc. group 6541 * @count: number of blocks to TRIM 6542 * @e4b: ext4 buddy for the group 6543 * 6544 * Trim "count" blocks starting at "start" in the "group". To assure that no 6545 * one will allocate those blocks, mark it as used in buddy bitmap. This must 6546 * be called with under the group lock. 6547 */ 6548 static int ext4_trim_extent(struct super_block *sb, 6549 int start, int count, struct ext4_buddy *e4b) 6550 __releases(bitlock) 6551 __acquires(bitlock) 6552 { 6553 struct ext4_free_extent ex; 6554 ext4_group_t group = e4b->bd_group; 6555 int ret = 0; 6556 6557 trace_ext4_trim_extent(sb, group, start, count); 6558 6559 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 6560 6561 ex.fe_start = start; 6562 ex.fe_group = group; 6563 ex.fe_len = count; 6564 6565 /* 6566 * Mark blocks used, so no one can reuse them while 6567 * being trimmed. 6568 */ 6569 mb_mark_used(e4b, &ex); 6570 ext4_unlock_group(sb, group); 6571 ret = ext4_issue_discard(sb, group, start, count, NULL); 6572 ext4_lock_group(sb, group); 6573 mb_free_blocks(NULL, e4b, start, ex.fe_len); 6574 return ret; 6575 } 6576 6577 static int ext4_try_to_trim_range(struct super_block *sb, 6578 struct ext4_buddy *e4b, ext4_grpblk_t start, 6579 ext4_grpblk_t max, ext4_grpblk_t minblocks) 6580 __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) 6581 __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) 6582 { 6583 ext4_grpblk_t next, count, free_count; 6584 void *bitmap; 6585 6586 bitmap = e4b->bd_bitmap; 6587 start = (e4b->bd_info->bb_first_free > start) ? 
6588 e4b->bd_info->bb_first_free : start; 6589 count = 0; 6590 free_count = 0; 6591 6592 while (start <= max) { 6593 start = mb_find_next_zero_bit(bitmap, max + 1, start); 6594 if (start > max) 6595 break; 6596 next = mb_find_next_bit(bitmap, max + 1, start); 6597 6598 if ((next - start) >= minblocks) { 6599 int ret = ext4_trim_extent(sb, start, next - start, e4b); 6600 6601 if (ret && ret != -EOPNOTSUPP) 6602 break; 6603 count += next - start; 6604 } 6605 free_count += next - start; 6606 start = next + 1; 6607 6608 if (fatal_signal_pending(current)) { 6609 count = -ERESTARTSYS; 6610 break; 6611 } 6612 6613 if (need_resched()) { 6614 ext4_unlock_group(sb, e4b->bd_group); 6615 cond_resched(); 6616 ext4_lock_group(sb, e4b->bd_group); 6617 } 6618 6619 if ((e4b->bd_info->bb_free - free_count) < minblocks) 6620 break; 6621 } 6622 6623 return count; 6624 } 6625 6626 /** 6627 * ext4_trim_all_free -- function to trim all free space in alloc. group 6628 * @sb: super block for file system 6629 * @group: group to be trimmed 6630 * @start: first group block to examine 6631 * @max: last group block to examine 6632 * @minblocks: minimum extent block count 6633 * @set_trimmed: set the trimmed flag if at least one block is trimmed 6634 * 6635 * ext4_trim_all_free walks through group's block bitmap searching for free 6636 * extents. When the free extent is found, mark it as used in group buddy 6637 * bitmap. Then issue a TRIM command on this extent and free the extent in 6638 * the group buddy bitmap. 6639 */ 6640 static ext4_grpblk_t 6641 ext4_trim_all_free(struct super_block *sb, ext4_group_t group, 6642 ext4_grpblk_t start, ext4_grpblk_t max, 6643 ext4_grpblk_t minblocks, bool set_trimmed) 6644 { 6645 struct ext4_buddy e4b; 6646 int ret; 6647 6648 trace_ext4_trim_all_free(sb, group, start, max); 6649 6650 ret = ext4_mb_load_buddy(sb, group, &e4b); 6651 if (ret) { 6652 ext4_warning(sb, "Error %d loading buddy information for %u", 6653 ret, group); 6654 return ret; 6655 } 6656 6657 ext4_lock_group(sb, group); 6658 6659 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || 6660 minblocks < EXT4_SB(sb)->s_last_trim_minblks) { 6661 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); 6662 if (ret >= 0 && set_trimmed) 6663 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 6664 } else { 6665 ret = 0; 6666 } 6667 6668 ext4_unlock_group(sb, group); 6669 ext4_mb_unload_buddy(&e4b); 6670 6671 ext4_debug("trimmed %d blocks in the group %d\n", 6672 ret, group); 6673 6674 return ret; 6675 } 6676 6677 /** 6678 * ext4_trim_fs() -- trim ioctl handle function 6679 * @sb: superblock for filesystem 6680 * @range: fstrim_range structure 6681 * 6682 * start: First Byte to trim 6683 * len: number of Bytes to trim from start 6684 * minlen: minimum extent length in Bytes 6685 * ext4_trim_fs goes through all allocation groups containing Bytes from 6686 * start to start+len. For each such a group ext4_trim_all_free function 6687 * is invoked to trim all free space. 
6688 */ 6689 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 6690 { 6691 unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); 6692 struct ext4_group_info *grp; 6693 ext4_group_t group, first_group, last_group; 6694 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 6695 uint64_t start, end, minlen, trimmed = 0; 6696 ext4_fsblk_t first_data_blk = 6697 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 6698 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); 6699 bool whole_group, eof = false; 6700 int ret = 0; 6701 6702 start = range->start >> sb->s_blocksize_bits; 6703 end = start + (range->len >> sb->s_blocksize_bits) - 1; 6704 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6705 range->minlen >> sb->s_blocksize_bits); 6706 6707 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || 6708 start >= max_blks || 6709 range->len < sb->s_blocksize) 6710 return -EINVAL; 6711 /* No point to try to trim less than discard granularity */ 6712 if (range->minlen < discard_granularity) { 6713 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6714 discard_granularity >> sb->s_blocksize_bits); 6715 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) 6716 goto out; 6717 } 6718 if (end >= max_blks - 1) { 6719 end = max_blks - 1; 6720 eof = true; 6721 } 6722 if (end <= first_data_blk) 6723 goto out; 6724 if (start < first_data_blk) 6725 start = first_data_blk; 6726 6727 /* Determine first and last group to examine based on start and end */ 6728 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 6729 &first_group, &first_cluster); 6730 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, 6731 &last_group, &last_cluster); 6732 6733 /* end now represents the last cluster to discard in this group */ 6734 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6735 whole_group = true; 6736 6737 for (group = first_group; group <= last_group; group++) { 6738 grp = ext4_get_group_info(sb, group); 6739 if (!grp) 6740 continue; 6741 /* We only do this if the grp has never been initialized */ 6742 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 6743 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 6744 if (ret) 6745 break; 6746 } 6747 6748 /* 6749 * For all the groups except the last one, last cluster will 6750 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to 6751 * change it for the last group, note that last_cluster is 6752 * already computed earlier by ext4_get_group_no_and_offset() 6753 */ 6754 if (group == last_group) { 6755 end = last_cluster; 6756 whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6757 } 6758 if (grp->bb_free >= minlen) { 6759 cnt = ext4_trim_all_free(sb, group, first_cluster, 6760 end, minlen, whole_group); 6761 if (cnt < 0) { 6762 ret = cnt; 6763 break; 6764 } 6765 trimmed += cnt; 6766 } 6767 6768 /* 6769 * For every group except the first one, we are sure 6770 * that the first cluster to discard will be cluster #0. 6771 */ 6772 first_cluster = 0; 6773 } 6774 6775 if (!ret) 6776 EXT4_SB(sb)->s_last_trim_minblks = minlen; 6777 6778 out: 6779 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 6780 return ret; 6781 } 6782 6783 /* Iterate all the free extents in the group. 
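 * For each free extent found, the formatter callback is invoked with
 * the group number, the starting cluster and the extent length; the
 * group lock is dropped while the callback runs, and the walk stops
 * early if the callback returns non-zero.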
*/ 6784 int 6785 ext4_mballoc_query_range( 6786 struct super_block *sb, 6787 ext4_group_t group, 6788 ext4_grpblk_t start, 6789 ext4_grpblk_t end, 6790 ext4_mballoc_query_range_fn formatter, 6791 void *priv) 6792 { 6793 void *bitmap; 6794 ext4_grpblk_t next; 6795 struct ext4_buddy e4b; 6796 int error; 6797 6798 error = ext4_mb_load_buddy(sb, group, &e4b); 6799 if (error) 6800 return error; 6801 bitmap = e4b.bd_bitmap; 6802 6803 ext4_lock_group(sb, group); 6804 6805 start = (e4b.bd_info->bb_first_free > start) ? 6806 e4b.bd_info->bb_first_free : start; 6807 if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) 6808 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6809 6810 while (start <= end) { 6811 start = mb_find_next_zero_bit(bitmap, end + 1, start); 6812 if (start > end) 6813 break; 6814 next = mb_find_next_bit(bitmap, end + 1, start); 6815 6816 ext4_unlock_group(sb, group); 6817 error = formatter(sb, group, start, next - start, priv); 6818 if (error) 6819 goto out_unload; 6820 ext4_lock_group(sb, group); 6821 6822 start = next + 1; 6823 } 6824 6825 ext4_unlock_group(sb, group); 6826 out_unload: 6827 ext4_mb_unload_buddy(&e4b); 6828 6829 return error; 6830 } 6831
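
/*
 * Illustrative sketch only (not part of mballoc itself): a minimal
 * ext4_mballoc_query_range_fn callback that just sums up the free
 * clusters it is shown. The signature is inferred from the formatter()
 * invocation above; real users (e.g. the fsmap code) build richer
 * records instead.
 *
 *	static int count_free_fn(struct super_block *sb, ext4_group_t group,
 *				 ext4_grpblk_t start, ext4_grpblk_t len,
 *				 void *priv)
 *	{
 *		*(ext4_grpblk_t *)priv += len;
 *		return 0;
 *	}
 *
 * A caller would pass it to ext4_mballoc_query_range() together with a
 * pointer to an ext4_grpblk_t accumulator as the priv argument.
 */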