// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */


/*
 * mballoc.c contains the multiblock allocation routines
 */

#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *  - test ext4_ext_search_left() and ext4_ext_search_right()
 *  - search for metadata in few groups
 *
 * TODO v4:
 *  - normalization should take into account whether file is still open
 *  - discard preallocations if no free space left (policy?)
 *  - don't normalize tails
 *  - quota
 *  - reservation for superuser
 *
 * TODO v3:
 *  - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *  - track min/max extents in each group for better group selection
 *  - mb_mark_used() may allocate chunk right after splitting buddy
 *  - tree of groups sorted by number of free blocks
 *  - error handling
 */

/*
 * The allocation request involves a request for multiple blocks near the
 * specified goal block.
 *
 * During the initialization phase of the allocator we decide to use
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever is
 * larger. If the size is less than sbi->s_mb_stream_request we select
 * group preallocation. The default value of s_mb_stream_request is 16
 * blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is to
 * ensure that we have small files closer together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains a list of prealloc
 * spaces for this particular inode. An inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   -> free space available in this prealloc space (in clusters)
 *
 * Inode preallocation space is selected by looking at the _logical_ start
 * block. Only if the logical file block falls within the range of a prealloc
 * space do we consume that particular prealloc space. This makes sure that
 * we have contiguous physical blocks representing the file blocks.
 *
 * The important thing to note about inode prealloc space is that we don't
 * modify the values associated with the inode prealloc space except
 * pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. This is a per-CPU prealloc list represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used by checking whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc and/or locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 * mapped to the buddy and bitmap information regarding different
 * groups. The buddy information is attached to the buddy cache inode so
 * that we can access it through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy. It consists of the block
 * bitmap and the buddy information, stored in the inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for the bitmap and the buddy information. So for each
 * group we take up 2 blocks. A page can contain blocks_per_page
 * (PAGE_SIZE / blocksize) blocks, so it can hold information regarding
 * groups_per_page groups, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request.
 * This ensures we ask for more blocks than we actually need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with -O
 * stripe=<value> option the group prealloc request is normalized to the
 * smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * If the "mb_optimize_scan" mount option is set, we maintain in-memory
 * group info structures in two data structures:
 *
 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
 *
 *    Locking: sbi->s_mb_largest_free_orders_locks (array of rw locks)
 *
 *    This is an array of lists where the index in the array represents the
 *    largest free order in the buddy bitmap of the participating group infos
 *    of that list. So, there are exactly MB_NUM_ORDERS(sb) (which means the
 *    total number of buddy bitmap orders possible) lists. Group-infos are
 *    placed in the appropriate lists.
 *
 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
 *
 *    Locking: sbi->s_mb_avg_fragment_size_locks (array of rw locks)
 *
 *    This is an array of lists where the i-th list contains groups with
 *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
 *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
 *    Note that we don't bother with a special list for completely empty
 *    groups, so we only have MB_NUM_ORDERS(sb) lists.
 *
 * When the "mb_optimize_scan" mount option is set, mballoc consults the above
 * data structures to decide the order in which groups are to be traversed for
 * fulfilling an allocation request.
 *
 * At CR = 0, we look for groups which have largest_free_order >= the order
 * of the request. We directly look at the largest free order list in the data
 * structure (1) above where largest_free_order = order of the request. If that
 * list is empty, we look at the remaining lists in increasing order of
 * largest_free_order. This allows us to perform the CR = 0 lookup in O(1)
 * time.
 *
 * At CR = 1, we only consider groups where the average fragment size > request
 * size. So, we look up a group which has an average fragment size just above
 * or equal to the request size using our average fragment size group lists
 * (data structure 2) in O(1) time.
 *
 * If the "mb_optimize_scan" mount option is not set, mballoc traverses groups
 * in linear order, which requires O(N) search time for each of the CR 0 and
 * CR 1 phases.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 * /sys/fs/ext4/<partition>/mb_linear_limit
 *
 * The regular allocator uses the buddy scan only if the request length is a
 * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
 * The value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req. If the request length is equal to
 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
 * stripe-size units. This should result in better allocation on RAID setups.
 * If not, we search in the specific group using the bitmap for best extents.
 * The tunables min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicates how long mballoc __must__ look for a best extent
 * and max_to_scan indicates how long mballoc __can__ look for a best
 * extent among the found extents. Searching for the blocks starts with
 * the group specified as the goal value in the allocation context via
 * ac_g_ex. Each group is first checked based on the criteria for whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
 * get traversed linearly. That may result in subsequent allocations not being
 * close to each other. And so, the underlying device may get filled up in a
 * non-linear fashion. While that may not matter on non-rotational devices, for
 * rotational devices that may result in higher seek times. "mb_linear_limit"
 * tells mballoc how many groups it should search linearly before consulting
 * the above data structures for more efficient lookups. For non-rotational
 * devices, this value defaults to 0 and for rotational devices this is set to
 * MB_DEFAULT_LINEAR_LIMIT.
 *
 * Both types of prealloc space are populated as described above. So for the
 * first request we will hit the buddy cache, which results in this prealloc
 * space getting filled. The prealloc space is then later used for
 * subsequent requests.
 */
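
/*
 * Illustrative sketch only (not used by mballoc itself): how a group's bitmap
 * and buddy blocks map onto the buddy cache inode described above, assuming
 * the "two blocks per group" layout.  The real lookups of this kind are done
 * in ext4_mb_get_buddy_page_lock() and ext4_mb_load_buddy_gfp(); the helper
 * name below is hypothetical.
 */
static inline void mb_example_buddy_cache_pos(ext4_group_t group,
					      unsigned int blocksize,
					      pgoff_t *pnum,
					      unsigned int *poff)
{
	unsigned int blocks_per_page = PAGE_SIZE / blocksize;
	/* block 2 * group holds the bitmap, block 2 * group + 1 the buddy */
	unsigned int block = group * 2;

	*pnum = block / blocks_per_page;	/* page index in s_buddy_cache */
	*poff = block % blocks_per_page;	/* block offset within the page */
}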

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocated space can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to
 *    a permanent set of inodes: an inode can join and leave a group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in the mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count the number
 * of blocks: how many blocks are marked used/free in the on-disk bitmap,
 * the buddy and the PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation because we can't know the actual
 *        used bits from the PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given that some of them can block, we'd have to use something like
 * semaphores, killing performance on high-end SMP hardware. let's try to
 * relax it using the following knowledge:
 *  1) if the buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 *     bitmap has a bit set and a PA claims the same block, it's OK. IOW, one
 *     can set a bit in the on-disk bitmap if the buddy has the same bit set
 *     and/or a PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for the PA are allocated in the buddy, the buddy must be
 *      referenced until the PA is linked to the allocation group to avoid
 *      concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either the on-disk bitmap or the PA has
 *      up-to-date data. given (3) we care that the PA -= N operation doesn't
 *      interfere with init
 *    - discard inode PA
 *      the simplest way would be to have the buddy initialized by the discard
 *    - use locality group PA
 *      again, PA -= N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have the buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *  - use inode PA vs.
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *
 * now we're ready to draw a few consequences:
 *  - a PA is referenced and while it is, no discard is possible
 *  - a PA is referenced until the block is marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as the sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness. no need to modify
 * the buddy in this case, but we should care about concurrent init
 *
 */
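
/*
 * A minimal sketch of the accounting rules above in plain counters
 * (illustrative only; mballoc itself tracks this with bitmaps, and the
 * struct/helper names here are hypothetical).  The invariant is that the
 * blocks the buddy considers used are the blocks used on disk plus the
 * blocks still sitting unused in PAs.
 */
struct mb_example_counts {
	int ondisk_used;	/* bits set in the on-disk bitmap */
	int pa_unused;		/* blocks reserved in PAs but not yet used */
	int buddy_used;		/* bits set in the in-core buddy bitmap */
};

/* init buddy: buddy = on-disk + PAs */
static inline void mb_example_init_buddy(struct mb_example_counts *c)
{
	c->buddy_used = c->ondisk_used + c->pa_unused;
}

/* new PA of n blocks: buddy += N; PA = N */
static inline void mb_example_new_pa(struct mb_example_counts *c, int n)
{
	c->buddy_used += n;
	c->pa_unused += n;
}

/* use n blocks from a PA: on-disk += N; PA -= N; the buddy is unchanged */
static inline void mb_example_use_pa(struct mb_example_counts *c, int n)
{
	c->ondisk_used += n;
	c->pa_unused -= n;
}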

/*
 * Logic in a few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group		(group)
 *  - object (inode/locality)		(object)
 *  - per-pa lock			(pa)
 *  - cr0 lists lock			(cr0)
 *  - cr1 tree lock			(cr1)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *        pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *        pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *        pa
 *    group
 *        object
 *
 *  - allocation path (ext4_mb_regular_allocator)
 *    group
 *    cr0/cr1
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
						ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
			       ext4_group_t group, int cr);

static int ext4_try_to_trim_range(struct super_block *sb,
		struct ext4_buddy *e4b, ext4_grpblk_t start,
		ext4_grpblk_t max, ext4_grpblk_t minblocks);

/*
 * The algorithm using this percpu seq counter goes as follows:
 * 1. We sample the percpu discard_pa_seq counter before trying for block
 *    allocation in ext4_mb_new_blocks().
 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 *    or free these blocks, i.e. while marking those blocks as used/free in
 *    mb_mark_used()/mb_free_blocks().
 * 3. We also increment this percpu seq counter when we successfully identify
 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 *    of those PAs inside ext4_mb_discard_group_preallocations().
428 * 429 * Now to make sure that the regular fast path of block allocation is not 430 * affected, as a small optimization we only sample the percpu seq counter 431 * on that cpu. Only when the block allocation fails and when freed blocks 432 * found were 0, that is when we sample percpu seq counter for all cpus using 433 * below function ext4_get_discard_pa_seq_sum(). This happens after making 434 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. 435 */ 436 static DEFINE_PER_CPU(u64, discard_pa_seq); 437 static inline u64 ext4_get_discard_pa_seq_sum(void) 438 { 439 int __cpu; 440 u64 __seq = 0; 441 442 for_each_possible_cpu(__cpu) 443 __seq += per_cpu(discard_pa_seq, __cpu); 444 return __seq; 445 } 446 447 static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 448 { 449 #if BITS_PER_LONG == 64 450 *bit += ((unsigned long) addr & 7UL) << 3; 451 addr = (void *) ((unsigned long) addr & ~7UL); 452 #elif BITS_PER_LONG == 32 453 *bit += ((unsigned long) addr & 3UL) << 3; 454 addr = (void *) ((unsigned long) addr & ~3UL); 455 #else 456 #error "how many bits you are?!" 457 #endif 458 return addr; 459 } 460 461 static inline int mb_test_bit(int bit, void *addr) 462 { 463 /* 464 * ext4_test_bit on architecture like powerpc 465 * needs unsigned long aligned address 466 */ 467 addr = mb_correct_addr_and_bit(&bit, addr); 468 return ext4_test_bit(bit, addr); 469 } 470 471 static inline void mb_set_bit(int bit, void *addr) 472 { 473 addr = mb_correct_addr_and_bit(&bit, addr); 474 ext4_set_bit(bit, addr); 475 } 476 477 static inline void mb_clear_bit(int bit, void *addr) 478 { 479 addr = mb_correct_addr_and_bit(&bit, addr); 480 ext4_clear_bit(bit, addr); 481 } 482 483 static inline int mb_test_and_clear_bit(int bit, void *addr) 484 { 485 addr = mb_correct_addr_and_bit(&bit, addr); 486 return ext4_test_and_clear_bit(bit, addr); 487 } 488 489 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 490 { 491 int fix = 0, ret, tmpmax; 492 addr = mb_correct_addr_and_bit(&fix, addr); 493 tmpmax = max + fix; 494 start += fix; 495 496 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 497 if (ret > max) 498 return max; 499 return ret; 500 } 501 502 static inline int mb_find_next_bit(void *addr, int max, int start) 503 { 504 int fix = 0, ret, tmpmax; 505 addr = mb_correct_addr_and_bit(&fix, addr); 506 tmpmax = max + fix; 507 start += fix; 508 509 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 510 if (ret > max) 511 return max; 512 return ret; 513 } 514 515 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 516 { 517 char *bb; 518 519 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 520 BUG_ON(max == NULL); 521 522 if (order > e4b->bd_blkbits + 1) { 523 *max = 0; 524 return NULL; 525 } 526 527 /* at order 0 we see each particular block */ 528 if (order == 0) { 529 *max = 1 << (e4b->bd_blkbits + 3); 530 return e4b->bd_bitmap; 531 } 532 533 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 534 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 535 536 return bb; 537 } 538 539 #ifdef DOUBLE_CHECK 540 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 541 int first, int count) 542 { 543 int i; 544 struct super_block *sb = e4b->bd_sb; 545 546 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 547 return; 548 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 549 for (i = 0; i < count; i++) { 550 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 551 ext4_fsblk_t blocknr; 552 553 blocknr = 
ext4_group_first_block_no(sb, e4b->bd_group); 554 blocknr += EXT4_C2B(EXT4_SB(sb), first + i); 555 ext4_grp_locked_error(sb, e4b->bd_group, 556 inode ? inode->i_ino : 0, 557 blocknr, 558 "freeing block already freed " 559 "(bit %u)", 560 first + i); 561 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 562 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 563 } 564 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 565 } 566 } 567 568 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 569 { 570 int i; 571 572 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 573 return; 574 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 575 for (i = 0; i < count; i++) { 576 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 577 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 578 } 579 } 580 581 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 582 { 583 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 584 return; 585 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 586 unsigned char *b1, *b2; 587 int i; 588 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 589 b2 = (unsigned char *) bitmap; 590 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 591 if (b1[i] != b2[i]) { 592 ext4_msg(e4b->bd_sb, KERN_ERR, 593 "corruption in group %u " 594 "at byte %u(%u): %x in copy != %x " 595 "on disk/prealloc", 596 e4b->bd_group, i, i * 8, b1[i], b2[i]); 597 BUG(); 598 } 599 } 600 } 601 } 602 603 static void mb_group_bb_bitmap_alloc(struct super_block *sb, 604 struct ext4_group_info *grp, ext4_group_t group) 605 { 606 struct buffer_head *bh; 607 608 grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); 609 if (!grp->bb_bitmap) 610 return; 611 612 bh = ext4_read_block_bitmap(sb, group); 613 if (IS_ERR_OR_NULL(bh)) { 614 kfree(grp->bb_bitmap); 615 grp->bb_bitmap = NULL; 616 return; 617 } 618 619 memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); 620 put_bh(bh); 621 } 622 623 static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 624 { 625 kfree(grp->bb_bitmap); 626 } 627 628 #else 629 static inline void mb_free_blocks_double(struct inode *inode, 630 struct ext4_buddy *e4b, int first, int count) 631 { 632 return; 633 } 634 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 635 int first, int count) 636 { 637 return; 638 } 639 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 640 { 641 return; 642 } 643 644 static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, 645 struct ext4_group_info *grp, ext4_group_t group) 646 { 647 return; 648 } 649 650 static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 651 { 652 return; 653 } 654 #endif 655 656 #ifdef AGGRESSIVE_CHECK 657 658 #define MB_CHECK_ASSERT(assert) \ 659 do { \ 660 if (!(assert)) { \ 661 printk(KERN_EMERG \ 662 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 663 function, file, line, # assert); \ 664 BUG(); \ 665 } \ 666 } while (0) 667 668 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 669 const char *function, int line) 670 { 671 struct super_block *sb = e4b->bd_sb; 672 int order = e4b->bd_blkbits + 1; 673 int max; 674 int max2; 675 int i; 676 int j; 677 int k; 678 int count; 679 struct ext4_group_info *grp; 680 int fragments = 0; 681 int fstart; 682 struct list_head *cur; 683 void *buddy; 684 void *buddy2; 685 686 if (e4b->bd_info->bb_check_counter++ % 10) 687 return 0; 688 689 while (order > 1) { 690 buddy = mb_find_buddy(e4b, order, &max); 691 MB_CHECK_ASSERT(buddy); 692 buddy2 = mb_find_buddy(e4b, order - 
1, &max2); 693 MB_CHECK_ASSERT(buddy2); 694 MB_CHECK_ASSERT(buddy != buddy2); 695 MB_CHECK_ASSERT(max * 2 == max2); 696 697 count = 0; 698 for (i = 0; i < max; i++) { 699 700 if (mb_test_bit(i, buddy)) { 701 /* only single bit in buddy2 may be 0 */ 702 if (!mb_test_bit(i << 1, buddy2)) { 703 MB_CHECK_ASSERT( 704 mb_test_bit((i<<1)+1, buddy2)); 705 } 706 continue; 707 } 708 709 /* both bits in buddy2 must be 1 */ 710 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 711 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 712 713 for (j = 0; j < (1 << order); j++) { 714 k = (i * (1 << order)) + j; 715 MB_CHECK_ASSERT( 716 !mb_test_bit(k, e4b->bd_bitmap)); 717 } 718 count++; 719 } 720 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 721 order--; 722 } 723 724 fstart = -1; 725 buddy = mb_find_buddy(e4b, 0, &max); 726 for (i = 0; i < max; i++) { 727 if (!mb_test_bit(i, buddy)) { 728 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 729 if (fstart == -1) { 730 fragments++; 731 fstart = i; 732 } 733 continue; 734 } 735 fstart = -1; 736 /* check used bits only */ 737 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 738 buddy2 = mb_find_buddy(e4b, j, &max2); 739 k = i >> j; 740 MB_CHECK_ASSERT(k < max2); 741 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 742 } 743 } 744 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 745 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 746 747 grp = ext4_get_group_info(sb, e4b->bd_group); 748 list_for_each(cur, &grp->bb_prealloc_list) { 749 ext4_group_t groupnr; 750 struct ext4_prealloc_space *pa; 751 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 752 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 753 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 754 for (i = 0; i < pa->pa_len; i++) 755 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 756 } 757 return 0; 758 } 759 #undef MB_CHECK_ASSERT 760 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 761 __FILE__, __func__, __LINE__) 762 #else 763 #define mb_check_buddy(e4b) 764 #endif 765 766 /* 767 * Divide blocks started from @first with length @len into 768 * smaller chunks with power of 2 blocks. 769 * Clear the bits in bitmap which the blocks of the chunk(s) covered, 770 * then increase bb_counters[] for corresponded chunk size. 771 */ 772 static void ext4_mb_mark_free_simple(struct super_block *sb, 773 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 774 struct ext4_group_info *grp) 775 { 776 struct ext4_sb_info *sbi = EXT4_SB(sb); 777 ext4_grpblk_t min; 778 ext4_grpblk_t max; 779 ext4_grpblk_t chunk; 780 unsigned int border; 781 782 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 783 784 border = 2 << sb->s_blocksize_bits; 785 786 while (len > 0) { 787 /* find how many blocks can be covered since this position */ 788 max = ffs(first | border) - 1; 789 790 /* find how many blocks of power 2 we need to mark */ 791 min = fls(len) - 1; 792 793 if (max < min) 794 min = max; 795 chunk = 1 << min; 796 797 /* mark multiblock chunks only */ 798 grp->bb_counters[min]++; 799 if (min > 0) 800 mb_clear_bit(first >> min, 801 buddy + sbi->s_mb_offsets[min]); 802 803 len -= chunk; 804 first += chunk; 805 } 806 } 807 808 static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) 809 { 810 int order; 811 812 /* 813 * We don't bother with a special lists groups with only 1 block free 814 * extents and for completely empty groups. 
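 *
 * For example (illustrative): len = 1 gives fls(1) - 2 = -1, which is
 * clamped to list 0; len = 2..3 also maps to list 0; len = 4..7 to list 1;
 * len = 512 to fls(512) - 2 = 8.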
815 */ 816 order = fls(len) - 2; 817 if (order < 0) 818 return 0; 819 if (order == MB_NUM_ORDERS(sb)) 820 order--; 821 return order; 822 } 823 824 /* Move group to appropriate avg_fragment_size list */ 825 static void 826 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 827 { 828 struct ext4_sb_info *sbi = EXT4_SB(sb); 829 int new_order; 830 831 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 832 return; 833 834 new_order = mb_avg_fragment_size_order(sb, 835 grp->bb_free / grp->bb_fragments); 836 if (new_order == grp->bb_avg_fragment_size_order) 837 return; 838 839 if (grp->bb_avg_fragment_size_order != -1) { 840 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 841 grp->bb_avg_fragment_size_order]); 842 list_del(&grp->bb_avg_fragment_size_node); 843 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 844 grp->bb_avg_fragment_size_order]); 845 } 846 grp->bb_avg_fragment_size_order = new_order; 847 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 848 grp->bb_avg_fragment_size_order]); 849 list_add_tail(&grp->bb_avg_fragment_size_node, 850 &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); 851 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 852 grp->bb_avg_fragment_size_order]); 853 } 854 855 /* 856 * Choose next group by traversing largest_free_order lists. Updates *new_cr if 857 * cr level needs an update. 858 */ 859 static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, 860 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 861 { 862 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 863 struct ext4_group_info *iter, *grp; 864 int i; 865 866 if (ac->ac_status == AC_STATUS_FOUND) 867 return; 868 869 if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) 870 atomic_inc(&sbi->s_bal_cr0_bad_suggestions); 871 872 grp = NULL; 873 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 874 if (list_empty(&sbi->s_mb_largest_free_orders[i])) 875 continue; 876 read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 877 if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 878 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 879 continue; 880 } 881 grp = NULL; 882 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 883 bb_largest_free_order_node) { 884 if (sbi->s_mb_stats) 885 atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); 886 if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { 887 grp = iter; 888 break; 889 } 890 } 891 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 892 if (grp) 893 break; 894 } 895 896 if (!grp) { 897 /* Increment cr and search again */ 898 *new_cr = 1; 899 } else { 900 *group = grp->bb_group; 901 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 902 } 903 } 904 905 /* 906 * Choose next group by traversing average fragment size list of suitable 907 * order. Updates *new_cr if cr level needs an update. 
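 *
 * For example (illustrative), a request for 100 clusters starts at list
 * mb_avg_fragment_size_order(sb, 100) = fls(100) - 2 = 5, i.e. groups whose
 * average free extent is at least 64 clusters, and moves on to higher lists
 * if no suitable group is found there.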
908 */ 909 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 910 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 911 { 912 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 913 struct ext4_group_info *grp = NULL, *iter; 914 int i; 915 916 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 917 if (sbi->s_mb_stats) 918 atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 919 } 920 921 for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 922 i < MB_NUM_ORDERS(ac->ac_sb); i++) { 923 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) 924 continue; 925 read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); 926 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { 927 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 928 continue; 929 } 930 list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], 931 bb_avg_fragment_size_node) { 932 if (sbi->s_mb_stats) 933 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 934 if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { 935 grp = iter; 936 break; 937 } 938 } 939 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 940 if (grp) 941 break; 942 } 943 944 if (grp) { 945 *group = grp->bb_group; 946 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 947 } else { 948 *new_cr = 2; 949 } 950 } 951 952 static inline int should_optimize_scan(struct ext4_allocation_context *ac) 953 { 954 if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) 955 return 0; 956 if (ac->ac_criteria >= 2) 957 return 0; 958 if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) 959 return 0; 960 return 1; 961 } 962 963 /* 964 * Return next linear group for allocation. If linear traversal should not be 965 * performed, this function just returns the same group 966 */ 967 static int 968 next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) 969 { 970 if (!should_optimize_scan(ac)) 971 goto inc_and_return; 972 973 if (ac->ac_groups_linear_remaining) { 974 ac->ac_groups_linear_remaining--; 975 goto inc_and_return; 976 } 977 978 return group; 979 inc_and_return: 980 /* 981 * Artificially restricted ngroups for non-extent 982 * files makes group > ngroups possible on first loop. 983 */ 984 return group + 1 >= ngroups ? 0 : group + 1; 985 } 986 987 /* 988 * ext4_mb_choose_next_group: choose next group for allocation. 989 * 990 * @ac Allocation Context 991 * @new_cr This is an output parameter. If the there is no good group 992 * available at current CR level, this field is updated to indicate 993 * the new cr level that should be used. 994 * @group This is an input / output parameter. As an input it indicates the 995 * next group that the allocator intends to use for allocation. As 996 * output, this field indicates the next group that should be used as 997 * determined by the optimization functions. 998 * @ngroups Total number of groups 999 */ 1000 static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 1001 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 1002 { 1003 *new_cr = ac->ac_criteria; 1004 1005 if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { 1006 *group = next_linear_group(ac, *group, ngroups); 1007 return; 1008 } 1009 1010 if (*new_cr == 0) { 1011 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 1012 } else if (*new_cr == 1) { 1013 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1014 } else { 1015 /* 1016 * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1017 * bb_free. But until that happens, we should never come here. 
1018 */ 1019 WARN_ON(1); 1020 } 1021 } 1022 1023 /* 1024 * Cache the order of the largest free extent we have available in this block 1025 * group. 1026 */ 1027 static void 1028 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1029 { 1030 struct ext4_sb_info *sbi = EXT4_SB(sb); 1031 int i; 1032 1033 for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) 1034 if (grp->bb_counters[i] > 0) 1035 break; 1036 /* No need to move between order lists? */ 1037 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || 1038 i == grp->bb_largest_free_order) { 1039 grp->bb_largest_free_order = i; 1040 return; 1041 } 1042 1043 if (grp->bb_largest_free_order >= 0) { 1044 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1045 grp->bb_largest_free_order]); 1046 list_del_init(&grp->bb_largest_free_order_node); 1047 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1048 grp->bb_largest_free_order]); 1049 } 1050 grp->bb_largest_free_order = i; 1051 if (grp->bb_largest_free_order >= 0 && grp->bb_free) { 1052 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1053 grp->bb_largest_free_order]); 1054 list_add_tail(&grp->bb_largest_free_order_node, 1055 &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 1056 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1057 grp->bb_largest_free_order]); 1058 } 1059 } 1060 1061 static noinline_for_stack 1062 void ext4_mb_generate_buddy(struct super_block *sb, 1063 void *buddy, void *bitmap, ext4_group_t group) 1064 { 1065 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1066 struct ext4_sb_info *sbi = EXT4_SB(sb); 1067 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 1068 ext4_grpblk_t i = 0; 1069 ext4_grpblk_t first; 1070 ext4_grpblk_t len; 1071 unsigned free = 0; 1072 unsigned fragments = 0; 1073 unsigned long long period = get_cycles(); 1074 1075 /* initialize buddy from bitmap which is aggregation 1076 * of on-disk bitmap and preallocations */ 1077 i = mb_find_next_zero_bit(bitmap, max, 0); 1078 grp->bb_first_free = i; 1079 while (i < max) { 1080 fragments++; 1081 first = i; 1082 i = mb_find_next_bit(bitmap, max, i); 1083 len = i - first; 1084 free += len; 1085 if (len > 1) 1086 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 1087 else 1088 grp->bb_counters[0]++; 1089 if (i < max) 1090 i = mb_find_next_zero_bit(bitmap, max, i); 1091 } 1092 grp->bb_fragments = fragments; 1093 1094 if (free != grp->bb_free) { 1095 ext4_grp_locked_error(sb, group, 0, 0, 1096 "block bitmap and bg descriptor " 1097 "inconsistent: %u vs %u free clusters", 1098 free, grp->bb_free); 1099 /* 1100 * If we intend to continue, we consider group descriptor 1101 * corrupt and update bb_free using bitmap value 1102 */ 1103 grp->bb_free = free; 1104 ext4_mark_group_bitmap_corrupted(sb, group, 1105 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1106 } 1107 mb_set_largest_free_order(sb, grp); 1108 mb_update_avg_fragment_size(sb, grp); 1109 1110 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1111 1112 period = get_cycles() - period; 1113 atomic_inc(&sbi->s_mb_buddies_generated); 1114 atomic64_add(period, &sbi->s_mb_generation_time); 1115 } 1116 1117 /* The buddy information is attached the buddy cache inode 1118 * for convenience. The information regarding each group 1119 * is loaded via ext4_mb_load_buddy. The information involve 1120 * block bitmap and buddy information. The information are 1121 * stored in the inode as 1122 * 1123 * { page } 1124 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 1125 * 1126 * 1127 * one block each for bitmap and buddy information. 
1128 * So for each group we take up 2 blocks. A page can 1129 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. 1130 * So it can have information regarding groups_per_page which 1131 * is blocks_per_page/2 1132 * 1133 * Locking note: This routine takes the block group lock of all groups 1134 * for this page; do not hold this lock when calling this routine! 1135 */ 1136 1137 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) 1138 { 1139 ext4_group_t ngroups; 1140 int blocksize; 1141 int blocks_per_page; 1142 int groups_per_page; 1143 int err = 0; 1144 int i; 1145 ext4_group_t first_group, group; 1146 int first_block; 1147 struct super_block *sb; 1148 struct buffer_head *bhs; 1149 struct buffer_head **bh = NULL; 1150 struct inode *inode; 1151 char *data; 1152 char *bitmap; 1153 struct ext4_group_info *grinfo; 1154 1155 inode = page->mapping->host; 1156 sb = inode->i_sb; 1157 ngroups = ext4_get_groups_count(sb); 1158 blocksize = i_blocksize(inode); 1159 blocks_per_page = PAGE_SIZE / blocksize; 1160 1161 mb_debug(sb, "init page %lu\n", page->index); 1162 1163 groups_per_page = blocks_per_page >> 1; 1164 if (groups_per_page == 0) 1165 groups_per_page = 1; 1166 1167 /* allocate buffer_heads to read bitmaps */ 1168 if (groups_per_page > 1) { 1169 i = sizeof(struct buffer_head *) * groups_per_page; 1170 bh = kzalloc(i, gfp); 1171 if (bh == NULL) 1172 return -ENOMEM; 1173 } else 1174 bh = &bhs; 1175 1176 first_group = page->index * blocks_per_page / 2; 1177 1178 /* read all groups the page covers into the cache */ 1179 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1180 if (group >= ngroups) 1181 break; 1182 1183 grinfo = ext4_get_group_info(sb, group); 1184 /* 1185 * If page is uptodate then we came here after online resize 1186 * which added some new uninitialized group info structs, so 1187 * we must skip all initialized uptodate buddies on the page, 1188 * which may be currently in use by an allocating task. 
1189 */ 1190 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { 1191 bh[i] = NULL; 1192 continue; 1193 } 1194 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); 1195 if (IS_ERR(bh[i])) { 1196 err = PTR_ERR(bh[i]); 1197 bh[i] = NULL; 1198 goto out; 1199 } 1200 mb_debug(sb, "read bitmap for group %u\n", group); 1201 } 1202 1203 /* wait for I/O completion */ 1204 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1205 int err2; 1206 1207 if (!bh[i]) 1208 continue; 1209 err2 = ext4_wait_block_bitmap(sb, group, bh[i]); 1210 if (!err) 1211 err = err2; 1212 } 1213 1214 first_block = page->index * blocks_per_page; 1215 for (i = 0; i < blocks_per_page; i++) { 1216 group = (first_block + i) >> 1; 1217 if (group >= ngroups) 1218 break; 1219 1220 if (!bh[group - first_group]) 1221 /* skip initialized uptodate buddy */ 1222 continue; 1223 1224 if (!buffer_verified(bh[group - first_group])) 1225 /* Skip faulty bitmaps */ 1226 continue; 1227 err = 0; 1228 1229 /* 1230 * data carry information regarding this 1231 * particular group in the format specified 1232 * above 1233 * 1234 */ 1235 data = page_address(page) + (i * blocksize); 1236 bitmap = bh[group - first_group]->b_data; 1237 1238 /* 1239 * We place the buddy block and bitmap block 1240 * close together 1241 */ 1242 if ((first_block + i) & 1) { 1243 /* this is block of buddy */ 1244 BUG_ON(incore == NULL); 1245 mb_debug(sb, "put buddy for group %u in page %lu/%x\n", 1246 group, page->index, i * blocksize); 1247 trace_ext4_mb_buddy_bitmap_load(sb, group); 1248 grinfo = ext4_get_group_info(sb, group); 1249 grinfo->bb_fragments = 0; 1250 memset(grinfo->bb_counters, 0, 1251 sizeof(*grinfo->bb_counters) * 1252 (MB_NUM_ORDERS(sb))); 1253 /* 1254 * incore got set to the group block bitmap below 1255 */ 1256 ext4_lock_group(sb, group); 1257 /* init the buddy */ 1258 memset(data, 0xff, blocksize); 1259 ext4_mb_generate_buddy(sb, data, incore, group); 1260 ext4_unlock_group(sb, group); 1261 incore = NULL; 1262 } else { 1263 /* this is block of bitmap */ 1264 BUG_ON(incore != NULL); 1265 mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", 1266 group, page->index, i * blocksize); 1267 trace_ext4_mb_bitmap_load(sb, group); 1268 1269 /* see comments in ext4_mb_put_pa() */ 1270 ext4_lock_group(sb, group); 1271 memcpy(data, bitmap, blocksize); 1272 1273 /* mark all preallocated blks used in in-core bitmap */ 1274 ext4_mb_generate_from_pa(sb, data, group); 1275 ext4_mb_generate_from_freelist(sb, data, group); 1276 ext4_unlock_group(sb, group); 1277 1278 /* set incore so that the buddy information can be 1279 * generated using this 1280 */ 1281 incore = data; 1282 } 1283 } 1284 SetPageUptodate(page); 1285 1286 out: 1287 if (bh) { 1288 for (i = 0; i < groups_per_page; i++) 1289 brelse(bh[i]); 1290 if (bh != &bhs) 1291 kfree(bh); 1292 } 1293 return err; 1294 } 1295 1296 /* 1297 * Lock the buddy and bitmap pages. This make sure other parallel init_group 1298 * on the same buddy page doesn't happen whild holding the buddy page lock. 1299 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 1300 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 
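 *
 * Worked example (illustrative): with 4k pages and a 1k block size,
 * blocks_per_page = 4, so group 5's bitmap is buddy-cache block 2 * 5 = 10,
 * i.e. page 2 at offset 2, and its buddy is block 11 in the same page, so
 * bd_buddy_page is left NULL.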
1301 */ 1302 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1303 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) 1304 { 1305 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1306 int block, pnum, poff; 1307 int blocks_per_page; 1308 struct page *page; 1309 1310 e4b->bd_buddy_page = NULL; 1311 e4b->bd_bitmap_page = NULL; 1312 1313 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1314 /* 1315 * the buddy cache inode stores the block bitmap 1316 * and buddy information in consecutive blocks. 1317 * So for each group we need two blocks. 1318 */ 1319 block = group * 2; 1320 pnum = block / blocks_per_page; 1321 poff = block % blocks_per_page; 1322 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1323 if (!page) 1324 return -ENOMEM; 1325 BUG_ON(page->mapping != inode->i_mapping); 1326 e4b->bd_bitmap_page = page; 1327 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1328 1329 if (blocks_per_page >= 2) { 1330 /* buddy and bitmap are on the same page */ 1331 return 0; 1332 } 1333 1334 block++; 1335 pnum = block / blocks_per_page; 1336 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1337 if (!page) 1338 return -ENOMEM; 1339 BUG_ON(page->mapping != inode->i_mapping); 1340 e4b->bd_buddy_page = page; 1341 return 0; 1342 } 1343 1344 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1345 { 1346 if (e4b->bd_bitmap_page) { 1347 unlock_page(e4b->bd_bitmap_page); 1348 put_page(e4b->bd_bitmap_page); 1349 } 1350 if (e4b->bd_buddy_page) { 1351 unlock_page(e4b->bd_buddy_page); 1352 put_page(e4b->bd_buddy_page); 1353 } 1354 } 1355 1356 /* 1357 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1358 * block group lock of all groups for this page; do not hold the BG lock when 1359 * calling this routine! 1360 */ 1361 static noinline_for_stack 1362 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) 1363 { 1364 1365 struct ext4_group_info *this_grp; 1366 struct ext4_buddy e4b; 1367 struct page *page; 1368 int ret = 0; 1369 1370 might_sleep(); 1371 mb_debug(sb, "init group %u\n", group); 1372 this_grp = ext4_get_group_info(sb, group); 1373 /* 1374 * This ensures that we don't reinit the buddy cache 1375 * page which map to the group from which we are already 1376 * allocating. If we are looking at the buddy cache we would 1377 * have taken a reference using ext4_mb_load_buddy and that 1378 * would have pinned buddy page to page cache. 1379 * The call to ext4_mb_get_buddy_page_lock will mark the 1380 * page accessed. 
1381 */ 1382 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); 1383 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1384 /* 1385 * somebody initialized the group 1386 * return without doing anything 1387 */ 1388 goto err; 1389 } 1390 1391 page = e4b.bd_bitmap_page; 1392 ret = ext4_mb_init_cache(page, NULL, gfp); 1393 if (ret) 1394 goto err; 1395 if (!PageUptodate(page)) { 1396 ret = -EIO; 1397 goto err; 1398 } 1399 1400 if (e4b.bd_buddy_page == NULL) { 1401 /* 1402 * If both the bitmap and buddy are in 1403 * the same page we don't need to force 1404 * init the buddy 1405 */ 1406 ret = 0; 1407 goto err; 1408 } 1409 /* init buddy cache */ 1410 page = e4b.bd_buddy_page; 1411 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); 1412 if (ret) 1413 goto err; 1414 if (!PageUptodate(page)) { 1415 ret = -EIO; 1416 goto err; 1417 } 1418 err: 1419 ext4_mb_put_buddy_page_lock(&e4b); 1420 return ret; 1421 } 1422 1423 /* 1424 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1425 * block group lock of all groups for this page; do not hold the BG lock when 1426 * calling this routine! 1427 */ 1428 static noinline_for_stack int 1429 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, 1430 struct ext4_buddy *e4b, gfp_t gfp) 1431 { 1432 int blocks_per_page; 1433 int block; 1434 int pnum; 1435 int poff; 1436 struct page *page; 1437 int ret; 1438 struct ext4_group_info *grp; 1439 struct ext4_sb_info *sbi = EXT4_SB(sb); 1440 struct inode *inode = sbi->s_buddy_cache; 1441 1442 might_sleep(); 1443 mb_debug(sb, "load group %u\n", group); 1444 1445 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1446 grp = ext4_get_group_info(sb, group); 1447 1448 e4b->bd_blkbits = sb->s_blocksize_bits; 1449 e4b->bd_info = grp; 1450 e4b->bd_sb = sb; 1451 e4b->bd_group = group; 1452 e4b->bd_buddy_page = NULL; 1453 e4b->bd_bitmap_page = NULL; 1454 1455 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1456 /* 1457 * we need full data about the group 1458 * to make a good selection 1459 */ 1460 ret = ext4_mb_init_group(sb, group, gfp); 1461 if (ret) 1462 return ret; 1463 } 1464 1465 /* 1466 * the buddy cache inode stores the block bitmap 1467 * and buddy information in consecutive blocks. 1468 * So for each group we need two blocks. 1469 */ 1470 block = group * 2; 1471 pnum = block / blocks_per_page; 1472 poff = block % blocks_per_page; 1473 1474 /* we could use find_or_create_page(), but it locks page 1475 * what we'd like to avoid in fast path ... */ 1476 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1477 if (page == NULL || !PageUptodate(page)) { 1478 if (page) 1479 /* 1480 * drop the page reference and try 1481 * to get the page with lock. If we 1482 * are not uptodate that implies 1483 * somebody just created the page but 1484 * is yet to initialize the same. So 1485 * wait for it to initialize. 
1486 */ 1487 put_page(page); 1488 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1489 if (page) { 1490 if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1491 "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { 1492 /* should never happen */ 1493 unlock_page(page); 1494 ret = -EINVAL; 1495 goto err; 1496 } 1497 if (!PageUptodate(page)) { 1498 ret = ext4_mb_init_cache(page, NULL, gfp); 1499 if (ret) { 1500 unlock_page(page); 1501 goto err; 1502 } 1503 mb_cmp_bitmaps(e4b, page_address(page) + 1504 (poff * sb->s_blocksize)); 1505 } 1506 unlock_page(page); 1507 } 1508 } 1509 if (page == NULL) { 1510 ret = -ENOMEM; 1511 goto err; 1512 } 1513 if (!PageUptodate(page)) { 1514 ret = -EIO; 1515 goto err; 1516 } 1517 1518 /* Pages marked accessed already */ 1519 e4b->bd_bitmap_page = page; 1520 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1521 1522 block++; 1523 pnum = block / blocks_per_page; 1524 poff = block % blocks_per_page; 1525 1526 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1527 if (page == NULL || !PageUptodate(page)) { 1528 if (page) 1529 put_page(page); 1530 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1531 if (page) { 1532 if (WARN_RATELIMIT(page->mapping != inode->i_mapping, 1533 "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { 1534 /* should never happen */ 1535 unlock_page(page); 1536 ret = -EINVAL; 1537 goto err; 1538 } 1539 if (!PageUptodate(page)) { 1540 ret = ext4_mb_init_cache(page, e4b->bd_bitmap, 1541 gfp); 1542 if (ret) { 1543 unlock_page(page); 1544 goto err; 1545 } 1546 } 1547 unlock_page(page); 1548 } 1549 } 1550 if (page == NULL) { 1551 ret = -ENOMEM; 1552 goto err; 1553 } 1554 if (!PageUptodate(page)) { 1555 ret = -EIO; 1556 goto err; 1557 } 1558 1559 /* Pages marked accessed already */ 1560 e4b->bd_buddy_page = page; 1561 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1562 1563 return 0; 1564 1565 err: 1566 if (page) 1567 put_page(page); 1568 if (e4b->bd_bitmap_page) 1569 put_page(e4b->bd_bitmap_page); 1570 1571 e4b->bd_buddy = NULL; 1572 e4b->bd_bitmap = NULL; 1573 return ret; 1574 } 1575 1576 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1577 struct ext4_buddy *e4b) 1578 { 1579 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); 1580 } 1581 1582 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1583 { 1584 if (e4b->bd_bitmap_page) 1585 put_page(e4b->bd_bitmap_page); 1586 if (e4b->bd_buddy_page) 1587 put_page(e4b->bd_buddy_page); 1588 } 1589 1590 1591 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1592 { 1593 int order = 1, max; 1594 void *bb; 1595 1596 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1597 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1598 1599 while (order <= e4b->bd_blkbits + 1) { 1600 bb = mb_find_buddy(e4b, order, &max); 1601 if (!mb_test_bit(block >> order, bb)) { 1602 /* this block is part of buddy of order 'order' */ 1603 return order; 1604 } 1605 order++; 1606 } 1607 return 0; 1608 } 1609 1610 static void mb_clear_bits(void *bm, int cur, int len) 1611 { 1612 __u32 *addr; 1613 1614 len = cur + len; 1615 while (cur < len) { 1616 if ((cur & 31) == 0 && (len - cur) >= 32) { 1617 /* fast path: clear whole word at once */ 1618 addr = bm + (cur >> 3); 1619 *addr = 0; 1620 cur += 32; 1621 continue; 1622 } 1623 mb_clear_bit(cur, bm); 1624 cur++; 1625 } 1626 } 1627 1628 /* clear bits in given range 1629 * will return first found zero bit if any, -1 otherwise 1630 */ 1631 static int 
mb_test_and_clear_bits(void *bm, int cur, int len) 1632 { 1633 __u32 *addr; 1634 int zero_bit = -1; 1635 1636 len = cur + len; 1637 while (cur < len) { 1638 if ((cur & 31) == 0 && (len - cur) >= 32) { 1639 /* fast path: clear whole word at once */ 1640 addr = bm + (cur >> 3); 1641 if (*addr != (__u32)(-1) && zero_bit == -1) 1642 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 1643 *addr = 0; 1644 cur += 32; 1645 continue; 1646 } 1647 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) 1648 zero_bit = cur; 1649 cur++; 1650 } 1651 1652 return zero_bit; 1653 } 1654 1655 void mb_set_bits(void *bm, int cur, int len) 1656 { 1657 __u32 *addr; 1658 1659 len = cur + len; 1660 while (cur < len) { 1661 if ((cur & 31) == 0 && (len - cur) >= 32) { 1662 /* fast path: set whole word at once */ 1663 addr = bm + (cur >> 3); 1664 *addr = 0xffffffff; 1665 cur += 32; 1666 continue; 1667 } 1668 mb_set_bit(cur, bm); 1669 cur++; 1670 } 1671 } 1672 1673 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) 1674 { 1675 if (mb_test_bit(*bit + side, bitmap)) { 1676 mb_clear_bit(*bit, bitmap); 1677 (*bit) -= side; 1678 return 1; 1679 } 1680 else { 1681 (*bit) += side; 1682 mb_set_bit(*bit, bitmap); 1683 return -1; 1684 } 1685 } 1686 1687 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) 1688 { 1689 int max; 1690 int order = 1; 1691 void *buddy = mb_find_buddy(e4b, order, &max); 1692 1693 while (buddy) { 1694 void *buddy2; 1695 1696 /* Bits in range [first; last] are known to be set since 1697 * corresponding blocks were allocated. Bits in range 1698 * (first; last) will stay set because they form buddies on 1699 * upper layer. We just deal with borders if they don't 1700 * align with upper layer and then go up. 1701 * Releasing entire group is all about clearing 1702 * single bit of highest order buddy. 1703 */ 1704 1705 /* Example: 1706 * --------------------------------- 1707 * | 1 | 1 | 1 | 1 | 1708 * --------------------------------- 1709 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1710 * --------------------------------- 1711 * 0 1 2 3 4 5 6 7 1712 * \_____________________/ 1713 * 1714 * Neither [1] nor [6] is aligned to above layer. 1715 * Left neighbour [0] is free, so mark it busy, 1716 * decrease bb_counters and extend range to 1717 * [0; 6] 1718 * Right neighbour [7] is busy. It can't be coaleasced with [6], so 1719 * mark [6] free, increase bb_counters and shrink range to 1720 * [0; 5]. 1721 * Then shift range to [0; 2], go up and do the same. 1722 */ 1723 1724 1725 if (first & 1) 1726 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 1727 if (!(last & 1)) 1728 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 1729 if (first > last) 1730 break; 1731 order++; 1732 1733 buddy2 = mb_find_buddy(e4b, order, &max); 1734 if (!buddy2) { 1735 mb_clear_bits(buddy, first, last - first + 1); 1736 e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1737 break; 1738 } 1739 first >>= 1; 1740 last >>= 1; 1741 buddy = buddy2; 1742 } 1743 } 1744 1745 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1746 int first, int count) 1747 { 1748 int left_is_free = 0; 1749 int right_is_free = 0; 1750 int block; 1751 int last = first + count - 1; 1752 struct super_block *sb = e4b->bd_sb; 1753 1754 if (WARN_ON(count == 0)) 1755 return; 1756 BUG_ON(last >= (sb->s_blocksize << 3)); 1757 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1758 /* Don't bother if the block group is corrupt. 
*/ 1759 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) 1760 return; 1761 1762 mb_check_buddy(e4b); 1763 mb_free_blocks_double(inode, e4b, first, count); 1764 1765 this_cpu_inc(discard_pa_seq); 1766 e4b->bd_info->bb_free += count; 1767 if (first < e4b->bd_info->bb_first_free) 1768 e4b->bd_info->bb_first_free = first; 1769 1770 /* access memory sequentially: check left neighbour, 1771 * clear range and then check right neighbour 1772 */ 1773 if (first != 0) 1774 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 1775 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 1776 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 1777 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1778 1779 if (unlikely(block != -1)) { 1780 struct ext4_sb_info *sbi = EXT4_SB(sb); 1781 ext4_fsblk_t blocknr; 1782 1783 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1784 blocknr += EXT4_C2B(sbi, block); 1785 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { 1786 ext4_grp_locked_error(sb, e4b->bd_group, 1787 inode ? inode->i_ino : 0, 1788 blocknr, 1789 "freeing already freed block (bit %u); block bitmap corrupt.", 1790 block); 1791 ext4_mark_group_bitmap_corrupted( 1792 sb, e4b->bd_group, 1793 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1794 } 1795 goto done; 1796 } 1797 1798 /* let's maintain fragments counter */ 1799 if (left_is_free && right_is_free) 1800 e4b->bd_info->bb_fragments--; 1801 else if (!left_is_free && !right_is_free) 1802 e4b->bd_info->bb_fragments++; 1803 1804 /* buddy[0] == bd_bitmap is a special case, so handle 1805 * it right away and let mb_buddy_mark_free stay free of 1806 * zero order checks. 1807 * Check if neighbours are to be coaleasced, 1808 * adjust bitmap bb_counters and borders appropriately. 1809 */ 1810 if (first & 1) { 1811 first += !left_is_free; 1812 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; 1813 } 1814 if (!(last & 1)) { 1815 last -= !right_is_free; 1816 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; 1817 } 1818 1819 if (first <= last) 1820 mb_buddy_mark_free(e4b, first >> 1, last >> 1); 1821 1822 done: 1823 mb_set_largest_free_order(sb, e4b->bd_info); 1824 mb_update_avg_fragment_size(sb, e4b->bd_info); 1825 mb_check_buddy(e4b); 1826 } 1827 1828 static int mb_find_extent(struct ext4_buddy *e4b, int block, 1829 int needed, struct ext4_free_extent *ex) 1830 { 1831 int next = block; 1832 int max, order; 1833 void *buddy; 1834 1835 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1836 BUG_ON(ex == NULL); 1837 1838 buddy = mb_find_buddy(e4b, 0, &max); 1839 BUG_ON(buddy == NULL); 1840 BUG_ON(block >= max); 1841 if (mb_test_bit(block, buddy)) { 1842 ex->fe_len = 0; 1843 ex->fe_start = 0; 1844 ex->fe_group = 0; 1845 return 0; 1846 } 1847 1848 /* find actual order */ 1849 order = mb_find_order_for_block(e4b, block); 1850 block = block >> order; 1851 1852 ex->fe_len = 1 << order; 1853 ex->fe_start = block << order; 1854 ex->fe_group = e4b->bd_group; 1855 1856 /* calc difference from given start */ 1857 next = next - ex->fe_start; 1858 ex->fe_len -= next; 1859 ex->fe_start += next; 1860 1861 while (needed > ex->fe_len && 1862 mb_find_buddy(e4b, order, &max)) { 1863 1864 if (block + 1 >= max) 1865 break; 1866 1867 next = (block + 1) * (1 << order); 1868 if (mb_test_bit(next, e4b->bd_bitmap)) 1869 break; 1870 1871 order = mb_find_order_for_block(e4b, next); 1872 1873 block = next >> order; 1874 ex->fe_len += 1 << order; 1875 } 1876 1877 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { 1878 /* Should never happen! 
(but apparently sometimes does?!?) */ 1879 WARN_ON(1); 1880 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, 1881 "corruption or bug in mb_find_extent " 1882 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", 1883 block, order, needed, ex->fe_group, ex->fe_start, 1884 ex->fe_len, ex->fe_logical); 1885 ex->fe_len = 0; 1886 ex->fe_start = 0; 1887 ex->fe_group = 0; 1888 } 1889 return ex->fe_len; 1890 } 1891 1892 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1893 { 1894 int ord; 1895 int mlen = 0; 1896 int max = 0; 1897 int cur; 1898 int start = ex->fe_start; 1899 int len = ex->fe_len; 1900 unsigned ret = 0; 1901 int len0 = len; 1902 void *buddy; 1903 bool split = false; 1904 1905 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1906 BUG_ON(e4b->bd_group != ex->fe_group); 1907 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1908 mb_check_buddy(e4b); 1909 mb_mark_used_double(e4b, start, len); 1910 1911 this_cpu_inc(discard_pa_seq); 1912 e4b->bd_info->bb_free -= len; 1913 if (e4b->bd_info->bb_first_free == start) 1914 e4b->bd_info->bb_first_free += len; 1915 1916 /* let's maintain fragments counter */ 1917 if (start != 0) 1918 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 1919 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1920 max = !mb_test_bit(start + len, e4b->bd_bitmap); 1921 if (mlen && max) 1922 e4b->bd_info->bb_fragments++; 1923 else if (!mlen && !max) 1924 e4b->bd_info->bb_fragments--; 1925 1926 /* let's maintain buddy itself */ 1927 while (len) { 1928 if (!split) 1929 ord = mb_find_order_for_block(e4b, start); 1930 1931 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1932 /* the whole chunk may be allocated at once! */ 1933 mlen = 1 << ord; 1934 if (!split) 1935 buddy = mb_find_buddy(e4b, ord, &max); 1936 else 1937 split = false; 1938 BUG_ON((start >> ord) >= max); 1939 mb_set_bit(start >> ord, buddy); 1940 e4b->bd_info->bb_counters[ord]--; 1941 start += mlen; 1942 len -= mlen; 1943 BUG_ON(len < 0); 1944 continue; 1945 } 1946 1947 /* store for history */ 1948 if (ret == 0) 1949 ret = len | (ord << 16); 1950 1951 /* we have to split large buddy */ 1952 BUG_ON(ord <= 0); 1953 buddy = mb_find_buddy(e4b, ord, &max); 1954 mb_set_bit(start >> ord, buddy); 1955 e4b->bd_info->bb_counters[ord]--; 1956 1957 ord--; 1958 cur = (start >> ord) & ~1U; 1959 buddy = mb_find_buddy(e4b, ord, &max); 1960 mb_clear_bit(cur, buddy); 1961 mb_clear_bit(cur + 1, buddy); 1962 e4b->bd_info->bb_counters[ord]++; 1963 e4b->bd_info->bb_counters[ord]++; 1964 split = true; 1965 } 1966 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1967 1968 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 1969 mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1970 mb_check_buddy(e4b); 1971 1972 return ret; 1973 } 1974 1975 /* 1976 * Must be called under group lock! 
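 *
 * A rough sketch of the calling pattern (see ext4_mb_regular_allocator()):
 *
 *	ext4_lock_group(sb, group);
 *	ext4_mb_simple_scan_group(ac, &e4b);
 *	ext4_unlock_group(sb, group);
 *
 * where the scan helpers call ext4_mb_use_best_found() once a suitable
 * extent has been selected.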
1977 */ 1978 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1979 struct ext4_buddy *e4b) 1980 { 1981 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1982 int ret; 1983 1984 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1985 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1986 1987 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1988 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1989 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1990 1991 /* preallocation can change ac_b_ex, thus we store actually 1992 * allocated blocks for history */ 1993 ac->ac_f_ex = ac->ac_b_ex; 1994 1995 ac->ac_status = AC_STATUS_FOUND; 1996 ac->ac_tail = ret & 0xffff; 1997 ac->ac_buddy = ret >> 16; 1998 1999 /* 2000 * take the page reference. We want the page to be pinned 2001 * so that we don't get a ext4_mb_init_cache_call for this 2002 * group until we update the bitmap. That would mean we 2003 * double allocate blocks. The reference is dropped 2004 * in ext4_mb_release_context 2005 */ 2006 ac->ac_bitmap_page = e4b->bd_bitmap_page; 2007 get_page(ac->ac_bitmap_page); 2008 ac->ac_buddy_page = e4b->bd_buddy_page; 2009 get_page(ac->ac_buddy_page); 2010 /* store last allocated for subsequent stream allocation */ 2011 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2012 spin_lock(&sbi->s_md_lock); 2013 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 2014 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 2015 spin_unlock(&sbi->s_md_lock); 2016 } 2017 /* 2018 * As we've just preallocated more space than 2019 * user requested originally, we store allocated 2020 * space in a special descriptor. 2021 */ 2022 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 2023 ext4_mb_new_preallocation(ac); 2024 2025 } 2026 2027 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 2028 struct ext4_buddy *e4b, 2029 int finish_group) 2030 { 2031 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2032 struct ext4_free_extent *bex = &ac->ac_b_ex; 2033 struct ext4_free_extent *gex = &ac->ac_g_ex; 2034 2035 if (ac->ac_status == AC_STATUS_FOUND) 2036 return; 2037 /* 2038 * We don't want to scan for a whole year 2039 */ 2040 if (ac->ac_found > sbi->s_mb_max_to_scan && 2041 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2042 ac->ac_status = AC_STATUS_BREAK; 2043 return; 2044 } 2045 2046 /* 2047 * Haven't found good chunk so far, let's continue 2048 */ 2049 if (bex->fe_len < gex->fe_len) 2050 return; 2051 2052 if (finish_group) 2053 ext4_mb_use_best_found(ac, e4b); 2054 } 2055 2056 /* 2057 * The routine checks whether found extent is good enough. If it is, 2058 * then the extent gets marked used and flag is set to the context 2059 * to stop scanning. Otherwise, the extent is compared with the 2060 * previous found extent and if new one is better, then it's stored 2061 * in the context. Later, the best found extent will be used, if 2062 * mballoc can't find good enough extent. 2063 * 2064 * FIXME: real allocation policy is to be designed yet! 
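 *
 * For example, with a goal (gex) of 8 clusters: a 5-cluster extent is
 * remembered as the current best, a later 16-cluster extent replaces it
 * (the request is not yet satisfied, so bigger is better), an exact
 * 8-cluster extent is used immediately, and once the best extent already
 * satisfies the goal, a smaller extent that still satisfies it is
 * preferred.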
2065 */ 2066 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 2067 struct ext4_free_extent *ex, 2068 struct ext4_buddy *e4b) 2069 { 2070 struct ext4_free_extent *bex = &ac->ac_b_ex; 2071 struct ext4_free_extent *gex = &ac->ac_g_ex; 2072 2073 BUG_ON(ex->fe_len <= 0); 2074 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2075 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2076 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 2077 2078 ac->ac_found++; 2079 2080 /* 2081 * The special case - take what you catch first 2082 */ 2083 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2084 *bex = *ex; 2085 ext4_mb_use_best_found(ac, e4b); 2086 return; 2087 } 2088 2089 /* 2090 * Let's check whether the chuck is good enough 2091 */ 2092 if (ex->fe_len == gex->fe_len) { 2093 *bex = *ex; 2094 ext4_mb_use_best_found(ac, e4b); 2095 return; 2096 } 2097 2098 /* 2099 * If this is first found extent, just store it in the context 2100 */ 2101 if (bex->fe_len == 0) { 2102 *bex = *ex; 2103 return; 2104 } 2105 2106 /* 2107 * If new found extent is better, store it in the context 2108 */ 2109 if (bex->fe_len < gex->fe_len) { 2110 /* if the request isn't satisfied, any found extent 2111 * larger than previous best one is better */ 2112 if (ex->fe_len > bex->fe_len) 2113 *bex = *ex; 2114 } else if (ex->fe_len > gex->fe_len) { 2115 /* if the request is satisfied, then we try to find 2116 * an extent that still satisfy the request, but is 2117 * smaller than previous one */ 2118 if (ex->fe_len < bex->fe_len) 2119 *bex = *ex; 2120 } 2121 2122 ext4_mb_check_limits(ac, e4b, 0); 2123 } 2124 2125 static noinline_for_stack 2126 void ext4_mb_try_best_found(struct ext4_allocation_context *ac, 2127 struct ext4_buddy *e4b) 2128 { 2129 struct ext4_free_extent ex = ac->ac_b_ex; 2130 ext4_group_t group = ex.fe_group; 2131 int max; 2132 int err; 2133 2134 BUG_ON(ex.fe_len <= 0); 2135 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2136 if (err) 2137 return; 2138 2139 ext4_lock_group(ac->ac_sb, group); 2140 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); 2141 2142 if (max > 0) { 2143 ac->ac_b_ex = ex; 2144 ext4_mb_use_best_found(ac, e4b); 2145 } 2146 2147 ext4_unlock_group(ac->ac_sb, group); 2148 ext4_mb_unload_buddy(e4b); 2149 } 2150 2151 static noinline_for_stack 2152 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 2153 struct ext4_buddy *e4b) 2154 { 2155 ext4_group_t group = ac->ac_g_ex.fe_group; 2156 int max; 2157 int err; 2158 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2159 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2160 struct ext4_free_extent ex; 2161 2162 if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) 2163 return 0; 2164 if (grp->bb_free == 0) 2165 return 0; 2166 2167 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2168 if (err) 2169 return err; 2170 2171 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { 2172 ext4_mb_unload_buddy(e4b); 2173 return 0; 2174 } 2175 2176 ext4_lock_group(ac->ac_sb, group); 2177 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 2178 ac->ac_g_ex.fe_len, &ex); 2179 ex.fe_logical = 0xDEADFA11; /* debug value */ 2180 2181 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 2182 ext4_fsblk_t start; 2183 2184 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + 2185 ex.fe_start; 2186 /* use do_div to get remainder (would be 64-bit modulo) */ 2187 if (do_div(start, sbi->s_stripe) == 0) { 2188 ac->ac_found++; 2189 ac->ac_b_ex = ex; 2190 
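			/* e.g. with s_stripe == 16 the goal extent is taken
			 * here only when its physical start is a multiple of
			 * 16 blocks, so the allocation stays stripe-aligned */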
ext4_mb_use_best_found(ac, e4b); 2191 } 2192 } else if (max >= ac->ac_g_ex.fe_len) { 2193 BUG_ON(ex.fe_len <= 0); 2194 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2195 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2196 ac->ac_found++; 2197 ac->ac_b_ex = ex; 2198 ext4_mb_use_best_found(ac, e4b); 2199 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 2200 /* Sometimes, caller may want to merge even small 2201 * number of blocks to an existing extent */ 2202 BUG_ON(ex.fe_len <= 0); 2203 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2204 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2205 ac->ac_found++; 2206 ac->ac_b_ex = ex; 2207 ext4_mb_use_best_found(ac, e4b); 2208 } 2209 ext4_unlock_group(ac->ac_sb, group); 2210 ext4_mb_unload_buddy(e4b); 2211 2212 return 0; 2213 } 2214 2215 /* 2216 * The routine scans buddy structures (not bitmap!) from given order 2217 * to max order and tries to find big enough chunk to satisfy the req 2218 */ 2219 static noinline_for_stack 2220 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 2221 struct ext4_buddy *e4b) 2222 { 2223 struct super_block *sb = ac->ac_sb; 2224 struct ext4_group_info *grp = e4b->bd_info; 2225 void *buddy; 2226 int i; 2227 int k; 2228 int max; 2229 2230 BUG_ON(ac->ac_2order <= 0); 2231 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { 2232 if (grp->bb_counters[i] == 0) 2233 continue; 2234 2235 buddy = mb_find_buddy(e4b, i, &max); 2236 if (WARN_RATELIMIT(buddy == NULL, 2237 "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) 2238 continue; 2239 2240 k = mb_find_next_zero_bit(buddy, max, 0); 2241 if (k >= max) { 2242 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, 2243 "%d free clusters of order %d. But found 0", 2244 grp->bb_counters[i], i); 2245 ext4_mark_group_bitmap_corrupted(ac->ac_sb, 2246 e4b->bd_group, 2247 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2248 break; 2249 } 2250 ac->ac_found++; 2251 2252 ac->ac_b_ex.fe_len = 1 << i; 2253 ac->ac_b_ex.fe_start = k << i; 2254 ac->ac_b_ex.fe_group = e4b->bd_group; 2255 2256 ext4_mb_use_best_found(ac, e4b); 2257 2258 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); 2259 2260 if (EXT4_SB(sb)->s_mb_stats) 2261 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 2262 2263 break; 2264 } 2265 } 2266 2267 /* 2268 * The routine scans the group and measures all found extents. 2269 * In order to optimize scanning, caller must pass number of 2270 * free blocks in the group, so the routine can know upper limit. 2271 */ 2272 static noinline_for_stack 2273 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 2274 struct ext4_buddy *e4b) 2275 { 2276 struct super_block *sb = ac->ac_sb; 2277 void *bitmap = e4b->bd_bitmap; 2278 struct ext4_free_extent ex; 2279 int i; 2280 int free; 2281 2282 free = e4b->bd_info->bb_free; 2283 if (WARN_ON(free <= 0)) 2284 return; 2285 2286 i = e4b->bd_info->bb_first_free; 2287 2288 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 2289 i = mb_find_next_zero_bit(bitmap, 2290 EXT4_CLUSTERS_PER_GROUP(sb), i); 2291 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { 2292 /* 2293 * IF we have corrupt bitmap, we won't find any 2294 * free blocks even though group info says we 2295 * have free blocks 2296 */ 2297 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2298 "%d free clusters as per " 2299 "group info. 
But bitmap says 0", 2300 free); 2301 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2302 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2303 break; 2304 } 2305 2306 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); 2307 if (WARN_ON(ex.fe_len <= 0)) 2308 break; 2309 if (free < ex.fe_len) { 2310 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2311 "%d free clusters as per " 2312 "group info. But got %d blocks", 2313 free, ex.fe_len); 2314 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2315 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2316 /* 2317 * The number of free blocks differs. This mostly 2318 * indicate that the bitmap is corrupt. So exit 2319 * without claiming the space. 2320 */ 2321 break; 2322 } 2323 ex.fe_logical = 0xDEADC0DE; /* debug value */ 2324 ext4_mb_measure_extent(ac, &ex, e4b); 2325 2326 i += ex.fe_len; 2327 free -= ex.fe_len; 2328 } 2329 2330 ext4_mb_check_limits(ac, e4b, 1); 2331 } 2332 2333 /* 2334 * This is a special case for storages like raid5 2335 * we try to find stripe-aligned chunks for stripe-size-multiple requests 2336 */ 2337 static noinline_for_stack 2338 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 2339 struct ext4_buddy *e4b) 2340 { 2341 struct super_block *sb = ac->ac_sb; 2342 struct ext4_sb_info *sbi = EXT4_SB(sb); 2343 void *bitmap = e4b->bd_bitmap; 2344 struct ext4_free_extent ex; 2345 ext4_fsblk_t first_group_block; 2346 ext4_fsblk_t a; 2347 ext4_grpblk_t i; 2348 int max; 2349 2350 BUG_ON(sbi->s_stripe == 0); 2351 2352 /* find first stripe-aligned block in group */ 2353 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); 2354 2355 a = first_group_block + sbi->s_stripe - 1; 2356 do_div(a, sbi->s_stripe); 2357 i = (a * sbi->s_stripe) - first_group_block; 2358 2359 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 2360 if (!mb_test_bit(i, bitmap)) { 2361 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 2362 if (max >= sbi->s_stripe) { 2363 ac->ac_found++; 2364 ex.fe_logical = 0xDEADF00D; /* debug value */ 2365 ac->ac_b_ex = ex; 2366 ext4_mb_use_best_found(ac, e4b); 2367 break; 2368 } 2369 } 2370 i += sbi->s_stripe; 2371 } 2372 } 2373 2374 /* 2375 * This is also called BEFORE we load the buddy bitmap. 2376 * Returns either 1 or 0 indicating that the group is either suitable 2377 * for the allocation or not. 
2378 */ 2379 static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 2380 ext4_group_t group, int cr) 2381 { 2382 ext4_grpblk_t free, fragments; 2383 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 2384 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2385 2386 BUG_ON(cr < 0 || cr >= 4); 2387 2388 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 2389 return false; 2390 2391 free = grp->bb_free; 2392 if (free == 0) 2393 return false; 2394 2395 fragments = grp->bb_fragments; 2396 if (fragments == 0) 2397 return false; 2398 2399 switch (cr) { 2400 case 0: 2401 BUG_ON(ac->ac_2order == 0); 2402 2403 /* Avoid using the first bg of a flexgroup for data files */ 2404 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 2405 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 2406 ((group % flex_size) == 0)) 2407 return false; 2408 2409 if (free < ac->ac_g_ex.fe_len) 2410 return false; 2411 2412 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) 2413 return true; 2414 2415 if (grp->bb_largest_free_order < ac->ac_2order) 2416 return false; 2417 2418 return true; 2419 case 1: 2420 if ((free / fragments) >= ac->ac_g_ex.fe_len) 2421 return true; 2422 break; 2423 case 2: 2424 if (free >= ac->ac_g_ex.fe_len) 2425 return true; 2426 break; 2427 case 3: 2428 return true; 2429 default: 2430 BUG(); 2431 } 2432 2433 return false; 2434 } 2435 2436 /* 2437 * This could return negative error code if something goes wrong 2438 * during ext4_mb_init_group(). This should not be called with 2439 * ext4_lock_group() held. 2440 * 2441 * Note: because we are conditionally operating with the group lock in 2442 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this 2443 * function using __acquire and __release. This means we need to be 2444 * super careful before messing with the error path handling via "goto 2445 * out"! 2446 */ 2447 static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, 2448 ext4_group_t group, int cr) 2449 { 2450 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2451 struct super_block *sb = ac->ac_sb; 2452 struct ext4_sb_info *sbi = EXT4_SB(sb); 2453 bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; 2454 ext4_grpblk_t free; 2455 int ret = 0; 2456 2457 if (sbi->s_mb_stats) 2458 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); 2459 if (should_lock) { 2460 ext4_lock_group(sb, group); 2461 __release(ext4_group_lock_ptr(sb, group)); 2462 } 2463 free = grp->bb_free; 2464 if (free == 0) 2465 goto out; 2466 if (cr <= 2 && free < ac->ac_g_ex.fe_len) 2467 goto out; 2468 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 2469 goto out; 2470 if (should_lock) { 2471 __acquire(ext4_group_lock_ptr(sb, group)); 2472 ext4_unlock_group(sb, group); 2473 } 2474 2475 /* We only do this if the grp has never been initialized */ 2476 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2477 struct ext4_group_desc *gdp = 2478 ext4_get_group_desc(sb, group, NULL); 2479 int ret; 2480 2481 /* cr=0/1 is a very optimistic search to find large 2482 * good chunks almost for free. If buddy data is not 2483 * ready, then this optimization makes no sense. But 2484 * we never skip the first block group in a flex_bg, 2485 * since this gets used for metadata block allocation, 2486 * and we want to make sure we locate metadata blocks 2487 * in the first block group in the flex_bg if possible. 
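		 *
		 * For example, with s_log_groups_per_flex == 4 (16 groups per
		 * flex_bg), groups 0, 16, 32, ... are still initialized here
		 * even at cr=0/1, while the other groups of a flex_bg are
		 * skipped until cr >= 2 unless they are marked BLOCK_UNINIT
		 * (which is cheap to initialize).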
2488 */ 2489 if (cr < 2 && 2490 (!sbi->s_log_groups_per_flex || 2491 ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && 2492 !(ext4_has_group_desc_csum(sb) && 2493 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) 2494 return 0; 2495 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 2496 if (ret) 2497 return ret; 2498 } 2499 2500 if (should_lock) { 2501 ext4_lock_group(sb, group); 2502 __release(ext4_group_lock_ptr(sb, group)); 2503 } 2504 ret = ext4_mb_good_group(ac, group, cr); 2505 out: 2506 if (should_lock) { 2507 __acquire(ext4_group_lock_ptr(sb, group)); 2508 ext4_unlock_group(sb, group); 2509 } 2510 return ret; 2511 } 2512 2513 /* 2514 * Start prefetching @nr block bitmaps starting at @group. 2515 * Return the next group which needs to be prefetched. 2516 */ 2517 ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, 2518 unsigned int nr, int *cnt) 2519 { 2520 ext4_group_t ngroups = ext4_get_groups_count(sb); 2521 struct buffer_head *bh; 2522 struct blk_plug plug; 2523 2524 blk_start_plug(&plug); 2525 while (nr-- > 0) { 2526 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, 2527 NULL); 2528 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 2529 2530 /* 2531 * Prefetch block groups with free blocks; but don't 2532 * bother if it is marked uninitialized on disk, since 2533 * it won't require I/O to read. Also only try to 2534 * prefetch once, so we avoid getblk() call, which can 2535 * be expensive. 2536 */ 2537 if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && 2538 EXT4_MB_GRP_NEED_INIT(grp) && 2539 ext4_free_group_clusters(sb, gdp) > 0 && 2540 !(ext4_has_group_desc_csum(sb) && 2541 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { 2542 bh = ext4_read_block_bitmap_nowait(sb, group, true); 2543 if (bh && !IS_ERR(bh)) { 2544 if (!buffer_uptodate(bh) && cnt) 2545 (*cnt)++; 2546 brelse(bh); 2547 } 2548 } 2549 if (++group >= ngroups) 2550 group = 0; 2551 } 2552 blk_finish_plug(&plug); 2553 return group; 2554 } 2555 2556 /* 2557 * Prefetching reads the block bitmap into the buffer cache; but we 2558 * need to make sure that the buddy bitmap in the page cache has been 2559 * initialized. Note that ext4_mb_init_group() will block if the I/O 2560 * is not yet completed, or indeed if it was not initiated by 2561 * ext4_mb_prefetch did not start the I/O. 2562 * 2563 * TODO: We should actually kick off the buddy bitmap setup in a work 2564 * queue when the buffer I/O is completed, so that we don't block 2565 * waiting for the block allocation bitmap read to finish when 2566 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
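 *
 * The pairing with ext4_mb_prefetch() in the caller looks roughly like:
 *
 *	nr = sbi->s_mb_prefetch;
 *	prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
 *	... scan the groups ...
 *	if (nr)
 *		ext4_mb_prefetch_fini(sb, prefetch_grp, nr);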
 */
void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
			   unsigned int nr)
{
	struct ext4_group_desc *gdp;
	struct ext4_group_info *grp;

	while (nr-- > 0) {
		if (!group)
			group = ext4_get_groups_count(sb);
		group--;
		gdp = ext4_get_group_desc(sb, group, NULL);
		grp = ext4_get_group_info(sb, group);

		if (EXT4_MB_GRP_NEED_INIT(grp) &&
		    ext4_free_group_clusters(sb, gdp) > 0 &&
		    !(ext4_has_group_desc_csum(sb) &&
		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
			if (ext4_mb_init_group(sb, group, GFP_NOFS))
				break;
		}
	}
}

static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
	ext4_group_t prefetch_grp = 0, ngroups, group, i;
	int cr = -1, new_cr;
	int err = 0, first_err = 0;
	unsigned int nr = 0, prefetch_ios = 0;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	struct ext4_buddy e4b;
	int lost;

	sb = ac->ac_sb;
	sbi = EXT4_SB(sb);
	ngroups = ext4_get_groups_count(sb);
	/* non-extent files are limited to low blocks/groups */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
		ngroups = sbi->s_blockfile_groups;

	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	/* first, try the goal */
	err = ext4_mb_find_by_goal(ac, &e4b);
	if (err || ac->ac_status == AC_STATUS_FOUND)
		goto out;

	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		goto out;

	/*
	 * ac->ac_2order is set only if the fe_len is a power of 2;
	 * if ac->ac_2order is set we also set the criteria to 0 so that we
	 * try exact allocation using the buddy.
	 */
	i = fls(ac->ac_g_ex.fe_len);
	ac->ac_2order = 0;
	/*
	 * We search using buddy data only if the order of the request
	 * is greater than or equal to sbi->s_mb_order2_reqs.
	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
	 * We also support searching for power-of-two requests only for
	 * requests up to the maximum buddy size we have constructed.
	 */
	if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
		/*
		 * This should tell if fe_len is exactly a power of 2
		 */
		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
			ac->ac_2order = array_index_nospec(i - 1,
							   MB_NUM_ORDERS(sb));
	}

	/* if stream allocation is enabled, use the global goal */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		/* TBD: may be hot point */
		spin_lock(&sbi->s_md_lock);
		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
		spin_unlock(&sbi->s_md_lock);
	}

	/* Let's just scan groups to find more or less suitable blocks */
	cr = ac->ac_2order ?
0 : 1; 2654 /* 2655 * cr == 0 try to get exact allocation, 2656 * cr == 3 try to get anything 2657 */ 2658 repeat: 2659 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2660 ac->ac_criteria = cr; 2661 /* 2662 * searching for the right group start 2663 * from the goal value specified 2664 */ 2665 group = ac->ac_g_ex.fe_group; 2666 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2667 prefetch_grp = group; 2668 2669 for (i = 0, new_cr = cr; i < ngroups; i++, 2670 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 2671 int ret = 0; 2672 2673 cond_resched(); 2674 if (new_cr != cr) { 2675 cr = new_cr; 2676 goto repeat; 2677 } 2678 2679 /* 2680 * Batch reads of the block allocation bitmaps 2681 * to get multiple READs in flight; limit 2682 * prefetching at cr=0/1, otherwise mballoc can 2683 * spend a lot of time loading imperfect groups 2684 */ 2685 if ((prefetch_grp == group) && 2686 (cr > 1 || 2687 prefetch_ios < sbi->s_mb_prefetch_limit)) { 2688 unsigned int curr_ios = prefetch_ios; 2689 2690 nr = sbi->s_mb_prefetch; 2691 if (ext4_has_feature_flex_bg(sb)) { 2692 nr = 1 << sbi->s_log_groups_per_flex; 2693 nr -= group & (nr - 1); 2694 nr = min(nr, sbi->s_mb_prefetch); 2695 } 2696 prefetch_grp = ext4_mb_prefetch(sb, group, 2697 nr, &prefetch_ios); 2698 if (prefetch_ios == curr_ios) 2699 nr = 0; 2700 } 2701 2702 /* This now checks without needing the buddy page */ 2703 ret = ext4_mb_good_group_nolock(ac, group, cr); 2704 if (ret <= 0) { 2705 if (!first_err) 2706 first_err = ret; 2707 continue; 2708 } 2709 2710 err = ext4_mb_load_buddy(sb, group, &e4b); 2711 if (err) 2712 goto out; 2713 2714 ext4_lock_group(sb, group); 2715 2716 /* 2717 * We need to check again after locking the 2718 * block group 2719 */ 2720 ret = ext4_mb_good_group(ac, group, cr); 2721 if (ret == 0) { 2722 ext4_unlock_group(sb, group); 2723 ext4_mb_unload_buddy(&e4b); 2724 continue; 2725 } 2726 2727 ac->ac_groups_scanned++; 2728 if (cr == 0) 2729 ext4_mb_simple_scan_group(ac, &e4b); 2730 else if (cr == 1 && sbi->s_stripe && 2731 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2732 ext4_mb_scan_aligned(ac, &e4b); 2733 else 2734 ext4_mb_complex_scan_group(ac, &e4b); 2735 2736 ext4_unlock_group(sb, group); 2737 ext4_mb_unload_buddy(&e4b); 2738 2739 if (ac->ac_status != AC_STATUS_CONTINUE) 2740 break; 2741 } 2742 /* Processed all groups and haven't found blocks */ 2743 if (sbi->s_mb_stats && i == ngroups) 2744 atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2745 } 2746 2747 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2748 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2749 /* 2750 * We've been searching too long. Let's try to allocate 2751 * the best chunk we've found so far 2752 */ 2753 ext4_mb_try_best_found(ac, &e4b); 2754 if (ac->ac_status != AC_STATUS_FOUND) { 2755 /* 2756 * Someone more lucky has already allocated it. 
2757 * The only thing we can do is just take first 2758 * found block(s) 2759 */ 2760 lost = atomic_inc_return(&sbi->s_mb_lost_chunks); 2761 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", 2762 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, 2763 ac->ac_b_ex.fe_len, lost); 2764 2765 ac->ac_b_ex.fe_group = 0; 2766 ac->ac_b_ex.fe_start = 0; 2767 ac->ac_b_ex.fe_len = 0; 2768 ac->ac_status = AC_STATUS_CONTINUE; 2769 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2770 cr = 3; 2771 goto repeat; 2772 } 2773 } 2774 2775 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2776 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2777 out: 2778 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 2779 err = first_err; 2780 2781 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", 2782 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, 2783 ac->ac_flags, cr, err); 2784 2785 if (nr) 2786 ext4_mb_prefetch_fini(sb, prefetch_grp, nr); 2787 2788 return err; 2789 } 2790 2791 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2792 { 2793 struct super_block *sb = pde_data(file_inode(seq->file)); 2794 ext4_group_t group; 2795 2796 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2797 return NULL; 2798 group = *pos + 1; 2799 return (void *) ((unsigned long) group); 2800 } 2801 2802 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2803 { 2804 struct super_block *sb = pde_data(file_inode(seq->file)); 2805 ext4_group_t group; 2806 2807 ++*pos; 2808 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2809 return NULL; 2810 group = *pos + 1; 2811 return (void *) ((unsigned long) group); 2812 } 2813 2814 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2815 { 2816 struct super_block *sb = pde_data(file_inode(seq->file)); 2817 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2818 int i; 2819 int err, buddy_loaded = 0; 2820 struct ext4_buddy e4b; 2821 struct ext4_group_info *grinfo; 2822 unsigned char blocksize_bits = min_t(unsigned char, 2823 sb->s_blocksize_bits, 2824 EXT4_MAX_BLOCK_LOG_SIZE); 2825 struct sg { 2826 struct ext4_group_info info; 2827 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; 2828 } sg; 2829 2830 group--; 2831 if (group == 0) 2832 seq_puts(seq, "#group: free frags first [" 2833 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " 2834 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); 2835 2836 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2837 sizeof(struct ext4_group_info); 2838 2839 grinfo = ext4_get_group_info(sb, group); 2840 /* Load the group info in memory only if not already loaded. */ 2841 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { 2842 err = ext4_mb_load_buddy(sb, group, &e4b); 2843 if (err) { 2844 seq_printf(seq, "#%-5u: I/O error\n", group); 2845 return 0; 2846 } 2847 buddy_loaded = 1; 2848 } 2849 2850 memcpy(&sg, ext4_get_group_info(sb, group), i); 2851 2852 if (buddy_loaded) 2853 ext4_mb_unload_buddy(&e4b); 2854 2855 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2856 sg.info.bb_fragments, sg.info.bb_first_free); 2857 for (i = 0; i <= 13; i++) 2858 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
2859 sg.info.bb_counters[i] : 0); 2860 seq_puts(seq, " ]\n"); 2861 2862 return 0; 2863 } 2864 2865 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2866 { 2867 } 2868 2869 const struct seq_operations ext4_mb_seq_groups_ops = { 2870 .start = ext4_mb_seq_groups_start, 2871 .next = ext4_mb_seq_groups_next, 2872 .stop = ext4_mb_seq_groups_stop, 2873 .show = ext4_mb_seq_groups_show, 2874 }; 2875 2876 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 2877 { 2878 struct super_block *sb = seq->private; 2879 struct ext4_sb_info *sbi = EXT4_SB(sb); 2880 2881 seq_puts(seq, "mballoc:\n"); 2882 if (!sbi->s_mb_stats) { 2883 seq_puts(seq, "\tmb stats collection turned off.\n"); 2884 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 2885 return 0; 2886 } 2887 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 2888 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 2889 2890 seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); 2891 2892 seq_puts(seq, "\tcr0_stats:\n"); 2893 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); 2894 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2895 atomic64_read(&sbi->s_bal_cX_groups_considered[0])); 2896 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2897 atomic64_read(&sbi->s_bal_cX_failed[0])); 2898 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2899 atomic_read(&sbi->s_bal_cr0_bad_suggestions)); 2900 2901 seq_puts(seq, "\tcr1_stats:\n"); 2902 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); 2903 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2904 atomic64_read(&sbi->s_bal_cX_groups_considered[1])); 2905 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2906 atomic64_read(&sbi->s_bal_cX_failed[1])); 2907 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2908 atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2909 2910 seq_puts(seq, "\tcr2_stats:\n"); 2911 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); 2912 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2913 atomic64_read(&sbi->s_bal_cX_groups_considered[2])); 2914 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2915 atomic64_read(&sbi->s_bal_cX_failed[2])); 2916 2917 seq_puts(seq, "\tcr3_stats:\n"); 2918 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); 2919 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2920 atomic64_read(&sbi->s_bal_cX_groups_considered[3])); 2921 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2922 atomic64_read(&sbi->s_bal_cX_failed[3])); 2923 seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); 2924 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 2925 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 2926 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 2927 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 2928 2929 seq_printf(seq, "\tbuddies_generated: %u/%u\n", 2930 atomic_read(&sbi->s_mb_buddies_generated), 2931 ext4_get_groups_count(sb)); 2932 seq_printf(seq, "\tbuddies_time_used: %llu\n", 2933 atomic64_read(&sbi->s_mb_generation_time)); 2934 seq_printf(seq, "\tpreallocated: %u\n", 2935 atomic_read(&sbi->s_mb_preallocated)); 2936 seq_printf(seq, "\tdiscarded: %u\n", 2937 atomic_read(&sbi->s_mb_discarded)); 2938 return 0; 2939 } 2940 2941 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 2942 __acquires(&EXT4_SB(sb)->s_mb_rb_lock) 2943 { 2944 
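	/* seq positions 1..MB_NUM_ORDERS(sb) walk s_mb_largest_free_orders,
	 * positions MB_NUM_ORDERS(sb)+1..2*MB_NUM_ORDERS(sb) walk
	 * s_mb_avg_fragment_size; see ext4_mb_seq_structs_summary_show() */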
struct super_block *sb = pde_data(file_inode(seq->file)); 2945 unsigned long position; 2946 2947 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2948 return NULL; 2949 position = *pos + 1; 2950 return (void *) ((unsigned long) position); 2951 } 2952 2953 static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 2954 { 2955 struct super_block *sb = pde_data(file_inode(seq->file)); 2956 unsigned long position; 2957 2958 ++*pos; 2959 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2960 return NULL; 2961 position = *pos + 1; 2962 return (void *) ((unsigned long) position); 2963 } 2964 2965 static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 2966 { 2967 struct super_block *sb = pde_data(file_inode(seq->file)); 2968 struct ext4_sb_info *sbi = EXT4_SB(sb); 2969 unsigned long position = ((unsigned long) v); 2970 struct ext4_group_info *grp; 2971 unsigned int count; 2972 2973 position--; 2974 if (position >= MB_NUM_ORDERS(sb)) { 2975 position -= MB_NUM_ORDERS(sb); 2976 if (position == 0) 2977 seq_puts(seq, "avg_fragment_size_lists:\n"); 2978 2979 count = 0; 2980 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 2981 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 2982 bb_avg_fragment_size_node) 2983 count++; 2984 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 2985 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 2986 (unsigned int)position, count); 2987 return 0; 2988 } 2989 2990 if (position == 0) { 2991 seq_printf(seq, "optimize_scan: %d\n", 2992 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); 2993 seq_puts(seq, "max_free_order_lists:\n"); 2994 } 2995 count = 0; 2996 read_lock(&sbi->s_mb_largest_free_orders_locks[position]); 2997 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 2998 bb_largest_free_order_node) 2999 count++; 3000 read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); 3001 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3002 (unsigned int)position, count); 3003 3004 return 0; 3005 } 3006 3007 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 3008 { 3009 } 3010 3011 const struct seq_operations ext4_mb_seq_structs_summary_ops = { 3012 .start = ext4_mb_seq_structs_summary_start, 3013 .next = ext4_mb_seq_structs_summary_next, 3014 .stop = ext4_mb_seq_structs_summary_stop, 3015 .show = ext4_mb_seq_structs_summary_show, 3016 }; 3017 3018 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 3019 { 3020 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3021 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 3022 3023 BUG_ON(!cachep); 3024 return cachep; 3025 } 3026 3027 /* 3028 * Allocate the top-level s_group_info array for the specified number 3029 * of groups 3030 */ 3031 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) 3032 { 3033 struct ext4_sb_info *sbi = EXT4_SB(sb); 3034 unsigned size; 3035 struct ext4_group_info ***old_groupinfo, ***new_groupinfo; 3036 3037 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> 3038 EXT4_DESC_PER_BLOCK_BITS(sb); 3039 if (size <= sbi->s_group_info_size) 3040 return 0; 3041 3042 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); 3043 new_groupinfo = kvzalloc(size, GFP_KERNEL); 3044 if (!new_groupinfo) { 3045 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 3046 return -ENOMEM; 3047 } 3048 rcu_read_lock(); 3049 old_groupinfo = rcu_dereference(sbi->s_group_info); 3050 if (old_groupinfo) 3051 memcpy(new_groupinfo, old_groupinfo, 3052 
sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 3053 rcu_read_unlock(); 3054 rcu_assign_pointer(sbi->s_group_info, new_groupinfo); 3055 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 3056 if (old_groupinfo) 3057 ext4_kvfree_array_rcu(old_groupinfo); 3058 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 3059 sbi->s_group_info_size); 3060 return 0; 3061 } 3062 3063 /* Create and initialize ext4_group_info data for the given group. */ 3064 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 3065 struct ext4_group_desc *desc) 3066 { 3067 int i; 3068 int metalen = 0; 3069 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); 3070 struct ext4_sb_info *sbi = EXT4_SB(sb); 3071 struct ext4_group_info **meta_group_info; 3072 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3073 3074 /* 3075 * First check if this group is the first of a reserved block. 3076 * If it's true, we have to allocate a new table of pointers 3077 * to ext4_group_info structures 3078 */ 3079 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3080 metalen = sizeof(*meta_group_info) << 3081 EXT4_DESC_PER_BLOCK_BITS(sb); 3082 meta_group_info = kmalloc(metalen, GFP_NOFS); 3083 if (meta_group_info == NULL) { 3084 ext4_msg(sb, KERN_ERR, "can't allocate mem " 3085 "for a buddy group"); 3086 return -ENOMEM; 3087 } 3088 rcu_read_lock(); 3089 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; 3090 rcu_read_unlock(); 3091 } 3092 3093 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); 3094 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 3095 3096 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); 3097 if (meta_group_info[i] == NULL) { 3098 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 3099 goto exit_group_info; 3100 } 3101 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 3102 &(meta_group_info[i]->bb_state)); 3103 3104 /* 3105 * initialize bb_free to be able to skip 3106 * empty groups without initialization 3107 */ 3108 if (ext4_has_group_desc_csum(sb) && 3109 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3110 meta_group_info[i]->bb_free = 3111 ext4_free_clusters_after_init(sb, group, desc); 3112 } else { 3113 meta_group_info[i]->bb_free = 3114 ext4_free_group_clusters(sb, desc); 3115 } 3116 3117 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3118 init_rwsem(&meta_group_info[i]->alloc_sem); 3119 meta_group_info[i]->bb_free_root = RB_ROOT; 3120 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 3121 INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); 3122 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 3123 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ 3124 meta_group_info[i]->bb_group = group; 3125 3126 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); 3127 return 0; 3128 3129 exit_group_info: 3130 /* If a meta_group_info table has been allocated, release it now */ 3131 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3132 struct ext4_group_info ***group_info; 3133 3134 rcu_read_lock(); 3135 group_info = rcu_dereference(sbi->s_group_info); 3136 kfree(group_info[idx]); 3137 group_info[idx] = NULL; 3138 rcu_read_unlock(); 3139 } 3140 return -ENOMEM; 3141 } /* ext4_mb_add_groupinfo */ 3142 3143 static int ext4_mb_init_backend(struct super_block *sb) 3144 { 3145 ext4_group_t ngroups = ext4_get_groups_count(sb); 3146 ext4_group_t i; 3147 struct ext4_sb_info *sbi = EXT4_SB(sb); 3148 int err; 3149 struct ext4_group_desc *desc; 3150 struct ext4_group_info ***group_info; 3151 struct kmem_cache 
*cachep;

	err = ext4_mb_alloc_groupinfo(sb, ngroups);
	if (err)
		return err;

	sbi->s_buddy_cache = new_inode(sb);
	if (sbi->s_buddy_cache == NULL) {
		ext4_msg(sb, KERN_ERR, "can't get new inode");
		goto err_freesgi;
	}
	/* To avoid potentially colliding with a valid on-disk inode number,
	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
	 * not in the inode hash, so it should never be found by iget(), but
	 * this will avoid confusion if it ever shows up during debugging. */
	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
	for (i = 0; i < ngroups; i++) {
		cond_resched();
		desc = ext4_get_group_desc(sb, i, NULL);
		if (desc == NULL) {
			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
			goto err_freebuddy;
		}
		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
			goto err_freebuddy;
	}

	if (ext4_has_feature_flex_bg(sb)) {
		/* a single flex group is supposed to be read by a single IO.
		 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
		 * an unsigned integer, so the maximum shift is 32.
		 */
		if (sbi->s_es->s_log_groups_per_flex >= 32) {
			ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
			goto err_freebuddy;
		}
		sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
					   BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
	} else {
		sbi->s_mb_prefetch = 32;
	}
	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
	/* How many real IOs to prefetch within a single allocation at cr=0.
	 * Given that cr=0 is a CPU-related optimization we shouldn't try to
	 * load too many groups; at some point we should start to use what
	 * we've got in memory.
3200 * with an average random access time 5ms, it'd take a second to get 3201 * 200 groups (* N with flex_bg), so let's make this limit 4 3202 */ 3203 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; 3204 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) 3205 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); 3206 3207 return 0; 3208 3209 err_freebuddy: 3210 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3211 while (i-- > 0) 3212 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 3213 i = sbi->s_group_info_size; 3214 rcu_read_lock(); 3215 group_info = rcu_dereference(sbi->s_group_info); 3216 while (i-- > 0) 3217 kfree(group_info[i]); 3218 rcu_read_unlock(); 3219 iput(sbi->s_buddy_cache); 3220 err_freesgi: 3221 rcu_read_lock(); 3222 kvfree(rcu_dereference(sbi->s_group_info)); 3223 rcu_read_unlock(); 3224 return -ENOMEM; 3225 } 3226 3227 static void ext4_groupinfo_destroy_slabs(void) 3228 { 3229 int i; 3230 3231 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 3232 kmem_cache_destroy(ext4_groupinfo_caches[i]); 3233 ext4_groupinfo_caches[i] = NULL; 3234 } 3235 } 3236 3237 static int ext4_groupinfo_create_slab(size_t size) 3238 { 3239 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 3240 int slab_size; 3241 int blocksize_bits = order_base_2(size); 3242 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3243 struct kmem_cache *cachep; 3244 3245 if (cache_index >= NR_GRPINFO_CACHES) 3246 return -EINVAL; 3247 3248 if (unlikely(cache_index < 0)) 3249 cache_index = 0; 3250 3251 mutex_lock(&ext4_grpinfo_slab_create_mutex); 3252 if (ext4_groupinfo_caches[cache_index]) { 3253 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3254 return 0; /* Already created */ 3255 } 3256 3257 slab_size = offsetof(struct ext4_group_info, 3258 bb_counters[blocksize_bits + 2]); 3259 3260 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 3261 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 3262 NULL); 3263 3264 ext4_groupinfo_caches[cache_index] = cachep; 3265 3266 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3267 if (!cachep) { 3268 printk(KERN_EMERG 3269 "EXT4-fs: no memory for groupinfo slab cache\n"); 3270 return -ENOMEM; 3271 } 3272 3273 return 0; 3274 } 3275 3276 static void ext4_discard_work(struct work_struct *work) 3277 { 3278 struct ext4_sb_info *sbi = container_of(work, 3279 struct ext4_sb_info, s_discard_work); 3280 struct super_block *sb = sbi->s_sb; 3281 struct ext4_free_data *fd, *nfd; 3282 struct ext4_buddy e4b; 3283 struct list_head discard_list; 3284 ext4_group_t grp, load_grp; 3285 int err = 0; 3286 3287 INIT_LIST_HEAD(&discard_list); 3288 spin_lock(&sbi->s_md_lock); 3289 list_splice_init(&sbi->s_discard_list, &discard_list); 3290 spin_unlock(&sbi->s_md_lock); 3291 3292 load_grp = UINT_MAX; 3293 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { 3294 /* 3295 * If filesystem is umounting or no memory or suffering 3296 * from no space, give up the discard 3297 */ 3298 if ((sb->s_flags & SB_ACTIVE) && !err && 3299 !atomic_read(&sbi->s_retry_alloc_pending)) { 3300 grp = fd->efd_group; 3301 if (grp != load_grp) { 3302 if (load_grp != UINT_MAX) 3303 ext4_mb_unload_buddy(&e4b); 3304 3305 err = ext4_mb_load_buddy(sb, grp, &e4b); 3306 if (err) { 3307 kmem_cache_free(ext4_free_data_cachep, fd); 3308 load_grp = UINT_MAX; 3309 continue; 3310 } else { 3311 load_grp = grp; 3312 } 3313 } 3314 3315 ext4_lock_group(sb, grp); 3316 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, 3317 fd->efd_start_cluster + fd->efd_count - 1, 1); 3318 ext4_unlock_group(sb, grp); 3319 } 
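		/* whether or not the range was actually trimmed above, the
		 * ext4_free_data entry has been consumed at this point */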
3320 kmem_cache_free(ext4_free_data_cachep, fd); 3321 } 3322 3323 if (load_grp != UINT_MAX) 3324 ext4_mb_unload_buddy(&e4b); 3325 } 3326 3327 int ext4_mb_init(struct super_block *sb) 3328 { 3329 struct ext4_sb_info *sbi = EXT4_SB(sb); 3330 unsigned i, j; 3331 unsigned offset, offset_incr; 3332 unsigned max; 3333 int ret; 3334 3335 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); 3336 3337 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 3338 if (sbi->s_mb_offsets == NULL) { 3339 ret = -ENOMEM; 3340 goto out; 3341 } 3342 3343 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); 3344 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 3345 if (sbi->s_mb_maxs == NULL) { 3346 ret = -ENOMEM; 3347 goto out; 3348 } 3349 3350 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 3351 if (ret < 0) 3352 goto out; 3353 3354 /* order 0 is regular bitmap */ 3355 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 3356 sbi->s_mb_offsets[0] = 0; 3357 3358 i = 1; 3359 offset = 0; 3360 offset_incr = 1 << (sb->s_blocksize_bits - 1); 3361 max = sb->s_blocksize << 2; 3362 do { 3363 sbi->s_mb_offsets[i] = offset; 3364 sbi->s_mb_maxs[i] = max; 3365 offset += offset_incr; 3366 offset_incr = offset_incr >> 1; 3367 max = max >> 1; 3368 i++; 3369 } while (i < MB_NUM_ORDERS(sb)); 3370 3371 sbi->s_mb_avg_fragment_size = 3372 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3373 GFP_KERNEL); 3374 if (!sbi->s_mb_avg_fragment_size) { 3375 ret = -ENOMEM; 3376 goto out; 3377 } 3378 sbi->s_mb_avg_fragment_size_locks = 3379 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3380 GFP_KERNEL); 3381 if (!sbi->s_mb_avg_fragment_size_locks) { 3382 ret = -ENOMEM; 3383 goto out; 3384 } 3385 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3386 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 3387 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); 3388 } 3389 sbi->s_mb_largest_free_orders = 3390 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3391 GFP_KERNEL); 3392 if (!sbi->s_mb_largest_free_orders) { 3393 ret = -ENOMEM; 3394 goto out; 3395 } 3396 sbi->s_mb_largest_free_orders_locks = 3397 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3398 GFP_KERNEL); 3399 if (!sbi->s_mb_largest_free_orders_locks) { 3400 ret = -ENOMEM; 3401 goto out; 3402 } 3403 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3404 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 3405 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 3406 } 3407 3408 spin_lock_init(&sbi->s_md_lock); 3409 sbi->s_mb_free_pending = 0; 3410 INIT_LIST_HEAD(&sbi->s_freed_data_list); 3411 INIT_LIST_HEAD(&sbi->s_discard_list); 3412 INIT_WORK(&sbi->s_discard_work, ext4_discard_work); 3413 atomic_set(&sbi->s_retry_alloc_pending, 0); 3414 3415 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 3416 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 3417 sbi->s_mb_stats = MB_DEFAULT_STATS; 3418 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 3419 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 3420 /* 3421 * The default group preallocation is 512, which for 4k block 3422 * sizes translates to 2 megabytes. However for bigalloc file 3423 * systems, this is probably too big (i.e, if the cluster size 3424 * is 1 megabyte, then group preallocation size becomes half a 3425 * gigabyte!). As a default, we will keep a two megabyte 3426 * group pralloc size for cluster sizes up to 64k, and after 3427 * that, we will force a minimum group preallocation size of 3428 * 32 clusters. 
This translates to 8 megs when the cluster 3429 * size is 256k, and 32 megs when the cluster size is 1 meg, 3430 * which seems reasonable as a default. 3431 */ 3432 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> 3433 sbi->s_cluster_bits, 32); 3434 /* 3435 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 3436 * to the lowest multiple of s_stripe which is bigger than 3437 * the s_mb_group_prealloc as determined above. We want 3438 * the preallocation size to be an exact multiple of the 3439 * RAID stripe size so that preallocations don't fragment 3440 * the stripes. 3441 */ 3442 if (sbi->s_stripe > 1) { 3443 sbi->s_mb_group_prealloc = roundup( 3444 sbi->s_mb_group_prealloc, sbi->s_stripe); 3445 } 3446 3447 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 3448 if (sbi->s_locality_groups == NULL) { 3449 ret = -ENOMEM; 3450 goto out; 3451 } 3452 for_each_possible_cpu(i) { 3453 struct ext4_locality_group *lg; 3454 lg = per_cpu_ptr(sbi->s_locality_groups, i); 3455 mutex_init(&lg->lg_mutex); 3456 for (j = 0; j < PREALLOC_TB_SIZE; j++) 3457 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 3458 spin_lock_init(&lg->lg_prealloc_lock); 3459 } 3460 3461 if (bdev_nonrot(sb->s_bdev)) 3462 sbi->s_mb_max_linear_groups = 0; 3463 else 3464 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; 3465 /* init file for buddy data */ 3466 ret = ext4_mb_init_backend(sb); 3467 if (ret != 0) 3468 goto out_free_locality_groups; 3469 3470 return 0; 3471 3472 out_free_locality_groups: 3473 free_percpu(sbi->s_locality_groups); 3474 sbi->s_locality_groups = NULL; 3475 out: 3476 kfree(sbi->s_mb_avg_fragment_size); 3477 kfree(sbi->s_mb_avg_fragment_size_locks); 3478 kfree(sbi->s_mb_largest_free_orders); 3479 kfree(sbi->s_mb_largest_free_orders_locks); 3480 kfree(sbi->s_mb_offsets); 3481 sbi->s_mb_offsets = NULL; 3482 kfree(sbi->s_mb_maxs); 3483 sbi->s_mb_maxs = NULL; 3484 return ret; 3485 } 3486 3487 /* need to called with the ext4 group lock held */ 3488 static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) 3489 { 3490 struct ext4_prealloc_space *pa; 3491 struct list_head *cur, *tmp; 3492 int count = 0; 3493 3494 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 3495 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3496 list_del(&pa->pa_group_list); 3497 count++; 3498 kmem_cache_free(ext4_pspace_cachep, pa); 3499 } 3500 return count; 3501 } 3502 3503 int ext4_mb_release(struct super_block *sb) 3504 { 3505 ext4_group_t ngroups = ext4_get_groups_count(sb); 3506 ext4_group_t i; 3507 int num_meta_group_infos; 3508 struct ext4_group_info *grinfo, ***group_info; 3509 struct ext4_sb_info *sbi = EXT4_SB(sb); 3510 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3511 int count; 3512 3513 if (test_opt(sb, DISCARD)) { 3514 /* 3515 * wait the discard work to drain all of ext4_free_data 3516 */ 3517 flush_work(&sbi->s_discard_work); 3518 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); 3519 } 3520 3521 if (sbi->s_group_info) { 3522 for (i = 0; i < ngroups; i++) { 3523 cond_resched(); 3524 grinfo = ext4_get_group_info(sb, i); 3525 mb_group_bb_bitmap_free(grinfo); 3526 ext4_lock_group(sb, i); 3527 count = ext4_mb_cleanup_pa(grinfo); 3528 if (count) 3529 mb_debug(sb, "mballoc: %d PAs left\n", 3530 count); 3531 ext4_unlock_group(sb, i); 3532 kmem_cache_free(cachep, grinfo); 3533 } 3534 num_meta_group_infos = (ngroups + 3535 EXT4_DESC_PER_BLOCK(sb) - 1) >> 3536 EXT4_DESC_PER_BLOCK_BITS(sb); 3537 rcu_read_lock(); 3538 group_info = 
rcu_dereference(sbi->s_group_info); 3539 for (i = 0; i < num_meta_group_infos; i++) 3540 kfree(group_info[i]); 3541 kvfree(group_info); 3542 rcu_read_unlock(); 3543 } 3544 kfree(sbi->s_mb_avg_fragment_size); 3545 kfree(sbi->s_mb_avg_fragment_size_locks); 3546 kfree(sbi->s_mb_largest_free_orders); 3547 kfree(sbi->s_mb_largest_free_orders_locks); 3548 kfree(sbi->s_mb_offsets); 3549 kfree(sbi->s_mb_maxs); 3550 iput(sbi->s_buddy_cache); 3551 if (sbi->s_mb_stats) { 3552 ext4_msg(sb, KERN_INFO, 3553 "mballoc: %u blocks %u reqs (%u success)", 3554 atomic_read(&sbi->s_bal_allocated), 3555 atomic_read(&sbi->s_bal_reqs), 3556 atomic_read(&sbi->s_bal_success)); 3557 ext4_msg(sb, KERN_INFO, 3558 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " 3559 "%u 2^N hits, %u breaks, %u lost", 3560 atomic_read(&sbi->s_bal_ex_scanned), 3561 atomic_read(&sbi->s_bal_groups_scanned), 3562 atomic_read(&sbi->s_bal_goals), 3563 atomic_read(&sbi->s_bal_2orders), 3564 atomic_read(&sbi->s_bal_breaks), 3565 atomic_read(&sbi->s_mb_lost_chunks)); 3566 ext4_msg(sb, KERN_INFO, 3567 "mballoc: %u generated and it took %llu", 3568 atomic_read(&sbi->s_mb_buddies_generated), 3569 atomic64_read(&sbi->s_mb_generation_time)); 3570 ext4_msg(sb, KERN_INFO, 3571 "mballoc: %u preallocated, %u discarded", 3572 atomic_read(&sbi->s_mb_preallocated), 3573 atomic_read(&sbi->s_mb_discarded)); 3574 } 3575 3576 free_percpu(sbi->s_locality_groups); 3577 3578 return 0; 3579 } 3580 3581 static inline int ext4_issue_discard(struct super_block *sb, 3582 ext4_group_t block_group, ext4_grpblk_t cluster, int count, 3583 struct bio **biop) 3584 { 3585 ext4_fsblk_t discard_block; 3586 3587 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + 3588 ext4_group_first_block_no(sb, block_group)); 3589 count = EXT4_C2B(EXT4_SB(sb), count); 3590 trace_ext4_discard_blocks(sb, 3591 (unsigned long long) discard_block, count); 3592 if (biop) { 3593 return __blkdev_issue_discard(sb->s_bdev, 3594 (sector_t)discard_block << (sb->s_blocksize_bits - 9), 3595 (sector_t)count << (sb->s_blocksize_bits - 9), 3596 GFP_NOFS, biop); 3597 } else 3598 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 3599 } 3600 3601 static void ext4_free_data_in_buddy(struct super_block *sb, 3602 struct ext4_free_data *entry) 3603 { 3604 struct ext4_buddy e4b; 3605 struct ext4_group_info *db; 3606 int err, count = 0; 3607 3608 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", 3609 entry->efd_count, entry->efd_group, entry); 3610 3611 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 3612 /* we expect to find existing buddy because it's pinned */ 3613 BUG_ON(err != 0); 3614 3615 spin_lock(&EXT4_SB(sb)->s_md_lock); 3616 EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; 3617 spin_unlock(&EXT4_SB(sb)->s_md_lock); 3618 3619 db = e4b.bd_info; 3620 /* there are blocks to put in buddy to make them really free */ 3621 count += entry->efd_count; 3622 ext4_lock_group(sb, entry->efd_group); 3623 /* Take it out of per group rb tree */ 3624 rb_erase(&entry->efd_node, &(db->bb_free_root)); 3625 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); 3626 3627 /* 3628 * Clear the trimmed flag for the group so that the next 3629 * ext4_trim_fs can trim it. 3630 * If the volume is mounted with -o discard, online discard 3631 * is supported and the free blocks will be trimmed online. 
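	 * (in that case ext4_process_freed_data() below queues the freed
	 * ranges on sbi->s_discard_list and the s_discard_work worker
	 * issues the discards)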
3632 */ 3633 if (!test_opt(sb, DISCARD)) 3634 EXT4_MB_GRP_CLEAR_TRIMMED(db); 3635 3636 if (!db->bb_free_root.rb_node) { 3637 /* No more items in the per group rb tree 3638 * balance refcounts from ext4_mb_free_metadata() 3639 */ 3640 put_page(e4b.bd_buddy_page); 3641 put_page(e4b.bd_bitmap_page); 3642 } 3643 ext4_unlock_group(sb, entry->efd_group); 3644 ext4_mb_unload_buddy(&e4b); 3645 3646 mb_debug(sb, "freed %d blocks in 1 structures\n", count); 3647 } 3648 3649 /* 3650 * This function is called by the jbd2 layer once the commit has finished, 3651 * so we know we can free the blocks that were released with that commit. 3652 */ 3653 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) 3654 { 3655 struct ext4_sb_info *sbi = EXT4_SB(sb); 3656 struct ext4_free_data *entry, *tmp; 3657 struct list_head freed_data_list; 3658 struct list_head *cut_pos = NULL; 3659 bool wake; 3660 3661 INIT_LIST_HEAD(&freed_data_list); 3662 3663 spin_lock(&sbi->s_md_lock); 3664 list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { 3665 if (entry->efd_tid != commit_tid) 3666 break; 3667 cut_pos = &entry->efd_list; 3668 } 3669 if (cut_pos) 3670 list_cut_position(&freed_data_list, &sbi->s_freed_data_list, 3671 cut_pos); 3672 spin_unlock(&sbi->s_md_lock); 3673 3674 list_for_each_entry(entry, &freed_data_list, efd_list) 3675 ext4_free_data_in_buddy(sb, entry); 3676 3677 if (test_opt(sb, DISCARD)) { 3678 spin_lock(&sbi->s_md_lock); 3679 wake = list_empty(&sbi->s_discard_list); 3680 list_splice_tail(&freed_data_list, &sbi->s_discard_list); 3681 spin_unlock(&sbi->s_md_lock); 3682 if (wake) 3683 queue_work(system_unbound_wq, &sbi->s_discard_work); 3684 } else { 3685 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) 3686 kmem_cache_free(ext4_free_data_cachep, entry); 3687 } 3688 } 3689 3690 int __init ext4_init_mballoc(void) 3691 { 3692 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 3693 SLAB_RECLAIM_ACCOUNT); 3694 if (ext4_pspace_cachep == NULL) 3695 goto out; 3696 3697 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 3698 SLAB_RECLAIM_ACCOUNT); 3699 if (ext4_ac_cachep == NULL) 3700 goto out_pa_free; 3701 3702 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, 3703 SLAB_RECLAIM_ACCOUNT); 3704 if (ext4_free_data_cachep == NULL) 3705 goto out_ac_free; 3706 3707 return 0; 3708 3709 out_ac_free: 3710 kmem_cache_destroy(ext4_ac_cachep); 3711 out_pa_free: 3712 kmem_cache_destroy(ext4_pspace_cachep); 3713 out: 3714 return -ENOMEM; 3715 } 3716 3717 void ext4_exit_mballoc(void) 3718 { 3719 /* 3720 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 3721 * before destroying the slab cache. 
3722 */ 3723 rcu_barrier(); 3724 kmem_cache_destroy(ext4_pspace_cachep); 3725 kmem_cache_destroy(ext4_ac_cachep); 3726 kmem_cache_destroy(ext4_free_data_cachep); 3727 ext4_groupinfo_destroy_slabs(); 3728 } 3729 3730 3731 /* 3732 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 3733 * Returns 0 if success or error code 3734 */ 3735 static noinline_for_stack int 3736 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 3737 handle_t *handle, unsigned int reserv_clstrs) 3738 { 3739 struct buffer_head *bitmap_bh = NULL; 3740 struct ext4_group_desc *gdp; 3741 struct buffer_head *gdp_bh; 3742 struct ext4_sb_info *sbi; 3743 struct super_block *sb; 3744 ext4_fsblk_t block; 3745 int err, len; 3746 3747 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3748 BUG_ON(ac->ac_b_ex.fe_len <= 0); 3749 3750 sb = ac->ac_sb; 3751 sbi = EXT4_SB(sb); 3752 3753 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 3754 if (IS_ERR(bitmap_bh)) { 3755 return PTR_ERR(bitmap_bh); 3756 } 3757 3758 BUFFER_TRACE(bitmap_bh, "getting write access"); 3759 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 3760 EXT4_JTR_NONE); 3761 if (err) 3762 goto out_err; 3763 3764 err = -EIO; 3765 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 3766 if (!gdp) 3767 goto out_err; 3768 3769 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3770 ext4_free_group_clusters(sb, gdp)); 3771 3772 BUFFER_TRACE(gdp_bh, "get_write_access"); 3773 err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); 3774 if (err) 3775 goto out_err; 3776 3777 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3778 3779 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 3780 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { 3781 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 3782 "fs metadata", block, block+len); 3783 /* File system mounted not to panic on error 3784 * Fix the bitmap and return EFSCORRUPTED 3785 * We leak some of the blocks here. 3786 */ 3787 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3788 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3789 ac->ac_b_ex.fe_len); 3790 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3791 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3792 if (!err) 3793 err = -EFSCORRUPTED; 3794 goto out_err; 3795 } 3796 3797 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3798 #ifdef AGGRESSIVE_CHECK 3799 { 3800 int i; 3801 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 3802 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 3803 bitmap_bh->b_data)); 3804 } 3805 } 3806 #endif 3807 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3808 ac->ac_b_ex.fe_len); 3809 if (ext4_has_group_desc_csum(sb) && 3810 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3811 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3812 ext4_free_group_clusters_set(sb, gdp, 3813 ext4_free_clusters_after_init(sb, 3814 ac->ac_b_ex.fe_group, gdp)); 3815 } 3816 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 3817 ext4_free_group_clusters_set(sb, gdp, len); 3818 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3819 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 3820 3821 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3822 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 3823 /* 3824 * Now reduce the dirty block count also. 
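 * (Roughly: the reservation was charged to s_dirtyclusters_counter when
 * it was taken; now that the clusters are marked in the on-disk bitmap
 * and subtracted from s_freeclusters_counter above, that charge is
 * dropped.  Allocations flagged EXT4_MB_DELALLOC_RESERVED skip this
 * because the delalloc code manages that reservation itself.)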
Should not go negative 3825 */ 3826 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 3827 /* release all the reserved blocks if non delalloc */ 3828 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 3829 reserv_clstrs); 3830 3831 if (sbi->s_log_groups_per_flex) { 3832 ext4_group_t flex_group = ext4_flex_group(sbi, 3833 ac->ac_b_ex.fe_group); 3834 atomic64_sub(ac->ac_b_ex.fe_len, 3835 &sbi_array_rcu_deref(sbi, s_flex_groups, 3836 flex_group)->free_clusters); 3837 } 3838 3839 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3840 if (err) 3841 goto out_err; 3842 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 3843 3844 out_err: 3845 brelse(bitmap_bh); 3846 return err; 3847 } 3848 3849 /* 3850 * Idempotent helper for Ext4 fast commit replay path to set the state of 3851 * blocks in bitmaps and update counters. 3852 */ 3853 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, 3854 int len, int state) 3855 { 3856 struct buffer_head *bitmap_bh = NULL; 3857 struct ext4_group_desc *gdp; 3858 struct buffer_head *gdp_bh; 3859 struct ext4_sb_info *sbi = EXT4_SB(sb); 3860 ext4_group_t group; 3861 ext4_grpblk_t blkoff; 3862 int i, err; 3863 int already; 3864 unsigned int clen, clen_changed, thisgrp_len; 3865 3866 while (len > 0) { 3867 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 3868 3869 /* 3870 * Check to see if we are freeing blocks across a group 3871 * boundary. 3872 * In case of flex_bg, this can happen that (block, len) may 3873 * span across more than one group. In that case we need to 3874 * get the corresponding group metadata to work with. 3875 * For this we have goto again loop. 3876 */ 3877 thisgrp_len = min_t(unsigned int, (unsigned int)len, 3878 EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); 3879 clen = EXT4_NUM_B2C(sbi, thisgrp_len); 3880 3881 if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { 3882 ext4_error(sb, "Marking blocks in system zone - " 3883 "Block = %llu, len = %u", 3884 block, thisgrp_len); 3885 bitmap_bh = NULL; 3886 break; 3887 } 3888 3889 bitmap_bh = ext4_read_block_bitmap(sb, group); 3890 if (IS_ERR(bitmap_bh)) { 3891 err = PTR_ERR(bitmap_bh); 3892 bitmap_bh = NULL; 3893 break; 3894 } 3895 3896 err = -EIO; 3897 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 3898 if (!gdp) 3899 break; 3900 3901 ext4_lock_group(sb, group); 3902 already = 0; 3903 for (i = 0; i < clen; i++) 3904 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == 3905 !state) 3906 already++; 3907 3908 clen_changed = clen - already; 3909 if (state) 3910 mb_set_bits(bitmap_bh->b_data, blkoff, clen); 3911 else 3912 mb_clear_bits(bitmap_bh->b_data, blkoff, clen); 3913 if (ext4_has_group_desc_csum(sb) && 3914 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3915 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3916 ext4_free_group_clusters_set(sb, gdp, 3917 ext4_free_clusters_after_init(sb, group, gdp)); 3918 } 3919 if (state) 3920 clen = ext4_free_group_clusters(sb, gdp) - clen_changed; 3921 else 3922 clen = ext4_free_group_clusters(sb, gdp) + clen_changed; 3923 3924 ext4_free_group_clusters_set(sb, gdp, clen); 3925 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 3926 ext4_group_desc_csum_set(sb, group, gdp); 3927 3928 ext4_unlock_group(sb, group); 3929 3930 if (sbi->s_log_groups_per_flex) { 3931 ext4_group_t flex_group = ext4_flex_group(sbi, group); 3932 struct flex_groups *fg = sbi_array_rcu_deref(sbi, 3933 s_flex_groups, flex_group); 3934 3935 if (state) 3936 atomic64_sub(clen_changed, &fg->free_clusters); 3937 else 3938 
atomic64_add(clen_changed, &fg->free_clusters); 3939 3940 } 3941 3942 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 3943 if (err) 3944 break; 3945 sync_dirty_buffer(bitmap_bh); 3946 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 3947 sync_dirty_buffer(gdp_bh); 3948 if (err) 3949 break; 3950 3951 block += thisgrp_len; 3952 len -= thisgrp_len; 3953 brelse(bitmap_bh); 3954 BUG_ON(len < 0); 3955 } 3956 3957 if (err) 3958 brelse(bitmap_bh); 3959 } 3960 3961 /* 3962 * here we normalize request for locality group 3963 * Group request are normalized to s_mb_group_prealloc, which goes to 3964 * s_strip if we set the same via mount option. 3965 * s_mb_group_prealloc can be configured via 3966 * /sys/fs/ext4/<partition>/mb_group_prealloc 3967 * 3968 * XXX: should we try to preallocate more than the group has now? 3969 */ 3970 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 3971 { 3972 struct super_block *sb = ac->ac_sb; 3973 struct ext4_locality_group *lg = ac->ac_lg; 3974 3975 BUG_ON(lg == NULL); 3976 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3977 mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); 3978 } 3979 3980 /* 3981 * This function returns the next element to look at during inode 3982 * PA rbtree walk. We assume that we have held the inode PA rbtree lock 3983 * (ei->i_prealloc_lock) 3984 * 3985 * new_start The start of the range we want to compare 3986 * cur_start The existing start that we are comparing against 3987 * node The node of the rb_tree 3988 */ 3989 static inline struct rb_node* 3990 ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) 3991 { 3992 if (new_start < cur_start) 3993 return node->rb_left; 3994 else 3995 return node->rb_right; 3996 } 3997 3998 static inline void 3999 ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, 4000 ext4_lblk_t start, ext4_lblk_t end) 4001 { 4002 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4003 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4004 struct ext4_prealloc_space *tmp_pa; 4005 ext4_lblk_t tmp_pa_start, tmp_pa_end; 4006 struct rb_node *iter; 4007 4008 read_lock(&ei->i_prealloc_lock); 4009 for (iter = ei->i_prealloc_node.rb_node; iter; 4010 iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { 4011 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4012 pa_node.inode_node); 4013 tmp_pa_start = tmp_pa->pa_lstart; 4014 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4015 4016 spin_lock(&tmp_pa->pa_lock); 4017 if (tmp_pa->pa_deleted == 0) 4018 BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); 4019 spin_unlock(&tmp_pa->pa_lock); 4020 } 4021 read_unlock(&ei->i_prealloc_lock); 4022 } 4023 4024 /* 4025 * Given an allocation context "ac" and a range "start", "end", check 4026 * and adjust boundaries if the range overlaps with any of the existing 4027 * preallocatoins stored in the corresponding inode of the allocation context. 
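 *
 * Illustrative example (made-up numbers, in logical blocks): if the
 * normalized range is [100, 200), the nearest PA on the left ends at
 * 120 and the nearest PA on the right starts at 180, the range is
 * trimmed to [120, 180).  The original request never overlaps a PA,
 * so it is still contained in the trimmed range.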
4028 * 4029 * Parameters: 4030 * ac allocation context 4031 * start start of the new range 4032 * end end of the new range 4033 */ 4034 static inline void 4035 ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, 4036 ext4_lblk_t *start, ext4_lblk_t *end) 4037 { 4038 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4039 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4040 struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; 4041 struct rb_node *iter; 4042 ext4_lblk_t new_start, new_end; 4043 ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; 4044 4045 new_start = *start; 4046 new_end = *end; 4047 4048 /* 4049 * Adjust the normalized range so that it doesn't overlap with any 4050 * existing preallocated blocks(PAs). Make sure to hold the rbtree lock 4051 * so it doesn't change underneath us. 4052 */ 4053 read_lock(&ei->i_prealloc_lock); 4054 4055 /* Step 1: find any one immediate neighboring PA of the normalized range */ 4056 for (iter = ei->i_prealloc_node.rb_node; iter; 4057 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4058 tmp_pa_start, iter)) { 4059 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4060 pa_node.inode_node); 4061 tmp_pa_start = tmp_pa->pa_lstart; 4062 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4063 4064 /* PA must not overlap original request */ 4065 spin_lock(&tmp_pa->pa_lock); 4066 if (tmp_pa->pa_deleted == 0) 4067 BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || 4068 ac->ac_o_ex.fe_logical < tmp_pa_start)); 4069 spin_unlock(&tmp_pa->pa_lock); 4070 } 4071 4072 /* 4073 * Step 2: check if the found PA is left or right neighbor and 4074 * get the other neighbor 4075 */ 4076 if (tmp_pa) { 4077 if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { 4078 struct rb_node *tmp; 4079 4080 left_pa = tmp_pa; 4081 tmp = rb_next(&left_pa->pa_node.inode_node); 4082 if (tmp) { 4083 right_pa = rb_entry(tmp, 4084 struct ext4_prealloc_space, 4085 pa_node.inode_node); 4086 } 4087 } else { 4088 struct rb_node *tmp; 4089 4090 right_pa = tmp_pa; 4091 tmp = rb_prev(&right_pa->pa_node.inode_node); 4092 if (tmp) { 4093 left_pa = rb_entry(tmp, 4094 struct ext4_prealloc_space, 4095 pa_node.inode_node); 4096 } 4097 } 4098 } 4099 4100 /* Step 3: get the non deleted neighbors */ 4101 if (left_pa) { 4102 for (iter = &left_pa->pa_node.inode_node;; 4103 iter = rb_prev(iter)) { 4104 if (!iter) { 4105 left_pa = NULL; 4106 break; 4107 } 4108 4109 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4110 pa_node.inode_node); 4111 left_pa = tmp_pa; 4112 spin_lock(&tmp_pa->pa_lock); 4113 if (tmp_pa->pa_deleted == 0) { 4114 spin_unlock(&tmp_pa->pa_lock); 4115 break; 4116 } 4117 spin_unlock(&tmp_pa->pa_lock); 4118 } 4119 } 4120 4121 if (right_pa) { 4122 for (iter = &right_pa->pa_node.inode_node;; 4123 iter = rb_next(iter)) { 4124 if (!iter) { 4125 right_pa = NULL; 4126 break; 4127 } 4128 4129 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4130 pa_node.inode_node); 4131 right_pa = tmp_pa; 4132 spin_lock(&tmp_pa->pa_lock); 4133 if (tmp_pa->pa_deleted == 0) { 4134 spin_unlock(&tmp_pa->pa_lock); 4135 break; 4136 } 4137 spin_unlock(&tmp_pa->pa_lock); 4138 } 4139 } 4140 4141 if (left_pa) { 4142 left_pa_end = 4143 left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); 4144 BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); 4145 } 4146 4147 if (right_pa) { 4148 right_pa_start = right_pa->pa_lstart; 4149 BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); 4150 } 4151 4152 /* Step 4: trim our normalized range to not overlap with the 
neighbors */ 4153 if (left_pa) { 4154 if (left_pa_end > new_start) 4155 new_start = left_pa_end; 4156 } 4157 4158 if (right_pa) { 4159 if (right_pa_start < new_end) 4160 new_end = right_pa_start; 4161 } 4162 read_unlock(&ei->i_prealloc_lock); 4163 4164 /* XXX: extra loop to check we really don't overlap preallocations */ 4165 ext4_mb_pa_assert_overlap(ac, new_start, new_end); 4166 4167 *start = new_start; 4168 *end = new_end; 4169 } 4170 4171 /* 4172 * Normalization means making request better in terms of 4173 * size and alignment 4174 */ 4175 static noinline_for_stack void 4176 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 4177 struct ext4_allocation_request *ar) 4178 { 4179 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4180 struct ext4_super_block *es = sbi->s_es; 4181 int bsbits, max; 4182 ext4_lblk_t end; 4183 loff_t size, start_off; 4184 loff_t orig_size __maybe_unused; 4185 ext4_lblk_t start; 4186 4187 /* do normalize only data requests, metadata requests 4188 do not need preallocation */ 4189 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4190 return; 4191 4192 /* sometime caller may want exact blocks */ 4193 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4194 return; 4195 4196 /* caller may indicate that preallocation isn't 4197 * required (it's a tail, for example) */ 4198 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 4199 return; 4200 4201 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 4202 ext4_mb_normalize_group_request(ac); 4203 return ; 4204 } 4205 4206 bsbits = ac->ac_sb->s_blocksize_bits; 4207 4208 /* first, let's learn actual file size 4209 * given current request is allocated */ 4210 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 4211 size = size << bsbits; 4212 if (size < i_size_read(ac->ac_inode)) 4213 size = i_size_read(ac->ac_inode); 4214 orig_size = size; 4215 4216 /* max size of free chunks */ 4217 max = 2 << bsbits; 4218 4219 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 4220 (req <= (size) || max <= (chunk_size)) 4221 4222 /* first, try to predict filesize */ 4223 /* XXX: should this table be tunable? */ 4224 start_off = 0; 4225 if (size <= 16 * 1024) { 4226 size = 16 * 1024; 4227 } else if (size <= 32 * 1024) { 4228 size = 32 * 1024; 4229 } else if (size <= 64 * 1024) { 4230 size = 64 * 1024; 4231 } else if (size <= 128 * 1024) { 4232 size = 128 * 1024; 4233 } else if (size <= 256 * 1024) { 4234 size = 256 * 1024; 4235 } else if (size <= 512 * 1024) { 4236 size = 512 * 1024; 4237 } else if (size <= 1024 * 1024) { 4238 size = 1024 * 1024; 4239 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 4240 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4241 (21 - bsbits)) << 21; 4242 size = 2 * 1024 * 1024; 4243 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 4244 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4245 (22 - bsbits)) << 22; 4246 size = 4 * 1024 * 1024; 4247 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 4248 (8<<20)>>bsbits, max, 8 * 1024)) { 4249 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4250 (23 - bsbits)) << 23; 4251 size = 8 * 1024 * 1024; 4252 } else { 4253 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; 4254 size = (loff_t) EXT4_C2B(sbi, 4255 ac->ac_o_ex.fe_len) << bsbits; 4256 } 4257 size = size >> bsbits; 4258 start = start_off >> bsbits; 4259 4260 /* 4261 * For tiny groups (smaller than 8MB) the chosen allocation 4262 * alignment may be larger than group size. 
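 * (e.g. start_off was just rounded down to a 2, 4 or 8 MiB boundary
 * above, which can put "start" more than one group's worth of blocks
 * below ac->ac_o_ex.fe_logical when groups are that small.)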
Make sure the 4263 * alignment does not move allocation to a different group which 4264 * makes mballoc fail assertions later. 4265 */ 4266 start = max(start, rounddown(ac->ac_o_ex.fe_logical, 4267 (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); 4268 4269 /* don't cover already allocated blocks in selected range */ 4270 if (ar->pleft && start <= ar->lleft) { 4271 size -= ar->lleft + 1 - start; 4272 start = ar->lleft + 1; 4273 } 4274 if (ar->pright && start + size - 1 >= ar->lright) 4275 size -= start + size - ar->lright; 4276 4277 /* 4278 * Trim allocation request for filesystems with artificially small 4279 * groups. 4280 */ 4281 if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) 4282 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); 4283 4284 end = start + size; 4285 4286 ext4_mb_pa_adjust_overlap(ac, &start, &end); 4287 4288 size = end - start; 4289 4290 /* 4291 * In this function "start" and "size" are normalized for better 4292 * alignment and length such that we could preallocate more blocks. 4293 * This normalization is done such that original request of 4294 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and 4295 * "size" boundaries. 4296 * (Note fe_len can be relaxed since FS block allocation API does not 4297 * provide gurantee on number of contiguous blocks allocation since that 4298 * depends upon free space left, etc). 4299 * In case of inode pa, later we use the allocated blocks 4300 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated 4301 * range of goal/best blocks [start, size] to put it at the 4302 * ac_o_ex.fe_logical extent of this inode. 4303 * (See ext4_mb_use_inode_pa() for more details) 4304 */ 4305 if (start + size <= ac->ac_o_ex.fe_logical || 4306 start > ac->ac_o_ex.fe_logical) { 4307 ext4_msg(ac->ac_sb, KERN_ERR, 4308 "start %lu, size %lu, fe_logical %lu", 4309 (unsigned long) start, (unsigned long) size, 4310 (unsigned long) ac->ac_o_ex.fe_logical); 4311 BUG(); 4312 } 4313 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 4314 4315 /* now prepare goal request */ 4316 4317 /* XXX: is it better to align blocks WRT to logical 4318 * placement or satisfy big request as is */ 4319 ac->ac_g_ex.fe_logical = start; 4320 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 4321 4322 /* define goal start in order to merge */ 4323 if (ar->pright && (ar->lright == (start + size)) && 4324 ar->pright >= size && 4325 ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { 4326 /* merge to the right */ 4327 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 4328 &ac->ac_g_ex.fe_group, 4329 &ac->ac_g_ex.fe_start); 4330 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4331 } 4332 if (ar->pleft && (ar->lleft + 1 == start) && 4333 ar->pleft + 1 < ext4_blocks_count(es)) { 4334 /* merge to the left */ 4335 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 4336 &ac->ac_g_ex.fe_group, 4337 &ac->ac_g_ex.fe_start); 4338 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4339 } 4340 4341 mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, 4342 orig_size, start); 4343 } 4344 4345 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 4346 { 4347 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4348 4349 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { 4350 atomic_inc(&sbi->s_bal_reqs); 4351 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 4352 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 4353 atomic_inc(&sbi->s_bal_success); 4354 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 4355 atomic_add(ac->ac_groups_scanned, 
&sbi->s_bal_groups_scanned); 4356 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4357 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4358 atomic_inc(&sbi->s_bal_goals); 4359 if (ac->ac_found > sbi->s_mb_max_to_scan) 4360 atomic_inc(&sbi->s_bal_breaks); 4361 } 4362 4363 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 4364 trace_ext4_mballoc_alloc(ac); 4365 else 4366 trace_ext4_mballoc_prealloc(ac); 4367 } 4368 4369 /* 4370 * Called on failure; free up any blocks from the inode PA for this 4371 * context. We don't need this for MB_GROUP_PA because we only change 4372 * pa_free in ext4_mb_release_context(), but on failure, we've already 4373 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 4374 */ 4375 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 4376 { 4377 struct ext4_prealloc_space *pa = ac->ac_pa; 4378 struct ext4_buddy e4b; 4379 int err; 4380 4381 if (pa == NULL) { 4382 if (ac->ac_f_ex.fe_len == 0) 4383 return; 4384 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 4385 if (WARN_RATELIMIT(err, 4386 "ext4: mb_load_buddy failed (%d)", err)) 4387 /* 4388 * This should never happen since we pin the 4389 * pages in the ext4_allocation_context so 4390 * ext4_mb_load_buddy() should never fail. 4391 */ 4392 return; 4393 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4394 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 4395 ac->ac_f_ex.fe_len); 4396 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4397 ext4_mb_unload_buddy(&e4b); 4398 return; 4399 } 4400 if (pa->pa_type == MB_INODE_PA) { 4401 spin_lock(&pa->pa_lock); 4402 pa->pa_free += ac->ac_b_ex.fe_len; 4403 spin_unlock(&pa->pa_lock); 4404 } 4405 } 4406 4407 /* 4408 * use blocks preallocated to inode 4409 */ 4410 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 4411 struct ext4_prealloc_space *pa) 4412 { 4413 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4414 ext4_fsblk_t start; 4415 ext4_fsblk_t end; 4416 int len; 4417 4418 /* found preallocated blocks, use them */ 4419 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 4420 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), 4421 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); 4422 len = EXT4_NUM_B2C(sbi, end - start); 4423 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 4424 &ac->ac_b_ex.fe_start); 4425 ac->ac_b_ex.fe_len = len; 4426 ac->ac_status = AC_STATUS_FOUND; 4427 ac->ac_pa = pa; 4428 4429 BUG_ON(start < pa->pa_pstart); 4430 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 4431 BUG_ON(pa->pa_free < len); 4432 BUG_ON(ac->ac_b_ex.fe_len <= 0); 4433 pa->pa_free -= len; 4434 4435 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); 4436 } 4437 4438 /* 4439 * use blocks preallocated to locality group 4440 */ 4441 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 4442 struct ext4_prealloc_space *pa) 4443 { 4444 unsigned int len = ac->ac_o_ex.fe_len; 4445 4446 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 4447 &ac->ac_b_ex.fe_group, 4448 &ac->ac_b_ex.fe_start); 4449 ac->ac_b_ex.fe_len = len; 4450 ac->ac_status = AC_STATUS_FOUND; 4451 ac->ac_pa = pa; 4452 4453 /* we don't correct pa_pstart or pa_len here to avoid 4454 * possible race when the group is being loaded concurrently 4455 * instead we correct pa later, after blocks are marked 4456 * in on-disk bitmap -- see ext4_mb_release_context() 4457 * Other CPUs are prevented from allocating from this pa by lg_mutex 4458 */ 4459 mb_debug(ac->ac_sb, "use 
%u/%u from group pa %p\n", 4460 pa->pa_lstart, len, pa); 4461 } 4462 4463 /* 4464 * Return the prealloc space that have minimal distance 4465 * from the goal block. @cpa is the prealloc 4466 * space that is having currently known minimal distance 4467 * from the goal block. 4468 */ 4469 static struct ext4_prealloc_space * 4470 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 4471 struct ext4_prealloc_space *pa, 4472 struct ext4_prealloc_space *cpa) 4473 { 4474 ext4_fsblk_t cur_distance, new_distance; 4475 4476 if (cpa == NULL) { 4477 atomic_inc(&pa->pa_count); 4478 return pa; 4479 } 4480 cur_distance = abs(goal_block - cpa->pa_pstart); 4481 new_distance = abs(goal_block - pa->pa_pstart); 4482 4483 if (cur_distance <= new_distance) 4484 return cpa; 4485 4486 /* drop the previous reference */ 4487 atomic_dec(&cpa->pa_count); 4488 atomic_inc(&pa->pa_count); 4489 return pa; 4490 } 4491 4492 /* 4493 * search goal blocks in preallocated space 4494 */ 4495 static noinline_for_stack bool 4496 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 4497 { 4498 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4499 int order, i; 4500 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4501 struct ext4_locality_group *lg; 4502 struct ext4_prealloc_space *tmp_pa, *cpa = NULL; 4503 ext4_lblk_t tmp_pa_start, tmp_pa_end; 4504 struct rb_node *iter; 4505 ext4_fsblk_t goal_block; 4506 4507 /* only data can be preallocated */ 4508 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4509 return false; 4510 4511 /* first, try per-file preallocation */ 4512 read_lock(&ei->i_prealloc_lock); 4513 for (iter = ei->i_prealloc_node.rb_node; iter; 4514 iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, 4515 tmp_pa_start, iter)) { 4516 tmp_pa = rb_entry(iter, struct ext4_prealloc_space, 4517 pa_node.inode_node); 4518 4519 /* all fields in this condition don't change, 4520 * so we can skip locking for them */ 4521 tmp_pa_start = tmp_pa->pa_lstart; 4522 tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); 4523 4524 /* original request start doesn't lie in this PA */ 4525 if (ac->ac_o_ex.fe_logical < tmp_pa_start || 4526 ac->ac_o_ex.fe_logical >= tmp_pa_end) 4527 continue; 4528 4529 /* non-extent files can't have physical blocks past 2^32 */ 4530 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 4531 (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > 4532 EXT4_MAX_BLOCK_FILE_PHYS)) { 4533 /* 4534 * Since PAs don't overlap, we won't find any 4535 * other PA to satisfy this. 4536 */ 4537 break; 4538 } 4539 4540 /* found preallocated blocks, use them */ 4541 spin_lock(&tmp_pa->pa_lock); 4542 if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { 4543 atomic_inc(&tmp_pa->pa_count); 4544 ext4_mb_use_inode_pa(ac, tmp_pa); 4545 spin_unlock(&tmp_pa->pa_lock); 4546 ac->ac_criteria = 10; 4547 read_unlock(&ei->i_prealloc_lock); 4548 return true; 4549 } 4550 spin_unlock(&tmp_pa->pa_lock); 4551 } 4552 read_unlock(&ei->i_prealloc_lock); 4553 4554 /* can we use group allocation? */ 4555 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 4556 return false; 4557 4558 /* inode may have no locality group for some reason */ 4559 lg = ac->ac_lg; 4560 if (lg == NULL) 4561 return false; 4562 order = fls(ac->ac_o_ex.fe_len) - 1; 4563 if (order > PREALLOC_TB_SIZE - 1) 4564 /* The max size of hash table is PREALLOC_TB_SIZE */ 4565 order = PREALLOC_TB_SIZE - 1; 4566 4567 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 4568 /* 4569 * search for the prealloc space that is having 4570 * minimal distance from the goal block. 
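 *
 * Only buckets at index "order" and above can help: the lists are
 * indexed by fls(pa_free) - 1 (see ext4_mb_add_n_trim()), so e.g. a
 * request for 25 clusters (order = fls(25) - 1 = 4) can never be
 * satisfied from buckets 0-3, which only hold PAs with pa_free <= 15.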
4571 */ 4572 for (i = order; i < PREALLOC_TB_SIZE; i++) { 4573 rcu_read_lock(); 4574 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], 4575 pa_node.lg_list) { 4576 spin_lock(&tmp_pa->pa_lock); 4577 if (tmp_pa->pa_deleted == 0 && 4578 tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { 4579 4580 cpa = ext4_mb_check_group_pa(goal_block, 4581 tmp_pa, cpa); 4582 } 4583 spin_unlock(&tmp_pa->pa_lock); 4584 } 4585 rcu_read_unlock(); 4586 } 4587 if (cpa) { 4588 ext4_mb_use_group_pa(ac, cpa); 4589 ac->ac_criteria = 20; 4590 return true; 4591 } 4592 return false; 4593 } 4594 4595 /* 4596 * the function goes through all block freed in the group 4597 * but not yet committed and marks them used in in-core bitmap. 4598 * buddy must be generated from this bitmap 4599 * Need to be called with the ext4 group lock held 4600 */ 4601 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 4602 ext4_group_t group) 4603 { 4604 struct rb_node *n; 4605 struct ext4_group_info *grp; 4606 struct ext4_free_data *entry; 4607 4608 grp = ext4_get_group_info(sb, group); 4609 n = rb_first(&(grp->bb_free_root)); 4610 4611 while (n) { 4612 entry = rb_entry(n, struct ext4_free_data, efd_node); 4613 mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); 4614 n = rb_next(n); 4615 } 4616 return; 4617 } 4618 4619 /* 4620 * the function goes through all preallocation in this group and marks them 4621 * used in in-core bitmap. buddy must be generated from this bitmap 4622 * Need to be called with ext4 group lock held 4623 */ 4624 static noinline_for_stack 4625 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 4626 ext4_group_t group) 4627 { 4628 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 4629 struct ext4_prealloc_space *pa; 4630 struct list_head *cur; 4631 ext4_group_t groupnr; 4632 ext4_grpblk_t start; 4633 int preallocated = 0; 4634 int len; 4635 4636 /* all form of preallocation discards first load group, 4637 * so the only competing code is preallocation use. 
4638 * we don't need any locking here 4639 * notice we do NOT ignore preallocations with pa_deleted 4640 * otherwise we could leave used blocks available for 4641 * allocation in buddy when concurrent ext4_mb_put_pa() 4642 * is dropping preallocation 4643 */ 4644 list_for_each(cur, &grp->bb_prealloc_list) { 4645 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 4646 spin_lock(&pa->pa_lock); 4647 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4648 &groupnr, &start); 4649 len = pa->pa_len; 4650 spin_unlock(&pa->pa_lock); 4651 if (unlikely(len == 0)) 4652 continue; 4653 BUG_ON(groupnr != group); 4654 mb_set_bits(bitmap, start, len); 4655 preallocated += len; 4656 } 4657 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); 4658 } 4659 4660 static void ext4_mb_mark_pa_deleted(struct super_block *sb, 4661 struct ext4_prealloc_space *pa) 4662 { 4663 struct ext4_inode_info *ei; 4664 4665 if (pa->pa_deleted) { 4666 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", 4667 pa->pa_type, pa->pa_pstart, pa->pa_lstart, 4668 pa->pa_len); 4669 return; 4670 } 4671 4672 pa->pa_deleted = 1; 4673 4674 if (pa->pa_type == MB_INODE_PA) { 4675 ei = EXT4_I(pa->pa_inode); 4676 atomic_dec(&ei->i_prealloc_active); 4677 } 4678 } 4679 4680 static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) 4681 { 4682 BUG_ON(!pa); 4683 BUG_ON(atomic_read(&pa->pa_count)); 4684 BUG_ON(pa->pa_deleted == 0); 4685 kmem_cache_free(ext4_pspace_cachep, pa); 4686 } 4687 4688 static void ext4_mb_pa_callback(struct rcu_head *head) 4689 { 4690 struct ext4_prealloc_space *pa; 4691 4692 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 4693 ext4_mb_pa_free(pa); 4694 } 4695 4696 /* 4697 * drops a reference to preallocated space descriptor 4698 * if this was the last reference and the space is consumed 4699 */ 4700 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 4701 struct super_block *sb, struct ext4_prealloc_space *pa) 4702 { 4703 ext4_group_t grp; 4704 ext4_fsblk_t grp_blk; 4705 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4706 4707 /* in this short window concurrent discard can set pa_deleted */ 4708 spin_lock(&pa->pa_lock); 4709 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { 4710 spin_unlock(&pa->pa_lock); 4711 return; 4712 } 4713 4714 if (pa->pa_deleted == 1) { 4715 spin_unlock(&pa->pa_lock); 4716 return; 4717 } 4718 4719 ext4_mb_mark_pa_deleted(sb, pa); 4720 spin_unlock(&pa->pa_lock); 4721 4722 grp_blk = pa->pa_pstart; 4723 /* 4724 * If doing group-based preallocation, pa_pstart may be in the 4725 * next group when pa is used up 4726 */ 4727 if (pa->pa_type == MB_GROUP_PA) 4728 grp_blk--; 4729 4730 grp = ext4_get_group_number(sb, grp_blk); 4731 4732 /* 4733 * possible race: 4734 * 4735 * P1 (buddy init) P2 (regular allocation) 4736 * find block B in PA 4737 * copy on-disk bitmap to buddy 4738 * mark B in on-disk bitmap 4739 * drop PA from group 4740 * mark all PAs in buddy 4741 * 4742 * thus, P1 initializes buddy with B available. 
to prevent this 4743 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 4744 * against that pair 4745 */ 4746 ext4_lock_group(sb, grp); 4747 list_del(&pa->pa_group_list); 4748 ext4_unlock_group(sb, grp); 4749 4750 if (pa->pa_type == MB_INODE_PA) { 4751 write_lock(pa->pa_node_lock.inode_lock); 4752 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 4753 write_unlock(pa->pa_node_lock.inode_lock); 4754 ext4_mb_pa_free(pa); 4755 } else { 4756 spin_lock(pa->pa_node_lock.lg_lock); 4757 list_del_rcu(&pa->pa_node.lg_list); 4758 spin_unlock(pa->pa_node_lock.lg_lock); 4759 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4760 } 4761 } 4762 4763 static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) 4764 { 4765 struct rb_node **iter = &root->rb_node, *parent = NULL; 4766 struct ext4_prealloc_space *iter_pa, *new_pa; 4767 ext4_lblk_t iter_start, new_start; 4768 4769 while (*iter) { 4770 iter_pa = rb_entry(*iter, struct ext4_prealloc_space, 4771 pa_node.inode_node); 4772 new_pa = rb_entry(new, struct ext4_prealloc_space, 4773 pa_node.inode_node); 4774 iter_start = iter_pa->pa_lstart; 4775 new_start = new_pa->pa_lstart; 4776 4777 parent = *iter; 4778 if (new_start < iter_start) 4779 iter = &((*iter)->rb_left); 4780 else 4781 iter = &((*iter)->rb_right); 4782 } 4783 4784 rb_link_node(new, parent, iter); 4785 rb_insert_color(new, root); 4786 } 4787 4788 /* 4789 * creates new preallocated space for given inode 4790 */ 4791 static noinline_for_stack void 4792 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 4793 { 4794 struct super_block *sb = ac->ac_sb; 4795 struct ext4_sb_info *sbi = EXT4_SB(sb); 4796 struct ext4_prealloc_space *pa; 4797 struct ext4_group_info *grp; 4798 struct ext4_inode_info *ei; 4799 4800 /* preallocate only when found space is larger then requested */ 4801 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 4802 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4803 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 4804 BUG_ON(ac->ac_pa == NULL); 4805 4806 pa = ac->ac_pa; 4807 4808 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 4809 int new_bex_start; 4810 int new_bex_end; 4811 4812 /* we can't allocate as much as normalizer wants. 4813 * so, found space must get proper lstart 4814 * to cover original request */ 4815 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 4816 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 4817 4818 /* 4819 * Use the below logic for adjusting best extent as it keeps 4820 * fragmentation in check while ensuring logical range of best 4821 * extent doesn't overflow out of goal extent: 4822 * 4823 * 1. Check if best ex can be kept at end of goal and still 4824 * cover original start 4825 * 2. Else, check if best ex can be kept at start of goal and 4826 * still cover original start 4827 * 3. Else, keep the best ex at start of original request. 
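 *
 * Illustrative example (made-up numbers, in blocks): goal extent
 * [0, 64), best extent length 16, original request at logical 40.
 * Keeping the best extent at the end of the goal gives [48, 64), which
 * misses 40; keeping it at the start gives [0, 16), which also misses
 * 40; so case 3 applies and the best extent becomes [40, 56).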
4828 */ 4829 new_bex_end = ac->ac_g_ex.fe_logical + 4830 EXT4_C2B(sbi, ac->ac_g_ex.fe_len); 4831 new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4832 if (ac->ac_o_ex.fe_logical >= new_bex_start) 4833 goto adjust_bex; 4834 4835 new_bex_start = ac->ac_g_ex.fe_logical; 4836 new_bex_end = 4837 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4838 if (ac->ac_o_ex.fe_logical < new_bex_end) 4839 goto adjust_bex; 4840 4841 new_bex_start = ac->ac_o_ex.fe_logical; 4842 new_bex_end = 4843 new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4844 4845 adjust_bex: 4846 ac->ac_b_ex.fe_logical = new_bex_start; 4847 4848 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 4849 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 4850 BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + 4851 EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); 4852 } 4853 4854 pa->pa_lstart = ac->ac_b_ex.fe_logical; 4855 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4856 pa->pa_len = ac->ac_b_ex.fe_len; 4857 pa->pa_free = pa->pa_len; 4858 spin_lock_init(&pa->pa_lock); 4859 INIT_LIST_HEAD(&pa->pa_group_list); 4860 pa->pa_deleted = 0; 4861 pa->pa_type = MB_INODE_PA; 4862 4863 mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4864 pa->pa_len, pa->pa_lstart); 4865 trace_ext4_mb_new_inode_pa(ac, pa); 4866 4867 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 4868 ext4_mb_use_inode_pa(ac, pa); 4869 4870 ei = EXT4_I(ac->ac_inode); 4871 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4872 4873 pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; 4874 pa->pa_inode = ac->ac_inode; 4875 4876 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4877 4878 write_lock(pa->pa_node_lock.inode_lock); 4879 ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); 4880 write_unlock(pa->pa_node_lock.inode_lock); 4881 atomic_inc(&ei->i_prealloc_active); 4882 } 4883 4884 /* 4885 * creates new preallocated space for locality group inodes belongs to 4886 */ 4887 static noinline_for_stack void 4888 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 4889 { 4890 struct super_block *sb = ac->ac_sb; 4891 struct ext4_locality_group *lg; 4892 struct ext4_prealloc_space *pa; 4893 struct ext4_group_info *grp; 4894 4895 /* preallocate only when found space is larger then requested */ 4896 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 4897 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4898 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 4899 BUG_ON(ac->ac_pa == NULL); 4900 4901 pa = ac->ac_pa; 4902 4903 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4904 pa->pa_lstart = pa->pa_pstart; 4905 pa->pa_len = ac->ac_b_ex.fe_len; 4906 pa->pa_free = pa->pa_len; 4907 spin_lock_init(&pa->pa_lock); 4908 INIT_LIST_HEAD(&pa->pa_node.lg_list); 4909 INIT_LIST_HEAD(&pa->pa_group_list); 4910 pa->pa_deleted = 0; 4911 pa->pa_type = MB_GROUP_PA; 4912 4913 mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4914 pa->pa_len, pa->pa_lstart); 4915 trace_ext4_mb_new_group_pa(ac, pa); 4916 4917 ext4_mb_use_group_pa(ac, pa); 4918 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 4919 4920 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4921 lg = ac->ac_lg; 4922 BUG_ON(lg == NULL); 4923 4924 pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; 4925 pa->pa_inode = NULL; 4926 4927 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4928 4929 /* 4930 * We will later add the new pa to the right bucket 4931 * after updating the pa_free in ext4_mb_release_context 4932 */ 4933 } 4934 4935 static void 
ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 4936 { 4937 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4938 ext4_mb_new_group_pa(ac); 4939 else 4940 ext4_mb_new_inode_pa(ac); 4941 } 4942 4943 /* 4944 * finds all unused blocks in on-disk bitmap, frees them in 4945 * in-core bitmap and buddy. 4946 * @pa must be unlinked from inode and group lists, so that 4947 * nobody else can find/use it. 4948 * the caller MUST hold group/inode locks. 4949 * TODO: optimize the case when there are no in-core structures yet 4950 */ 4951 static noinline_for_stack int 4952 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 4953 struct ext4_prealloc_space *pa) 4954 { 4955 struct super_block *sb = e4b->bd_sb; 4956 struct ext4_sb_info *sbi = EXT4_SB(sb); 4957 unsigned int end; 4958 unsigned int next; 4959 ext4_group_t group; 4960 ext4_grpblk_t bit; 4961 unsigned long long grp_blk_start; 4962 int free = 0; 4963 4964 BUG_ON(pa->pa_deleted == 0); 4965 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 4966 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 4967 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 4968 end = bit + pa->pa_len; 4969 4970 while (bit < end) { 4971 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 4972 if (bit >= end) 4973 break; 4974 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 4975 mb_debug(sb, "free preallocated %u/%u in group %u\n", 4976 (unsigned) ext4_group_first_block_no(sb, group) + bit, 4977 (unsigned) next - bit, (unsigned) group); 4978 free += next - bit; 4979 4980 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 4981 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + 4982 EXT4_C2B(sbi, bit)), 4983 next - bit); 4984 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 4985 bit = next + 1; 4986 } 4987 if (free != pa->pa_free) { 4988 ext4_msg(e4b->bd_sb, KERN_CRIT, 4989 "pa %p: logic %lu, phys. %lu, len %d", 4990 pa, (unsigned long) pa->pa_lstart, 4991 (unsigned long) pa->pa_pstart, 4992 pa->pa_len); 4993 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 4994 free, pa->pa_free); 4995 /* 4996 * pa is already deleted so we use the value obtained 4997 * from the bitmap and continue. 
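 * The on-disk bitmap is what was actually cleared above, so counting
 * "free" from it keeps s_mb_discarded consistent with what was really
 * released even if the PA's own accounting had gone bad.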
4998 */ 4999 } 5000 atomic_add(free, &sbi->s_mb_discarded); 5001 5002 return 0; 5003 } 5004 5005 static noinline_for_stack int 5006 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 5007 struct ext4_prealloc_space *pa) 5008 { 5009 struct super_block *sb = e4b->bd_sb; 5010 ext4_group_t group; 5011 ext4_grpblk_t bit; 5012 5013 trace_ext4_mb_release_group_pa(sb, pa); 5014 BUG_ON(pa->pa_deleted == 0); 5015 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 5016 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 5017 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 5018 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 5019 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 5020 5021 return 0; 5022 } 5023 5024 /* 5025 * releases all preallocations in given group 5026 * 5027 * first, we need to decide discard policy: 5028 * - when do we discard 5029 * 1) ENOSPC 5030 * - how many do we discard 5031 * 1) how many requested 5032 */ 5033 static noinline_for_stack int 5034 ext4_mb_discard_group_preallocations(struct super_block *sb, 5035 ext4_group_t group, int *busy) 5036 { 5037 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 5038 struct buffer_head *bitmap_bh = NULL; 5039 struct ext4_prealloc_space *pa, *tmp; 5040 struct list_head list; 5041 struct ext4_buddy e4b; 5042 struct ext4_inode_info *ei; 5043 int err; 5044 int free = 0; 5045 5046 mb_debug(sb, "discard preallocation for group %u\n", group); 5047 if (list_empty(&grp->bb_prealloc_list)) 5048 goto out_dbg; 5049 5050 bitmap_bh = ext4_read_block_bitmap(sb, group); 5051 if (IS_ERR(bitmap_bh)) { 5052 err = PTR_ERR(bitmap_bh); 5053 ext4_error_err(sb, -err, 5054 "Error %d reading block bitmap for %u", 5055 err, group); 5056 goto out_dbg; 5057 } 5058 5059 err = ext4_mb_load_buddy(sb, group, &e4b); 5060 if (err) { 5061 ext4_warning(sb, "Error %d loading buddy information for %u", 5062 err, group); 5063 put_bh(bitmap_bh); 5064 goto out_dbg; 5065 } 5066 5067 INIT_LIST_HEAD(&list); 5068 ext4_lock_group(sb, group); 5069 list_for_each_entry_safe(pa, tmp, 5070 &grp->bb_prealloc_list, pa_group_list) { 5071 spin_lock(&pa->pa_lock); 5072 if (atomic_read(&pa->pa_count)) { 5073 spin_unlock(&pa->pa_lock); 5074 *busy = 1; 5075 continue; 5076 } 5077 if (pa->pa_deleted) { 5078 spin_unlock(&pa->pa_lock); 5079 continue; 5080 } 5081 5082 /* seems this one can be freed ... */ 5083 ext4_mb_mark_pa_deleted(sb, pa); 5084 5085 if (!free) 5086 this_cpu_inc(discard_pa_seq); 5087 5088 /* we can trust pa_free ... 
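 * here: pa_lock is held, pa_count is zero and pa_deleted has just been
 * set, so nobody can take a new reference or allocate from this PA and
 * pa_free can no longer change under us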
*/ 5089 free += pa->pa_free; 5090 5091 spin_unlock(&pa->pa_lock); 5092 5093 list_del(&pa->pa_group_list); 5094 list_add(&pa->u.pa_tmp_list, &list); 5095 } 5096 5097 /* now free all selected PAs */ 5098 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5099 5100 /* remove from object (inode or locality group) */ 5101 if (pa->pa_type == MB_GROUP_PA) { 5102 spin_lock(pa->pa_node_lock.lg_lock); 5103 list_del_rcu(&pa->pa_node.lg_list); 5104 spin_unlock(pa->pa_node_lock.lg_lock); 5105 } else { 5106 write_lock(pa->pa_node_lock.inode_lock); 5107 ei = EXT4_I(pa->pa_inode); 5108 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5109 write_unlock(pa->pa_node_lock.inode_lock); 5110 } 5111 5112 list_del(&pa->u.pa_tmp_list); 5113 5114 if (pa->pa_type == MB_GROUP_PA) { 5115 ext4_mb_release_group_pa(&e4b, pa); 5116 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5117 } else { 5118 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5119 ext4_mb_pa_free(pa); 5120 } 5121 } 5122 5123 ext4_unlock_group(sb, group); 5124 ext4_mb_unload_buddy(&e4b); 5125 put_bh(bitmap_bh); 5126 out_dbg: 5127 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", 5128 free, group, grp->bb_free); 5129 return free; 5130 } 5131 5132 /* 5133 * releases all non-used preallocated blocks for given inode 5134 * 5135 * It's important to discard preallocations under i_data_sem 5136 * We don't want another block to be served from the prealloc 5137 * space when we are discarding the inode prealloc space. 5138 * 5139 * FIXME!! Make sure it is valid at all the call sites 5140 */ 5141 void ext4_discard_preallocations(struct inode *inode, unsigned int needed) 5142 { 5143 struct ext4_inode_info *ei = EXT4_I(inode); 5144 struct super_block *sb = inode->i_sb; 5145 struct buffer_head *bitmap_bh = NULL; 5146 struct ext4_prealloc_space *pa, *tmp; 5147 ext4_group_t group = 0; 5148 struct list_head list; 5149 struct ext4_buddy e4b; 5150 struct rb_node *iter; 5151 int err; 5152 5153 if (!S_ISREG(inode->i_mode)) { 5154 return; 5155 } 5156 5157 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) 5158 return; 5159 5160 mb_debug(sb, "discard preallocation for inode %lu\n", 5161 inode->i_ino); 5162 trace_ext4_discard_preallocations(inode, 5163 atomic_read(&ei->i_prealloc_active), needed); 5164 5165 INIT_LIST_HEAD(&list); 5166 5167 if (needed == 0) 5168 needed = UINT_MAX; 5169 5170 repeat: 5171 /* first, collect all pa's in the inode */ 5172 write_lock(&ei->i_prealloc_lock); 5173 for (iter = rb_first(&ei->i_prealloc_node); iter && needed; 5174 iter = rb_next(iter)) { 5175 pa = rb_entry(iter, struct ext4_prealloc_space, 5176 pa_node.inode_node); 5177 BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); 5178 5179 spin_lock(&pa->pa_lock); 5180 if (atomic_read(&pa->pa_count)) { 5181 /* this shouldn't happen often - nobody should 5182 * use preallocation while we're discarding it */ 5183 spin_unlock(&pa->pa_lock); 5184 write_unlock(&ei->i_prealloc_lock); 5185 ext4_msg(sb, KERN_ERR, 5186 "uh-oh! 
used pa while discarding"); 5187 WARN_ON(1); 5188 schedule_timeout_uninterruptible(HZ); 5189 goto repeat; 5190 5191 } 5192 if (pa->pa_deleted == 0) { 5193 ext4_mb_mark_pa_deleted(sb, pa); 5194 spin_unlock(&pa->pa_lock); 5195 rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); 5196 list_add(&pa->u.pa_tmp_list, &list); 5197 needed--; 5198 continue; 5199 } 5200 5201 /* someone is deleting pa right now */ 5202 spin_unlock(&pa->pa_lock); 5203 write_unlock(&ei->i_prealloc_lock); 5204 5205 /* we have to wait here because pa_deleted 5206 * doesn't mean pa is already unlinked from 5207 * the list. as we might be called from 5208 * ->clear_inode() the inode will get freed 5209 * and concurrent thread which is unlinking 5210 * pa from inode's list may access already 5211 * freed memory, bad-bad-bad */ 5212 5213 /* XXX: if this happens too often, we can 5214 * add a flag to force wait only in case 5215 * of ->clear_inode(), but not in case of 5216 * regular truncate */ 5217 schedule_timeout_uninterruptible(HZ); 5218 goto repeat; 5219 } 5220 write_unlock(&ei->i_prealloc_lock); 5221 5222 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5223 BUG_ON(pa->pa_type != MB_INODE_PA); 5224 group = ext4_get_group_number(sb, pa->pa_pstart); 5225 5226 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5227 GFP_NOFS|__GFP_NOFAIL); 5228 if (err) { 5229 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5230 err, group); 5231 continue; 5232 } 5233 5234 bitmap_bh = ext4_read_block_bitmap(sb, group); 5235 if (IS_ERR(bitmap_bh)) { 5236 err = PTR_ERR(bitmap_bh); 5237 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", 5238 err, group); 5239 ext4_mb_unload_buddy(&e4b); 5240 continue; 5241 } 5242 5243 ext4_lock_group(sb, group); 5244 list_del(&pa->pa_group_list); 5245 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5246 ext4_unlock_group(sb, group); 5247 5248 ext4_mb_unload_buddy(&e4b); 5249 put_bh(bitmap_bh); 5250 5251 list_del(&pa->u.pa_tmp_list); 5252 ext4_mb_pa_free(pa); 5253 } 5254 } 5255 5256 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) 5257 { 5258 struct ext4_prealloc_space *pa; 5259 5260 BUG_ON(ext4_pspace_cachep == NULL); 5261 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); 5262 if (!pa) 5263 return -ENOMEM; 5264 atomic_set(&pa->pa_count, 1); 5265 ac->ac_pa = pa; 5266 return 0; 5267 } 5268 5269 static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) 5270 { 5271 struct ext4_prealloc_space *pa = ac->ac_pa; 5272 5273 BUG_ON(!pa); 5274 ac->ac_pa = NULL; 5275 WARN_ON(!atomic_dec_and_test(&pa->pa_count)); 5276 /* 5277 * current function is only called due to an error or due to 5278 * len of found blocks < len of requested blocks hence the PA has not 5279 * been added to grp->bb_prealloc_list. 
So we don't need to lock it 5280 */ 5281 pa->pa_deleted = 1; 5282 ext4_mb_pa_free(pa); 5283 } 5284 5285 #ifdef CONFIG_EXT4_DEBUG 5286 static inline void ext4_mb_show_pa(struct super_block *sb) 5287 { 5288 ext4_group_t i, ngroups; 5289 5290 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5291 return; 5292 5293 ngroups = ext4_get_groups_count(sb); 5294 mb_debug(sb, "groups: "); 5295 for (i = 0; i < ngroups; i++) { 5296 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 5297 struct ext4_prealloc_space *pa; 5298 ext4_grpblk_t start; 5299 struct list_head *cur; 5300 ext4_lock_group(sb, i); 5301 list_for_each(cur, &grp->bb_prealloc_list) { 5302 pa = list_entry(cur, struct ext4_prealloc_space, 5303 pa_group_list); 5304 spin_lock(&pa->pa_lock); 5305 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 5306 NULL, &start); 5307 spin_unlock(&pa->pa_lock); 5308 mb_debug(sb, "PA:%u:%d:%d\n", i, start, 5309 pa->pa_len); 5310 } 5311 ext4_unlock_group(sb, i); 5312 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, 5313 grp->bb_fragments); 5314 } 5315 } 5316 5317 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5318 { 5319 struct super_block *sb = ac->ac_sb; 5320 5321 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5322 return; 5323 5324 mb_debug(sb, "Can't allocate:" 5325 " Allocation context details:"); 5326 mb_debug(sb, "status %u flags 0x%x", 5327 ac->ac_status, ac->ac_flags); 5328 mb_debug(sb, "orig %lu/%lu/%lu@%lu, " 5329 "goal %lu/%lu/%lu@%lu, " 5330 "best %lu/%lu/%lu@%lu cr %d", 5331 (unsigned long)ac->ac_o_ex.fe_group, 5332 (unsigned long)ac->ac_o_ex.fe_start, 5333 (unsigned long)ac->ac_o_ex.fe_len, 5334 (unsigned long)ac->ac_o_ex.fe_logical, 5335 (unsigned long)ac->ac_g_ex.fe_group, 5336 (unsigned long)ac->ac_g_ex.fe_start, 5337 (unsigned long)ac->ac_g_ex.fe_len, 5338 (unsigned long)ac->ac_g_ex.fe_logical, 5339 (unsigned long)ac->ac_b_ex.fe_group, 5340 (unsigned long)ac->ac_b_ex.fe_start, 5341 (unsigned long)ac->ac_b_ex.fe_len, 5342 (unsigned long)ac->ac_b_ex.fe_logical, 5343 (int)ac->ac_criteria); 5344 mb_debug(sb, "%u found", ac->ac_found); 5345 ext4_mb_show_pa(sb); 5346 } 5347 #else 5348 static inline void ext4_mb_show_pa(struct super_block *sb) 5349 { 5350 return; 5351 } 5352 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5353 { 5354 ext4_mb_show_pa(ac->ac_sb); 5355 return; 5356 } 5357 #endif 5358 5359 /* 5360 * We use locality group preallocation for small size file. 
The size of the 5361 * file is determined by the current size or the resulting size after 5362 * allocation which ever is larger 5363 * 5364 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 5365 */ 5366 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 5367 { 5368 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5369 int bsbits = ac->ac_sb->s_blocksize_bits; 5370 loff_t size, isize; 5371 bool inode_pa_eligible, group_pa_eligible; 5372 5373 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 5374 return; 5375 5376 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 5377 return; 5378 5379 group_pa_eligible = sbi->s_mb_group_prealloc > 0; 5380 inode_pa_eligible = true; 5381 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 5382 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 5383 >> bsbits; 5384 5385 /* No point in using inode preallocation for closed files */ 5386 if ((size == isize) && !ext4_fs_is_busy(sbi) && 5387 !inode_is_open_for_write(ac->ac_inode)) 5388 inode_pa_eligible = false; 5389 5390 size = max(size, isize); 5391 /* Don't use group allocation for large files */ 5392 if (size > sbi->s_mb_stream_request) 5393 group_pa_eligible = false; 5394 5395 if (!group_pa_eligible) { 5396 if (inode_pa_eligible) 5397 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5398 else 5399 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5400 return; 5401 } 5402 5403 BUG_ON(ac->ac_lg != NULL); 5404 /* 5405 * locality group prealloc space are per cpu. The reason for having 5406 * per cpu locality group is to reduce the contention between block 5407 * request from multiple CPUs. 5408 */ 5409 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); 5410 5411 /* we're going to use group allocation */ 5412 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 5413 5414 /* serialize all allocations in the group */ 5415 mutex_lock(&ac->ac_lg->lg_mutex); 5416 } 5417 5418 static noinline_for_stack void 5419 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 5420 struct ext4_allocation_request *ar) 5421 { 5422 struct super_block *sb = ar->inode->i_sb; 5423 struct ext4_sb_info *sbi = EXT4_SB(sb); 5424 struct ext4_super_block *es = sbi->s_es; 5425 ext4_group_t group; 5426 unsigned int len; 5427 ext4_fsblk_t goal; 5428 ext4_grpblk_t block; 5429 5430 /* we can't allocate > group size */ 5431 len = ar->len; 5432 5433 /* just a dirty hack to filter too big requests */ 5434 if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) 5435 len = EXT4_CLUSTERS_PER_GROUP(sb); 5436 5437 /* start searching from the goal */ 5438 goal = ar->goal; 5439 if (goal < le32_to_cpu(es->s_first_data_block) || 5440 goal >= ext4_blocks_count(es)) 5441 goal = le32_to_cpu(es->s_first_data_block); 5442 ext4_get_group_no_and_offset(sb, goal, &group, &block); 5443 5444 /* set up allocation goals */ 5445 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); 5446 ac->ac_status = AC_STATUS_CONTINUE; 5447 ac->ac_sb = sb; 5448 ac->ac_inode = ar->inode; 5449 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; 5450 ac->ac_o_ex.fe_group = group; 5451 ac->ac_o_ex.fe_start = block; 5452 ac->ac_o_ex.fe_len = len; 5453 ac->ac_g_ex = ac->ac_o_ex; 5454 ac->ac_flags = ar->flags; 5455 5456 /* we have to define context: we'll work with a file or 5457 * locality group. 
this is a policy, actually */ 5458 ext4_mb_group_or_file(ac); 5459 5460 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " 5461 "left: %u/%u, right %u/%u to %swritable\n", 5462 (unsigned) ar->len, (unsigned) ar->logical, 5463 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 5464 (unsigned) ar->lleft, (unsigned) ar->pleft, 5465 (unsigned) ar->lright, (unsigned) ar->pright, 5466 inode_is_open_for_write(ar->inode) ? "" : "non-"); 5467 } 5468 5469 static noinline_for_stack void 5470 ext4_mb_discard_lg_preallocations(struct super_block *sb, 5471 struct ext4_locality_group *lg, 5472 int order, int total_entries) 5473 { 5474 ext4_group_t group = 0; 5475 struct ext4_buddy e4b; 5476 struct list_head discard_list; 5477 struct ext4_prealloc_space *pa, *tmp; 5478 5479 mb_debug(sb, "discard locality group preallocation\n"); 5480 5481 INIT_LIST_HEAD(&discard_list); 5482 5483 spin_lock(&lg->lg_prealloc_lock); 5484 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 5485 pa_node.lg_list, 5486 lockdep_is_held(&lg->lg_prealloc_lock)) { 5487 spin_lock(&pa->pa_lock); 5488 if (atomic_read(&pa->pa_count)) { 5489 /* 5490 * This is the pa that we just used 5491 * for block allocation. So don't 5492 * free that 5493 */ 5494 spin_unlock(&pa->pa_lock); 5495 continue; 5496 } 5497 if (pa->pa_deleted) { 5498 spin_unlock(&pa->pa_lock); 5499 continue; 5500 } 5501 /* only lg prealloc space */ 5502 BUG_ON(pa->pa_type != MB_GROUP_PA); 5503 5504 /* seems this one can be freed ... */ 5505 ext4_mb_mark_pa_deleted(sb, pa); 5506 spin_unlock(&pa->pa_lock); 5507 5508 list_del_rcu(&pa->pa_node.lg_list); 5509 list_add(&pa->u.pa_tmp_list, &discard_list); 5510 5511 total_entries--; 5512 if (total_entries <= 5) { 5513 /* 5514 * we want to keep only 5 entries 5515 * allowing it to grow to 8. This 5516 * mak sure we don't call discard 5517 * soon for this list. 5518 */ 5519 break; 5520 } 5521 } 5522 spin_unlock(&lg->lg_prealloc_lock); 5523 5524 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 5525 int err; 5526 5527 group = ext4_get_group_number(sb, pa->pa_pstart); 5528 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5529 GFP_NOFS|__GFP_NOFAIL); 5530 if (err) { 5531 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5532 err, group); 5533 continue; 5534 } 5535 ext4_lock_group(sb, group); 5536 list_del(&pa->pa_group_list); 5537 ext4_mb_release_group_pa(&e4b, pa); 5538 ext4_unlock_group(sb, group); 5539 5540 ext4_mb_unload_buddy(&e4b); 5541 list_del(&pa->u.pa_tmp_list); 5542 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5543 } 5544 } 5545 5546 /* 5547 * We have incremented pa_count. So it cannot be freed at this 5548 * point. Also we hold lg_mutex. So no parallel allocation is 5549 * possible from this lg. That means pa_free cannot be updated. 5550 * 5551 * A parallel ext4_mb_discard_group_preallocations is possible. 5552 * which can cause the lg_prealloc_list to be updated. 
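 *
 * Roughly, ext4_mb_add_n_trim() below keeps each lg_prealloc_list[order]
 * ordered by pa_free (smallest first) and bounded in size: once a list
 * grows beyond 8 entries, ext4_mb_discard_lg_preallocations() is called
 * to shrink it back to about 5.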
5553 */
5554
5555 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5556 {
5557 int order, added = 0, lg_prealloc_count = 1;
5558 struct super_block *sb = ac->ac_sb;
5559 struct ext4_locality_group *lg = ac->ac_lg;
5560 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5561
5562 order = fls(pa->pa_free) - 1;
5563 if (order > PREALLOC_TB_SIZE - 1)
5564 /* The max size of hash table is PREALLOC_TB_SIZE */
5565 order = PREALLOC_TB_SIZE - 1;
5566 /* Add the prealloc space to lg */
5567 spin_lock(&lg->lg_prealloc_lock);
5568 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5569 pa_node.lg_list,
5570 lockdep_is_held(&lg->lg_prealloc_lock)) {
5571 spin_lock(&tmp_pa->pa_lock);
5572 if (tmp_pa->pa_deleted) {
5573 spin_unlock(&tmp_pa->pa_lock);
5574 continue;
5575 }
5576 if (!added && pa->pa_free < tmp_pa->pa_free) {
5577 /* Add to the tail of the previous entry */
5578 list_add_tail_rcu(&pa->pa_node.lg_list,
5579 &tmp_pa->pa_node.lg_list);
5580 added = 1;
5581 /*
5582 * we want to count the total
5583 * number of entries in the list
5584 */
5585 }
5586 spin_unlock(&tmp_pa->pa_lock);
5587 lg_prealloc_count++;
5588 }
5589 if (!added)
5590 list_add_tail_rcu(&pa->pa_node.lg_list,
5591 &lg->lg_prealloc_list[order]);
5592 spin_unlock(&lg->lg_prealloc_lock);
5593
5594 /* Now trim the list to no more than 8 elements */
5595 if (lg_prealloc_count > 8) {
5596 ext4_mb_discard_lg_preallocations(sb, lg,
5597 order, lg_prealloc_count);
5598 return;
5599 }
5600 return;
5601 }
5602
5603 /*
5604 * release all resources we used in allocation
5605 */
5606 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5607 {
5608 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
5609 struct ext4_prealloc_space *pa = ac->ac_pa;
5610 if (pa) {
5611 if (pa->pa_type == MB_GROUP_PA) {
5612 /* see comment in ext4_mb_use_group_pa() */
5613 spin_lock(&pa->pa_lock);
5614 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5615 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
5616 pa->pa_free -= ac->ac_b_ex.fe_len;
5617 pa->pa_len -= ac->ac_b_ex.fe_len;
5618 spin_unlock(&pa->pa_lock);
5619
5620 /*
5621 * We want to add the pa to the right bucket.
5622 * Remove it from the list and while adding
5623 * make sure the list to which we are adding
5624 * doesn't grow big.
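 * For example, if this pa started with 128 free clusters (order-7 list,
 * since fls(128) - 1 == 7) and 16 clusters were just used, pa_free drops
 * to 112 and fls(112) - 1 == 6, so ext4_mb_add_n_trim() below may move
 * it to the order-6 list.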
5625 */ 5626 if (likely(pa->pa_free)) { 5627 spin_lock(pa->pa_node_lock.lg_lock); 5628 list_del_rcu(&pa->pa_node.lg_list); 5629 spin_unlock(pa->pa_node_lock.lg_lock); 5630 ext4_mb_add_n_trim(ac); 5631 } 5632 } 5633 5634 ext4_mb_put_pa(ac, ac->ac_sb, pa); 5635 } 5636 if (ac->ac_bitmap_page) 5637 put_page(ac->ac_bitmap_page); 5638 if (ac->ac_buddy_page) 5639 put_page(ac->ac_buddy_page); 5640 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 5641 mutex_unlock(&ac->ac_lg->lg_mutex); 5642 ext4_mb_collect_stats(ac); 5643 return 0; 5644 } 5645 5646 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 5647 { 5648 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 5649 int ret; 5650 int freed = 0, busy = 0; 5651 int retry = 0; 5652 5653 trace_ext4_mb_discard_preallocations(sb, needed); 5654 5655 if (needed == 0) 5656 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 5657 repeat: 5658 for (i = 0; i < ngroups && needed > 0; i++) { 5659 ret = ext4_mb_discard_group_preallocations(sb, i, &busy); 5660 freed += ret; 5661 needed -= ret; 5662 cond_resched(); 5663 } 5664 5665 if (needed > 0 && busy && ++retry < 3) { 5666 busy = 0; 5667 goto repeat; 5668 } 5669 5670 return freed; 5671 } 5672 5673 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, 5674 struct ext4_allocation_context *ac, u64 *seq) 5675 { 5676 int freed; 5677 u64 seq_retry = 0; 5678 bool ret = false; 5679 5680 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 5681 if (freed) { 5682 ret = true; 5683 goto out_dbg; 5684 } 5685 seq_retry = ext4_get_discard_pa_seq_sum(); 5686 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { 5687 ac->ac_flags |= EXT4_MB_STRICT_CHECK; 5688 *seq = seq_retry; 5689 ret = true; 5690 } 5691 5692 out_dbg: 5693 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); 5694 return ret; 5695 } 5696 5697 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5698 struct ext4_allocation_request *ar, int *errp); 5699 5700 /* 5701 * Main entry point into mballoc to allocate blocks 5702 * it tries to use preallocation first, then falls back 5703 * to usual allocation 5704 */ 5705 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 5706 struct ext4_allocation_request *ar, int *errp) 5707 { 5708 struct ext4_allocation_context *ac = NULL; 5709 struct ext4_sb_info *sbi; 5710 struct super_block *sb; 5711 ext4_fsblk_t block = 0; 5712 unsigned int inquota = 0; 5713 unsigned int reserv_clstrs = 0; 5714 int retries = 0; 5715 u64 seq; 5716 5717 might_sleep(); 5718 sb = ar->inode->i_sb; 5719 sbi = EXT4_SB(sb); 5720 5721 trace_ext4_request_blocks(ar); 5722 if (sbi->s_mount_state & EXT4_FC_REPLAY) 5723 return ext4_mb_new_blocks_simple(handle, ar, errp); 5724 5725 /* Allow to use superuser reservation for quota file */ 5726 if (ext4_is_quota_file(ar->inode)) 5727 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 5728 5729 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { 5730 /* Without delayed allocation we need to verify 5731 * there is enough free blocks to do block allocation 5732 * and verify allocation doesn't exceed the quota limits. 
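 * The loop below keeps halving the request (e.g. 64 -> 32 -> 16 clusters)
 * until ext4_claim_free_clusters() succeeds; if the length reaches zero we
 * give up and return -ENOSPC. A similar retry is done against the quota
 * limit in the dquot_alloc_block() loop further down, which shrinks the
 * request one cluster at a time.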
5733 */ 5734 while (ar->len && 5735 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { 5736 5737 /* let others to free the space */ 5738 cond_resched(); 5739 ar->len = ar->len >> 1; 5740 } 5741 if (!ar->len) { 5742 ext4_mb_show_pa(sb); 5743 *errp = -ENOSPC; 5744 return 0; 5745 } 5746 reserv_clstrs = ar->len; 5747 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 5748 dquot_alloc_block_nofail(ar->inode, 5749 EXT4_C2B(sbi, ar->len)); 5750 } else { 5751 while (ar->len && 5752 dquot_alloc_block(ar->inode, 5753 EXT4_C2B(sbi, ar->len))) { 5754 5755 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 5756 ar->len--; 5757 } 5758 } 5759 inquota = ar->len; 5760 if (ar->len == 0) { 5761 *errp = -EDQUOT; 5762 goto out; 5763 } 5764 } 5765 5766 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); 5767 if (!ac) { 5768 ar->len = 0; 5769 *errp = -ENOMEM; 5770 goto out; 5771 } 5772 5773 ext4_mb_initialize_context(ac, ar); 5774 5775 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 5776 seq = this_cpu_read(discard_pa_seq); 5777 if (!ext4_mb_use_preallocated(ac)) { 5778 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 5779 ext4_mb_normalize_request(ac, ar); 5780 5781 *errp = ext4_mb_pa_alloc(ac); 5782 if (*errp) 5783 goto errout; 5784 repeat: 5785 /* allocate space in core */ 5786 *errp = ext4_mb_regular_allocator(ac); 5787 /* 5788 * pa allocated above is added to grp->bb_prealloc_list only 5789 * when we were able to allocate some block i.e. when 5790 * ac->ac_status == AC_STATUS_FOUND. 5791 * And error from above mean ac->ac_status != AC_STATUS_FOUND 5792 * So we have to free this pa here itself. 5793 */ 5794 if (*errp) { 5795 ext4_mb_pa_put_free(ac); 5796 ext4_discard_allocated_blocks(ac); 5797 goto errout; 5798 } 5799 if (ac->ac_status == AC_STATUS_FOUND && 5800 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) 5801 ext4_mb_pa_put_free(ac); 5802 } 5803 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 5804 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 5805 if (*errp) { 5806 ext4_discard_allocated_blocks(ac); 5807 goto errout; 5808 } else { 5809 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 5810 ar->len = ac->ac_b_ex.fe_len; 5811 } 5812 } else { 5813 if (++retries < 3 && 5814 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) 5815 goto repeat; 5816 /* 5817 * If block allocation fails then the pa allocated above 5818 * needs to be freed here itself. 5819 */ 5820 ext4_mb_pa_put_free(ac); 5821 *errp = -ENOSPC; 5822 } 5823 5824 if (*errp) { 5825 errout: 5826 ac->ac_b_ex.fe_len = 0; 5827 ar->len = 0; 5828 ext4_mb_show_ac(ac); 5829 } 5830 ext4_mb_release_context(ac); 5831 kmem_cache_free(ext4_ac_cachep, ac); 5832 out: 5833 if (inquota && ar->len < inquota) 5834 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 5835 if (!ar->len) { 5836 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) 5837 /* release all the reserved blocks if non delalloc */ 5838 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 5839 reserv_clstrs); 5840 } 5841 5842 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 5843 5844 return block; 5845 } 5846 5847 /* 5848 * We can merge two free data extents only if the physical blocks 5849 * are contiguous, AND the extents were freed by the same transaction, 5850 * AND the blocks are associated with the same group. 
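 * For example, an entry for clusters 100-149 and a new entry for clusters
 * 150-199 freed in the same transaction and group are collapsed into one
 * 100-199 entry below; the absorbed node is unlinked from the rb-tree and
 * freed.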
5851 */
5852 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
5853 struct ext4_free_data *entry,
5854 struct ext4_free_data *new_entry,
5855 struct rb_root *entry_rb_root)
5856 {
5857 if ((entry->efd_tid != new_entry->efd_tid) ||
5858 (entry->efd_group != new_entry->efd_group))
5859 return;
5860 if (entry->efd_start_cluster + entry->efd_count ==
5861 new_entry->efd_start_cluster) {
5862 new_entry->efd_start_cluster = entry->efd_start_cluster;
5863 new_entry->efd_count += entry->efd_count;
5864 } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
5865 entry->efd_start_cluster) {
5866 new_entry->efd_count += entry->efd_count;
5867 } else
5868 return;
5869 spin_lock(&sbi->s_md_lock);
5870 list_del(&entry->efd_list);
5871 spin_unlock(&sbi->s_md_lock);
5872 rb_erase(&entry->efd_node, entry_rb_root);
5873 kmem_cache_free(ext4_free_data_cachep, entry);
5874 }
5875
5876 static noinline_for_stack void
5877 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
5878 struct ext4_free_data *new_entry)
5879 {
5880 ext4_group_t group = e4b->bd_group;
5881 ext4_grpblk_t cluster;
5882 ext4_grpblk_t clusters = new_entry->efd_count;
5883 struct ext4_free_data *entry;
5884 struct ext4_group_info *db = e4b->bd_info;
5885 struct super_block *sb = e4b->bd_sb;
5886 struct ext4_sb_info *sbi = EXT4_SB(sb);
5887 struct rb_node **n = &db->bb_free_root.rb_node, *node;
5888 struct rb_node *parent = NULL, *new_node;
5889
5890 BUG_ON(!ext4_handle_valid(handle));
5891 BUG_ON(e4b->bd_bitmap_page == NULL);
5892 BUG_ON(e4b->bd_buddy_page == NULL);
5893
5894 new_node = &new_entry->efd_node;
5895 cluster = new_entry->efd_start_cluster;
5896
5897 if (!*n) {
5898 /* first free block extent. We need to
5899 * protect the buddy cache from being freed,
5900 * otherwise we'll refresh it from the
5901 * on-disk bitmap and lose not-yet-available
5902 * blocks */
5903 get_page(e4b->bd_buddy_page);
5904 get_page(e4b->bd_bitmap_page);
5905 }
5906 while (*n) {
5907 parent = *n;
5908 entry = rb_entry(parent, struct ext4_free_data, efd_node);
5909 if (cluster < entry->efd_start_cluster)
5910 n = &(*n)->rb_left;
5911 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
5912 n = &(*n)->rb_right;
5913 else {
5914 ext4_grp_locked_error(sb, group, 0,
5915 ext4_group_first_block_no(sb, group) +
5916 EXT4_C2B(sbi, cluster),
5917 "Block already on to-be-freed list");
5918 kmem_cache_free(ext4_free_data_cachep, new_entry);
5919 return;
5920 }
5921 }
5922
5923 rb_link_node(new_node, parent, n);
5924 rb_insert_color(new_node, &db->bb_free_root);
5925
5926 /* Now see if the extent can be merged to the left and right */
5927 node = rb_prev(new_node);
5928 if (node) {
5929 entry = rb_entry(node, struct ext4_free_data, efd_node);
5930 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5931 &(db->bb_free_root));
5932 }
5933
5934 node = rb_next(new_node);
5935 if (node) {
5936 entry = rb_entry(node, struct ext4_free_data, efd_node);
5937 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5938 &(db->bb_free_root));
5939 }
5940
5941 spin_lock(&sbi->s_md_lock);
5942 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
5943 sbi->s_mb_free_pending += clusters;
5944 spin_unlock(&sbi->s_md_lock);
5945 }
5946
5947 /*
5948 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5949 * linearly starting at the goal block and also excludes the blocks which
5950 * are going to be in use after fast commit replay.
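 * Note that this path hands out a single cluster per call (ar->len is set
 * to 1 on success), which is sufficient for the fast commit replay callers.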
5951 */ 5952 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5953 struct ext4_allocation_request *ar, int *errp) 5954 { 5955 struct buffer_head *bitmap_bh; 5956 struct super_block *sb = ar->inode->i_sb; 5957 ext4_group_t group; 5958 ext4_grpblk_t blkoff; 5959 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 5960 ext4_grpblk_t i = 0; 5961 ext4_fsblk_t goal, block; 5962 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5963 5964 goal = ar->goal; 5965 if (goal < le32_to_cpu(es->s_first_data_block) || 5966 goal >= ext4_blocks_count(es)) 5967 goal = le32_to_cpu(es->s_first_data_block); 5968 5969 ar->len = 0; 5970 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); 5971 for (; group < ext4_get_groups_count(sb); group++) { 5972 bitmap_bh = ext4_read_block_bitmap(sb, group); 5973 if (IS_ERR(bitmap_bh)) { 5974 *errp = PTR_ERR(bitmap_bh); 5975 pr_warn("Failed to read block bitmap\n"); 5976 return 0; 5977 } 5978 5979 while (1) { 5980 i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 5981 blkoff); 5982 if (i >= max) 5983 break; 5984 if (ext4_fc_replay_check_excluded(sb, 5985 ext4_group_first_block_no(sb, group) + i)) { 5986 blkoff = i + 1; 5987 } else 5988 break; 5989 } 5990 brelse(bitmap_bh); 5991 if (i < max) 5992 break; 5993 5994 blkoff = 0; 5995 } 5996 5997 if (group >= ext4_get_groups_count(sb) || i >= max) { 5998 *errp = -ENOSPC; 5999 return 0; 6000 } 6001 6002 block = ext4_group_first_block_no(sb, group) + i; 6003 ext4_mb_mark_bb(sb, block, 1, 1); 6004 ar->len = 1; 6005 6006 return block; 6007 } 6008 6009 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, 6010 unsigned long count) 6011 { 6012 struct buffer_head *bitmap_bh; 6013 struct super_block *sb = inode->i_sb; 6014 struct ext4_group_desc *gdp; 6015 struct buffer_head *gdp_bh; 6016 ext4_group_t group; 6017 ext4_grpblk_t blkoff; 6018 int already_freed = 0, err, i; 6019 6020 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 6021 bitmap_bh = ext4_read_block_bitmap(sb, group); 6022 if (IS_ERR(bitmap_bh)) { 6023 pr_warn("Failed to read block bitmap\n"); 6024 return; 6025 } 6026 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 6027 if (!gdp) 6028 goto err_out; 6029 6030 for (i = 0; i < count; i++) { 6031 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) 6032 already_freed++; 6033 } 6034 mb_clear_bits(bitmap_bh->b_data, blkoff, count); 6035 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 6036 if (err) 6037 goto err_out; 6038 ext4_free_group_clusters_set( 6039 sb, gdp, ext4_free_group_clusters(sb, gdp) + 6040 count - already_freed); 6041 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6042 ext4_group_desc_csum_set(sb, group, gdp); 6043 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 6044 sync_dirty_buffer(bitmap_bh); 6045 sync_dirty_buffer(gdp_bh); 6046 6047 err_out: 6048 brelse(bitmap_bh); 6049 } 6050 6051 /** 6052 * ext4_mb_clear_bb() -- helper function for freeing blocks. 
6053 * Used by ext4_free_blocks() 6054 * @handle: handle for this transaction 6055 * @inode: inode 6056 * @block: starting physical block to be freed 6057 * @count: number of blocks to be freed 6058 * @flags: flags used by ext4_free_blocks 6059 */ 6060 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, 6061 ext4_fsblk_t block, unsigned long count, 6062 int flags) 6063 { 6064 struct buffer_head *bitmap_bh = NULL; 6065 struct super_block *sb = inode->i_sb; 6066 struct ext4_group_desc *gdp; 6067 unsigned int overflow; 6068 ext4_grpblk_t bit; 6069 struct buffer_head *gd_bh; 6070 ext4_group_t block_group; 6071 struct ext4_sb_info *sbi; 6072 struct ext4_buddy e4b; 6073 unsigned int count_clusters; 6074 int err = 0; 6075 int ret; 6076 6077 sbi = EXT4_SB(sb); 6078 6079 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6080 !ext4_inode_block_valid(inode, block, count)) { 6081 ext4_error(sb, "Freeing blocks in system zone - " 6082 "Block = %llu, count = %lu", block, count); 6083 /* err = 0. ext4_std_error should be a no op */ 6084 goto error_return; 6085 } 6086 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6087 6088 do_more: 6089 overflow = 0; 6090 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6091 6092 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( 6093 ext4_get_group_info(sb, block_group)))) 6094 return; 6095 6096 /* 6097 * Check to see if we are freeing blocks across a group 6098 * boundary. 6099 */ 6100 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 6101 overflow = EXT4_C2B(sbi, bit) + count - 6102 EXT4_BLOCKS_PER_GROUP(sb); 6103 count -= overflow; 6104 /* The range changed so it's no longer validated */ 6105 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6106 } 6107 count_clusters = EXT4_NUM_B2C(sbi, count); 6108 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6109 if (IS_ERR(bitmap_bh)) { 6110 err = PTR_ERR(bitmap_bh); 6111 bitmap_bh = NULL; 6112 goto error_return; 6113 } 6114 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 6115 if (!gdp) { 6116 err = -EIO; 6117 goto error_return; 6118 } 6119 6120 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6121 !ext4_inode_block_valid(inode, block, count)) { 6122 ext4_error(sb, "Freeing blocks in system zone - " 6123 "Block = %llu, count = %lu", block, count); 6124 /* err = 0. ext4_std_error should be a no op */ 6125 goto error_return; 6126 } 6127 6128 BUFFER_TRACE(bitmap_bh, "getting write access"); 6129 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6130 EXT4_JTR_NONE); 6131 if (err) 6132 goto error_return; 6133 6134 /* 6135 * We are about to modify some metadata. Call the journal APIs 6136 * to unshare ->b_data if a currently-committing transaction is 6137 * using it 6138 */ 6139 BUFFER_TRACE(gd_bh, "get_write_access"); 6140 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6141 if (err) 6142 goto error_return; 6143 #ifdef AGGRESSIVE_CHECK 6144 { 6145 int i; 6146 for (i = 0; i < count_clusters; i++) 6147 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 6148 } 6149 #endif 6150 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 6151 6152 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 6153 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, 6154 GFP_NOFS|__GFP_NOFAIL); 6155 if (err) 6156 goto error_return; 6157 6158 /* 6159 * We need to make sure we don't reuse the freed block until after the 6160 * transaction is committed. 
We make an exception if the inode is to be 6161 * written in writeback mode since writeback mode has weak data 6162 * consistency guarantees. 6163 */ 6164 if (ext4_handle_valid(handle) && 6165 ((flags & EXT4_FREE_BLOCKS_METADATA) || 6166 !ext4_should_writeback_data(inode))) { 6167 struct ext4_free_data *new_entry; 6168 /* 6169 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 6170 * to fail. 6171 */ 6172 new_entry = kmem_cache_alloc(ext4_free_data_cachep, 6173 GFP_NOFS|__GFP_NOFAIL); 6174 new_entry->efd_start_cluster = bit; 6175 new_entry->efd_group = block_group; 6176 new_entry->efd_count = count_clusters; 6177 new_entry->efd_tid = handle->h_transaction->t_tid; 6178 6179 ext4_lock_group(sb, block_group); 6180 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6181 ext4_mb_free_metadata(handle, &e4b, new_entry); 6182 } else { 6183 /* need to update group_info->bb_free and bitmap 6184 * with group lock held. generate_buddy look at 6185 * them with group lock_held 6186 */ 6187 if (test_opt(sb, DISCARD)) { 6188 err = ext4_issue_discard(sb, block_group, bit, count, 6189 NULL); 6190 if (err && err != -EOPNOTSUPP) 6191 ext4_msg(sb, KERN_WARNING, "discard request in" 6192 " group:%u block:%d count:%lu failed" 6193 " with %d", block_group, bit, count, 6194 err); 6195 } else 6196 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 6197 6198 ext4_lock_group(sb, block_group); 6199 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6200 mb_free_blocks(inode, &e4b, bit, count_clusters); 6201 } 6202 6203 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 6204 ext4_free_group_clusters_set(sb, gdp, ret); 6205 ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6206 ext4_group_desc_csum_set(sb, block_group, gdp); 6207 ext4_unlock_group(sb, block_group); 6208 6209 if (sbi->s_log_groups_per_flex) { 6210 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6211 atomic64_add(count_clusters, 6212 &sbi_array_rcu_deref(sbi, s_flex_groups, 6213 flex_group)->free_clusters); 6214 } 6215 6216 /* 6217 * on a bigalloc file system, defer the s_freeclusters_counter 6218 * update to the caller (ext4_remove_space and friends) so they 6219 * can determine if a cluster freed here should be rereserved 6220 */ 6221 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { 6222 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 6223 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 6224 percpu_counter_add(&sbi->s_freeclusters_counter, 6225 count_clusters); 6226 } 6227 6228 ext4_mb_unload_buddy(&e4b); 6229 6230 /* We dirtied the bitmap block */ 6231 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6232 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6233 6234 /* And the group descriptor block */ 6235 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6236 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6237 if (!err) 6238 err = ret; 6239 6240 if (overflow && !err) { 6241 block += count; 6242 count = overflow; 6243 put_bh(bitmap_bh); 6244 /* The range changed so it's no longer validated */ 6245 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6246 goto do_more; 6247 } 6248 error_return: 6249 brelse(bitmap_bh); 6250 ext4_std_error(sb, err); 6251 return; 6252 } 6253 6254 /** 6255 * ext4_free_blocks() -- Free given blocks and update quota 6256 * @handle: handle for this transaction 6257 * @inode: inode 6258 * @bh: optional buffer of the block to be freed 6259 * @block: starting physical block to be freed 6260 * @count: number of blocks to be freed 6261 * @flags: flags used by ext4_free_blocks 6262 */ 
6263 void ext4_free_blocks(handle_t *handle, struct inode *inode, 6264 struct buffer_head *bh, ext4_fsblk_t block, 6265 unsigned long count, int flags) 6266 { 6267 struct super_block *sb = inode->i_sb; 6268 unsigned int overflow; 6269 struct ext4_sb_info *sbi; 6270 6271 sbi = EXT4_SB(sb); 6272 6273 if (sbi->s_mount_state & EXT4_FC_REPLAY) { 6274 ext4_free_blocks_simple(inode, block, count); 6275 return; 6276 } 6277 6278 might_sleep(); 6279 if (bh) { 6280 if (block) 6281 BUG_ON(block != bh->b_blocknr); 6282 else 6283 block = bh->b_blocknr; 6284 } 6285 6286 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6287 !ext4_inode_block_valid(inode, block, count)) { 6288 ext4_error(sb, "Freeing blocks not in datazone - " 6289 "block = %llu, count = %lu", block, count); 6290 return; 6291 } 6292 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6293 6294 ext4_debug("freeing block %llu\n", block); 6295 trace_ext4_free_blocks(inode, block, count, flags); 6296 6297 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6298 BUG_ON(count > 1); 6299 6300 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 6301 inode, bh, block); 6302 } 6303 6304 /* 6305 * If the extent to be freed does not begin on a cluster 6306 * boundary, we need to deal with partial clusters at the 6307 * beginning and end of the extent. Normally we will free 6308 * blocks at the beginning or the end unless we are explicitly 6309 * requested to avoid doing so. 6310 */ 6311 overflow = EXT4_PBLK_COFF(sbi, block); 6312 if (overflow) { 6313 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { 6314 overflow = sbi->s_cluster_ratio - overflow; 6315 block += overflow; 6316 if (count > overflow) 6317 count -= overflow; 6318 else 6319 return; 6320 } else { 6321 block -= overflow; 6322 count += overflow; 6323 } 6324 /* The range changed so it's no longer validated */ 6325 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6326 } 6327 overflow = EXT4_LBLK_COFF(sbi, count); 6328 if (overflow) { 6329 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { 6330 if (count > overflow) 6331 count -= overflow; 6332 else 6333 return; 6334 } else 6335 count += sbi->s_cluster_ratio - overflow; 6336 /* The range changed so it's no longer validated */ 6337 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6338 } 6339 6340 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6341 int i; 6342 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; 6343 6344 for (i = 0; i < count; i++) { 6345 cond_resched(); 6346 if (is_metadata) 6347 bh = sb_find_get_block(inode->i_sb, block + i); 6348 ext4_forget(handle, is_metadata, inode, bh, block + i); 6349 } 6350 } 6351 6352 ext4_mb_clear_bb(handle, inode, block, count, flags); 6353 return; 6354 } 6355 6356 /** 6357 * ext4_group_add_blocks() -- Add given blocks to an existing group 6358 * @handle: handle to this transaction 6359 * @sb: super block 6360 * @block: start physical block to add to the block group 6361 * @count: number of blocks to free 6362 * 6363 * This marks the blocks as free in the bitmap and buddy. 
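 *
 * For example, when the last group is grown by online resize, the newly
 * added range is returned to the free pool through this helper. Note that
 * the accounting is done in clusters: with a cluster ratio of 16, an
 * aligned range of 64 blocks covers 64 / 16 = 4 clusters (cluster_count
 * below).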
6364 */ 6365 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 6366 ext4_fsblk_t block, unsigned long count) 6367 { 6368 struct buffer_head *bitmap_bh = NULL; 6369 struct buffer_head *gd_bh; 6370 ext4_group_t block_group; 6371 ext4_grpblk_t bit; 6372 unsigned int i; 6373 struct ext4_group_desc *desc; 6374 struct ext4_sb_info *sbi = EXT4_SB(sb); 6375 struct ext4_buddy e4b; 6376 int err = 0, ret, free_clusters_count; 6377 ext4_grpblk_t clusters_freed; 6378 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); 6379 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); 6380 unsigned long cluster_count = last_cluster - first_cluster + 1; 6381 6382 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 6383 6384 if (count == 0) 6385 return 0; 6386 6387 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6388 /* 6389 * Check to see if we are freeing blocks across a group 6390 * boundary. 6391 */ 6392 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { 6393 ext4_warning(sb, "too many blocks added to group %u", 6394 block_group); 6395 err = -EINVAL; 6396 goto error_return; 6397 } 6398 6399 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6400 if (IS_ERR(bitmap_bh)) { 6401 err = PTR_ERR(bitmap_bh); 6402 bitmap_bh = NULL; 6403 goto error_return; 6404 } 6405 6406 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 6407 if (!desc) { 6408 err = -EIO; 6409 goto error_return; 6410 } 6411 6412 if (!ext4_sb_block_valid(sb, NULL, block, count)) { 6413 ext4_error(sb, "Adding blocks in system zones - " 6414 "Block = %llu, count = %lu", 6415 block, count); 6416 err = -EINVAL; 6417 goto error_return; 6418 } 6419 6420 BUFFER_TRACE(bitmap_bh, "getting write access"); 6421 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6422 EXT4_JTR_NONE); 6423 if (err) 6424 goto error_return; 6425 6426 /* 6427 * We are about to modify some metadata. Call the journal APIs 6428 * to unshare ->b_data if a currently-committing transaction is 6429 * using it 6430 */ 6431 BUFFER_TRACE(gd_bh, "get_write_access"); 6432 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6433 if (err) 6434 goto error_return; 6435 6436 for (i = 0, clusters_freed = 0; i < cluster_count; i++) { 6437 BUFFER_TRACE(bitmap_bh, "clear bit"); 6438 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 6439 ext4_error(sb, "bit already cleared for block %llu", 6440 (ext4_fsblk_t)(block + i)); 6441 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 6442 } else { 6443 clusters_freed++; 6444 } 6445 } 6446 6447 err = ext4_mb_load_buddy(sb, block_group, &e4b); 6448 if (err) 6449 goto error_return; 6450 6451 /* 6452 * need to update group_info->bb_free and bitmap 6453 * with group lock held. 
generate_buddy looks at
6454 * them with the group lock held
6455 */
6456 ext4_lock_group(sb, block_group);
6457 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
6458 mb_free_blocks(NULL, &e4b, bit, cluster_count);
6459 free_clusters_count = clusters_freed +
6460 ext4_free_group_clusters(sb, desc);
6461 ext4_free_group_clusters_set(sb, desc, free_clusters_count);
6462 ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
6463 ext4_group_desc_csum_set(sb, block_group, desc);
6464 ext4_unlock_group(sb, block_group);
6465 percpu_counter_add(&sbi->s_freeclusters_counter,
6466 clusters_freed);
6467
6468 if (sbi->s_log_groups_per_flex) {
6469 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
6470 atomic64_add(clusters_freed,
6471 &sbi_array_rcu_deref(sbi, s_flex_groups,
6472 flex_group)->free_clusters);
6473 }
6474
6475 ext4_mb_unload_buddy(&e4b);
6476
6477 /* We dirtied the bitmap block */
6478 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
6479 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
6480
6481 /* And the group descriptor block */
6482 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
6483 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
6484 if (!err)
6485 err = ret;
6486
6487 error_return:
6488 brelse(bitmap_bh);
6489 ext4_std_error(sb, err);
6490 return err;
6491 }
6492
6493 /**
6494 * ext4_trim_extent -- function to TRIM one single free extent in the group
6495 * @sb: super block for the file system
6496 * @start: starting block of the free extent in the alloc. group
6497 * @count: number of blocks to TRIM
6498 * @e4b: ext4 buddy for the group
6499 *
6500 * Trim "count" blocks starting at "start" in the "group". To ensure that no
6501 * one will allocate those blocks, mark them as used in the buddy bitmap. This
6502 * must be called under the group lock.
6503 */
6504 static int ext4_trim_extent(struct super_block *sb,
6505 int start, int count, struct ext4_buddy *e4b)
6506 __releases(bitlock)
6507 __acquires(bitlock)
6508 {
6509 struct ext4_free_extent ex;
6510 ext4_group_t group = e4b->bd_group;
6511 int ret = 0;
6512
6513 trace_ext4_trim_extent(sb, group, start, count);
6514
6515 assert_spin_locked(ext4_group_lock_ptr(sb, group));
6516
6517 ex.fe_start = start;
6518 ex.fe_group = group;
6519 ex.fe_len = count;
6520
6521 /*
6522 * Mark blocks used, so no one can reuse them while
6523 * being trimmed.
6524 */
6525 mb_mark_used(e4b, &ex);
6526 ext4_unlock_group(sb, group);
6527 ret = ext4_issue_discard(sb, group, start, count, NULL);
6528 ext4_lock_group(sb, group);
6529 mb_free_blocks(NULL, e4b, start, ex.fe_len);
6530 return ret;
6531 }
6532
6533 static int ext4_try_to_trim_range(struct super_block *sb,
6534 struct ext4_buddy *e4b, ext4_grpblk_t start,
6535 ext4_grpblk_t max, ext4_grpblk_t minblocks)
6536 __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
6537 __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
6538 {
6539 ext4_grpblk_t next, count, free_count;
6540 void *bitmap;
6541
6542 bitmap = e4b->bd_bitmap;
6543 start = (e4b->bd_info->bb_first_free > start) ?
6544 e4b->bd_info->bb_first_free : start; 6545 count = 0; 6546 free_count = 0; 6547 6548 while (start <= max) { 6549 start = mb_find_next_zero_bit(bitmap, max + 1, start); 6550 if (start > max) 6551 break; 6552 next = mb_find_next_bit(bitmap, max + 1, start); 6553 6554 if ((next - start) >= minblocks) { 6555 int ret = ext4_trim_extent(sb, start, next - start, e4b); 6556 6557 if (ret && ret != -EOPNOTSUPP) 6558 break; 6559 count += next - start; 6560 } 6561 free_count += next - start; 6562 start = next + 1; 6563 6564 if (fatal_signal_pending(current)) { 6565 count = -ERESTARTSYS; 6566 break; 6567 } 6568 6569 if (need_resched()) { 6570 ext4_unlock_group(sb, e4b->bd_group); 6571 cond_resched(); 6572 ext4_lock_group(sb, e4b->bd_group); 6573 } 6574 6575 if ((e4b->bd_info->bb_free - free_count) < minblocks) 6576 break; 6577 } 6578 6579 return count; 6580 } 6581 6582 /** 6583 * ext4_trim_all_free -- function to trim all free space in alloc. group 6584 * @sb: super block for file system 6585 * @group: group to be trimmed 6586 * @start: first group block to examine 6587 * @max: last group block to examine 6588 * @minblocks: minimum extent block count 6589 * @set_trimmed: set the trimmed flag if at least one block is trimmed 6590 * 6591 * ext4_trim_all_free walks through group's block bitmap searching for free 6592 * extents. When the free extent is found, mark it as used in group buddy 6593 * bitmap. Then issue a TRIM command on this extent and free the extent in 6594 * the group buddy bitmap. 6595 */ 6596 static ext4_grpblk_t 6597 ext4_trim_all_free(struct super_block *sb, ext4_group_t group, 6598 ext4_grpblk_t start, ext4_grpblk_t max, 6599 ext4_grpblk_t minblocks, bool set_trimmed) 6600 { 6601 struct ext4_buddy e4b; 6602 int ret; 6603 6604 trace_ext4_trim_all_free(sb, group, start, max); 6605 6606 ret = ext4_mb_load_buddy(sb, group, &e4b); 6607 if (ret) { 6608 ext4_warning(sb, "Error %d loading buddy information for %u", 6609 ret, group); 6610 return ret; 6611 } 6612 6613 ext4_lock_group(sb, group); 6614 6615 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || 6616 minblocks < EXT4_SB(sb)->s_last_trim_minblks) { 6617 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); 6618 if (ret >= 0 && set_trimmed) 6619 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 6620 } else { 6621 ret = 0; 6622 } 6623 6624 ext4_unlock_group(sb, group); 6625 ext4_mb_unload_buddy(&e4b); 6626 6627 ext4_debug("trimmed %d blocks in the group %d\n", 6628 ret, group); 6629 6630 return ret; 6631 } 6632 6633 /** 6634 * ext4_trim_fs() -- trim ioctl handle function 6635 * @sb: superblock for filesystem 6636 * @range: fstrim_range structure 6637 * 6638 * start: First Byte to trim 6639 * len: number of Bytes to trim from start 6640 * minlen: minimum extent length in Bytes 6641 * ext4_trim_fs goes through all allocation groups containing Bytes from 6642 * start to start+len. For each such a group ext4_trim_all_free function 6643 * is invoked to trim all free space. 
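 *
 * For example, on a filesystem with 4 KiB blocks (and no bigalloc), an
 * fstrim_range of start=0, len=1 GiB, minlen=1 MiB covers blocks 0-262143
 * and asks each selected group to skip free extents shorter than 256 blocks
 * (assuming the device's discard granularity is not larger than that).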
6644 */ 6645 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 6646 { 6647 unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); 6648 struct ext4_group_info *grp; 6649 ext4_group_t group, first_group, last_group; 6650 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 6651 uint64_t start, end, minlen, trimmed = 0; 6652 ext4_fsblk_t first_data_blk = 6653 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 6654 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); 6655 bool whole_group, eof = false; 6656 int ret = 0; 6657 6658 start = range->start >> sb->s_blocksize_bits; 6659 end = start + (range->len >> sb->s_blocksize_bits) - 1; 6660 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6661 range->minlen >> sb->s_blocksize_bits); 6662 6663 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || 6664 start >= max_blks || 6665 range->len < sb->s_blocksize) 6666 return -EINVAL; 6667 /* No point to try to trim less than discard granularity */ 6668 if (range->minlen < discard_granularity) { 6669 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6670 discard_granularity >> sb->s_blocksize_bits); 6671 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) 6672 goto out; 6673 } 6674 if (end >= max_blks - 1) { 6675 end = max_blks - 1; 6676 eof = true; 6677 } 6678 if (end <= first_data_blk) 6679 goto out; 6680 if (start < first_data_blk) 6681 start = first_data_blk; 6682 6683 /* Determine first and last group to examine based on start and end */ 6684 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 6685 &first_group, &first_cluster); 6686 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, 6687 &last_group, &last_cluster); 6688 6689 /* end now represents the last cluster to discard in this group */ 6690 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6691 whole_group = true; 6692 6693 for (group = first_group; group <= last_group; group++) { 6694 grp = ext4_get_group_info(sb, group); 6695 /* We only do this if the grp has never been initialized */ 6696 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 6697 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 6698 if (ret) 6699 break; 6700 } 6701 6702 /* 6703 * For all the groups except the last one, last cluster will 6704 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to 6705 * change it for the last group, note that last_cluster is 6706 * already computed earlier by ext4_get_group_no_and_offset() 6707 */ 6708 if (group == last_group) { 6709 end = last_cluster; 6710 whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6711 } 6712 if (grp->bb_free >= minlen) { 6713 cnt = ext4_trim_all_free(sb, group, first_cluster, 6714 end, minlen, whole_group); 6715 if (cnt < 0) { 6716 ret = cnt; 6717 break; 6718 } 6719 trimmed += cnt; 6720 } 6721 6722 /* 6723 * For every group except the first one, we are sure 6724 * that the first cluster to discard will be cluster #0. 6725 */ 6726 first_cluster = 0; 6727 } 6728 6729 if (!ret) 6730 EXT4_SB(sb)->s_last_trim_minblks = minlen; 6731 6732 out: 6733 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 6734 return ret; 6735 } 6736 6737 /* Iterate all the free extents in the group. 
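 * The formatter callback is invoked once per free extent, without the
 * group lock held, as formatter(sb, group, start, len, priv); a non-zero
 * return value stops the walk and is passed back to the caller.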
*/ 6738 int 6739 ext4_mballoc_query_range( 6740 struct super_block *sb, 6741 ext4_group_t group, 6742 ext4_grpblk_t start, 6743 ext4_grpblk_t end, 6744 ext4_mballoc_query_range_fn formatter, 6745 void *priv) 6746 { 6747 void *bitmap; 6748 ext4_grpblk_t next; 6749 struct ext4_buddy e4b; 6750 int error; 6751 6752 error = ext4_mb_load_buddy(sb, group, &e4b); 6753 if (error) 6754 return error; 6755 bitmap = e4b.bd_bitmap; 6756 6757 ext4_lock_group(sb, group); 6758 6759 start = (e4b.bd_info->bb_first_free > start) ? 6760 e4b.bd_info->bb_first_free : start; 6761 if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) 6762 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6763 6764 while (start <= end) { 6765 start = mb_find_next_zero_bit(bitmap, end + 1, start); 6766 if (start > end) 6767 break; 6768 next = mb_find_next_bit(bitmap, end + 1, start); 6769 6770 ext4_unlock_group(sb, group); 6771 error = formatter(sb, group, start, next - start, priv); 6772 if (error) 6773 goto out_unload; 6774 ext4_lock_group(sb, group); 6775 6776 start = next + 1; 6777 } 6778 6779 ext4_unlock_group(sb, group); 6780 out_unload: 6781 ext4_mb_unload_buddy(&e4b); 6782 6783 return error; 6784 } 6785
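/*
 * A minimal sketch of a formatter for ext4_mballoc_query_range(), assuming
 * a hypothetical caller that only wants to total up the free clusters in a
 * group (real users, such as the fsmap code, do more work per extent):
 *
 *	static int count_free_fn(struct super_block *sb, ext4_group_t group,
 *				 ext4_grpblk_t start, ext4_grpblk_t len,
 *				 void *priv)
 *	{
 *		*(ext4_grpblk_t *)priv += len;
 *		return 0;
 *	}
 *
 *	ext4_grpblk_t free = 0;
 *	int err = ext4_mballoc_query_range(sb, group, 0,
 *					   EXT4_CLUSTERS_PER_GROUP(sb) - 1,
 *					   count_free_fn, &free);
 */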