1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 4 * Written by Alex Tomas <alex@clusterfs.com> 5 */ 6 7 8 /* 9 * mballoc.c contains the multiblocks allocation routines 10 */ 11 12 #include "ext4_jbd2.h" 13 #include "mballoc.h" 14 #include <linux/log2.h> 15 #include <linux/module.h> 16 #include <linux/slab.h> 17 #include <linux/nospec.h> 18 #include <linux/backing-dev.h> 19 #include <trace/events/ext4.h> 20 21 /* 22 * MUSTDO: 23 * - test ext4_ext_search_left() and ext4_ext_search_right() 24 * - search for metadata in few groups 25 * 26 * TODO v4: 27 * - normalization should take into account whether file is still open 28 * - discard preallocations if no free space left (policy?) 29 * - don't normalize tails 30 * - quota 31 * - reservation for superuser 32 * 33 * TODO v3: 34 * - bitmap read-ahead (proposed by Oleg Drokin aka green) 35 * - track min/max extents in each group for better group selection 36 * - mb_mark_used() may allocate chunk right after splitting buddy 37 * - tree of groups sorted by number of free blocks 38 * - error handling 39 */ 40 41 /* 42 * The allocation request involve request for multiple number of blocks 43 * near to the goal(block) value specified. 44 * 45 * During initialization phase of the allocator we decide to use the 46 * group preallocation or inode preallocation depending on the size of 47 * the file. The size of the file could be the resulting file size we 48 * would have after allocation, or the current file size, which ever 49 * is larger. If the size is less than sbi->s_mb_stream_request we 50 * select to use the group preallocation. The default value of 51 * s_mb_stream_request is 16 blocks. This can also be tuned via 52 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in 53 * terms of number of blocks. 54 * 55 * The main motivation for having small file use group preallocation is to 56 * ensure that we have small files closer together on the disk. 57 * 58 * First stage the allocator looks at the inode prealloc list, 59 * ext4_inode_info->i_prealloc_list, which contains list of prealloc 60 * spaces for this particular inode. The inode prealloc space is 61 * represented as: 62 * 63 * pa_lstart -> the logical start block for this prealloc space 64 * pa_pstart -> the physical start block for this prealloc space 65 * pa_len -> length for this prealloc space (in clusters) 66 * pa_free -> free space available in this prealloc space (in clusters) 67 * 68 * The inode preallocation space is used looking at the _logical_ start 69 * block. If only the logical file block falls within the range of prealloc 70 * space we will consume the particular prealloc space. This makes sure that 71 * we have contiguous physical blocks representing the file blocks 72 * 73 * The important thing to be noted in case of inode prealloc space is that 74 * we don't modify the values associated to inode prealloc space except 75 * pa_free. 76 * 77 * If we are not able to find blocks in the inode prealloc space and if we 78 * have the group allocation flag set then we look at the locality group 79 * prealloc space. These are per CPU prealloc list represented as 80 * 81 * ext4_sb_info.s_locality_groups[smp_processor_id()] 82 * 83 * The reason for having a per cpu locality group is to reduce the contention 84 * between CPUs. It is possible to get scheduled at this point. 
85 * 86 * The locality group prealloc space is used looking at whether we have 87 * enough free space (pa_free) within the prealloc space. 88 * 89 * If we can't allocate blocks via inode prealloc or/and locality group 90 * prealloc then we look at the buddy cache. The buddy cache is represented 91 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets 92 * mapped to the buddy and bitmap information regarding different 93 * groups. The buddy information is attached to buddy cache inode so that 94 * we can access them through the page cache. The information regarding 95 * each group is loaded via ext4_mb_load_buddy. The information involve 96 * block bitmap and buddy information. The information are stored in the 97 * inode as: 98 * 99 * { page } 100 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 101 * 102 * 103 * one block each for bitmap and buddy information. So for each group we 104 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / 105 * blocksize) blocks. So it can have information regarding groups_per_page 106 * which is blocks_per_page/2 107 * 108 * The buddy cache inode is not stored on disk. The inode is thrown 109 * away when the filesystem is unmounted. 110 * 111 * We look for count number of blocks in the buddy cache. If we were able 112 * to locate that many free blocks we return with additional information 113 * regarding rest of the contiguous physical block available 114 * 115 * Before allocating blocks via buddy cache we normalize the request 116 * blocks. This ensure we ask for more blocks that we needed. The extra 117 * blocks that we get after allocation is added to the respective prealloc 118 * list. In case of inode preallocation we follow a list of heuristics 119 * based on file size. This can be found in ext4_mb_normalize_request. If 120 * we are doing a group prealloc we try to normalize the request to 121 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is 122 * dependent on the cluster size; for non-bigalloc file systems, it is 123 * 512 blocks. This can be tuned via 124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in 125 * terms of number of blocks. If we have mounted the file system with -O 126 * stripe=<value> option the group prealloc request is normalized to the 127 * smallest multiple of the stripe value (sbi->s_stripe) which is 128 * greater than the default mb_group_prealloc. 129 * 130 * If "mb_optimize_scan" mount option is set, we maintain in memory group info 131 * structures in two data structures: 132 * 133 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) 134 * 135 * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) 136 * 137 * This is an array of lists where the index in the array represents the 138 * largest free order in the buddy bitmap of the participating group infos of 139 * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total 140 * number of buddy bitmap orders possible) number of lists. Group-infos are 141 * placed in appropriate lists. 142 * 143 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) 144 * 145 * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) 146 * 147 * This is an array of lists where in the i-th list there are groups with 148 * average fragment size >= 2^i and < 2^(i+1). The average fragment size 149 * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. 
150 * Note that we don't bother with a special list for completely empty groups 151 * so we only have MB_NUM_ORDERS(sb) lists. 152 * 153 * When "mb_optimize_scan" mount option is set, mballoc consults the above data 154 * structures to decide the order in which groups are to be traversed for 155 * fulfilling an allocation request. 156 * 157 * At CR = 0, we look for groups which have the largest_free_order >= the order 158 * of the request. We directly look at the largest free order list in the data 159 * structure (1) above where largest_free_order = order of the request. If that 160 * list is empty, we look at remaining list in the increasing order of 161 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. 162 * 163 * At CR = 1, we only consider groups where average fragment size > request 164 * size. So, we lookup a group which has average fragment size just above or 165 * equal to request size using our average fragment size group lists (data 166 * structure 2) in O(1) time. 167 * 168 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 169 * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 170 * 171 * The regular allocator (using the buddy cache) supports a few tunables. 172 * 173 * /sys/fs/ext4/<partition>/mb_min_to_scan 174 * /sys/fs/ext4/<partition>/mb_max_to_scan 175 * /sys/fs/ext4/<partition>/mb_order2_req 176 * /sys/fs/ext4/<partition>/mb_linear_limit 177 * 178 * The regular allocator uses buddy scan only if the request len is power of 179 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 180 * value of s_mb_order2_reqs can be tuned via 181 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 182 * stripe size (sbi->s_stripe), we try to search for contiguous block in 183 * stripe size. This should result in better allocation on RAID setups. If 184 * not, we search in the specific group using bitmap for best extents. The 185 * tunable min_to_scan and max_to_scan control the behaviour here. 186 * min_to_scan indicate how long the mballoc __must__ look for a best 187 * extent and max_to_scan indicates how long the mballoc __can__ look for a 188 * best extent in the found extents. Searching for the blocks starts with 189 * the group specified as the goal value in allocation context via 190 * ac_g_ex. Each group is first checked based on the criteria whether it 191 * can be used for allocation. ext4_mb_good_group explains how the groups are 192 * checked. 193 * 194 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not 195 * get traversed linearly. That may result in subsequent allocations being not 196 * close to each other. And so, the underlying device may get filled up in a 197 * non-linear fashion. While that may not matter on non-rotational devices, for 198 * rotational devices that may result in higher seek times. "mb_linear_limit" 199 * tells mballoc how many groups mballoc should search linearly before 200 * performing consulting above data structures for more efficient lookups. For 201 * non rotational devices, this value defaults to 0 and for rotational devices 202 * this is set to MB_DEFAULT_LINEAR_LIMIT. 203 * 204 * Both the prealloc space are getting populated as above. So for the first 205 * request we will hit the buddy cache which will result in this prealloc 206 * space getting filled. The prealloc space is then later used for the 207 * subsequent request. 
208 */ 209 210 /* 211 * mballoc operates on the following data: 212 * - on-disk bitmap 213 * - in-core buddy (actually includes buddy and bitmap) 214 * - preallocation descriptors (PAs) 215 * 216 * there are two types of preallocations: 217 * - inode 218 * assiged to specific inode and can be used for this inode only. 219 * it describes part of inode's space preallocated to specific 220 * physical blocks. any block from that preallocated can be used 221 * independent. the descriptor just tracks number of blocks left 222 * unused. so, before taking some block from descriptor, one must 223 * make sure corresponded logical block isn't allocated yet. this 224 * also means that freeing any block within descriptor's range 225 * must discard all preallocated blocks. 226 * - locality group 227 * assigned to specific locality group which does not translate to 228 * permanent set of inodes: inode can join and leave group. space 229 * from this type of preallocation can be used for any inode. thus 230 * it's consumed from the beginning to the end. 231 * 232 * relation between them can be expressed as: 233 * in-core buddy = on-disk bitmap + preallocation descriptors 234 * 235 * this mean blocks mballoc considers used are: 236 * - allocated blocks (persistent) 237 * - preallocated blocks (non-persistent) 238 * 239 * consistency in mballoc world means that at any time a block is either 240 * free or used in ALL structures. notice: "any time" should not be read 241 * literally -- time is discrete and delimited by locks. 242 * 243 * to keep it simple, we don't use block numbers, instead we count number of 244 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. 245 * 246 * all operations can be expressed as: 247 * - init buddy: buddy = on-disk + PAs 248 * - new PA: buddy += N; PA = N 249 * - use inode PA: on-disk += N; PA -= N 250 * - discard inode PA buddy -= on-disk - PA; PA = 0 251 * - use locality group PA on-disk += N; PA -= N 252 * - discard locality group PA buddy -= PA; PA = 0 253 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap 254 * is used in real operation because we can't know actual used 255 * bits from PA, only from on-disk bitmap 256 * 257 * if we follow this strict logic, then all operations above should be atomic. 258 * given some of them can block, we'd have to use something like semaphores 259 * killing performance on high-end SMP hardware. let's try to relax it using 260 * the following knowledge: 261 * 1) if buddy is referenced, it's already initialized 262 * 2) while block is used in buddy and the buddy is referenced, 263 * nobody can re-allocate that block 264 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has 265 * bit set and PA claims same block, it's OK. IOW, one can set bit in 266 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded 267 * block 268 * 269 * so, now we're building a concurrency table: 270 * - init buddy vs. 
271 * - new PA 272 * blocks for PA are allocated in the buddy, buddy must be referenced 273 * until PA is linked to allocation group to avoid concurrent buddy init 274 * - use inode PA 275 * we need to make sure that either on-disk bitmap or PA has uptodate data 276 * given (3) we care that PA-=N operation doesn't interfere with init 277 * - discard inode PA 278 * the simplest way would be to have buddy initialized by the discard 279 * - use locality group PA 280 * again PA-=N must be serialized with init 281 * - discard locality group PA 282 * the simplest way would be to have buddy initialized by the discard 283 * - new PA vs. 284 * - use inode PA 285 * i_data_sem serializes them 286 * - discard inode PA 287 * discard process must wait until PA isn't used by another process 288 * - use locality group PA 289 * some mutex should serialize them 290 * - discard locality group PA 291 * discard process must wait until PA isn't used by another process 292 * - use inode PA 293 * - use inode PA 294 * i_data_sem or another mutex should serializes them 295 * - discard inode PA 296 * discard process must wait until PA isn't used by another process 297 * - use locality group PA 298 * nothing wrong here -- they're different PAs covering different blocks 299 * - discard locality group PA 300 * discard process must wait until PA isn't used by another process 301 * 302 * now we're ready to make few consequences: 303 * - PA is referenced and while it is no discard is possible 304 * - PA is referenced until block isn't marked in on-disk bitmap 305 * - PA changes only after on-disk bitmap 306 * - discard must not compete with init. either init is done before 307 * any discard or they're serialized somehow 308 * - buddy init as sum of on-disk bitmap and PAs is done atomically 309 * 310 * a special case when we've used PA to emptiness. 
no need to modify buddy 311 * in this case, but we should care about concurrent init 312 * 313 */ 314 315 /* 316 * Logic in few words: 317 * 318 * - allocation: 319 * load group 320 * find blocks 321 * mark bits in on-disk bitmap 322 * release group 323 * 324 * - use preallocation: 325 * find proper PA (per-inode or group) 326 * load group 327 * mark bits in on-disk bitmap 328 * release group 329 * release PA 330 * 331 * - free: 332 * load group 333 * mark bits in on-disk bitmap 334 * release group 335 * 336 * - discard preallocations in group: 337 * mark PAs deleted 338 * move them onto local list 339 * load on-disk bitmap 340 * load group 341 * remove PA from object (inode or locality group) 342 * mark free blocks in-core 343 * 344 * - discard inode's preallocations: 345 */ 346 347 /* 348 * Locking rules 349 * 350 * Locks: 351 * - bitlock on a group (group) 352 * - object (inode/locality) (object) 353 * - per-pa lock (pa) 354 * - cr0 lists lock (cr0) 355 * - cr1 tree lock (cr1) 356 * 357 * Paths: 358 * - new pa 359 * object 360 * group 361 * 362 * - find and use pa: 363 * pa 364 * 365 * - release consumed pa: 366 * pa 367 * group 368 * object 369 * 370 * - generate in-core bitmap: 371 * group 372 * pa 373 * 374 * - discard all for given object (inode, locality group): 375 * object 376 * pa 377 * group 378 * 379 * - discard all for given group: 380 * group 381 * pa 382 * group 383 * object 384 * 385 * - allocation path (ext4_mb_regular_allocator) 386 * group 387 * cr0/cr1 388 */ 389 static struct kmem_cache *ext4_pspace_cachep; 390 static struct kmem_cache *ext4_ac_cachep; 391 static struct kmem_cache *ext4_free_data_cachep; 392 393 /* We create slab caches for groupinfo data structures based on the 394 * superblock block size. There will be one per mounted filesystem for 395 * each unique s_blocksize_bits */ 396 #define NR_GRPINFO_CACHES 8 397 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 398 399 static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { 400 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", 401 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", 402 "ext4_groupinfo_64k", "ext4_groupinfo_128k" 403 }; 404 405 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 406 ext4_group_t group); 407 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 408 ext4_group_t group); 409 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 410 411 static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 412 ext4_group_t group, int cr); 413 414 static int ext4_try_to_trim_range(struct super_block *sb, 415 struct ext4_buddy *e4b, ext4_grpblk_t start, 416 ext4_grpblk_t max, ext4_grpblk_t minblocks); 417 418 /* 419 * The algorithm using this percpu seq counter goes below: 420 * 1. We sample the percpu discard_pa_seq counter before trying for block 421 * allocation in ext4_mb_new_blocks(). 422 * 2. We increment this percpu discard_pa_seq counter when we either allocate 423 * or free these blocks i.e. while marking those blocks as used/free in 424 * mb_mark_used()/mb_free_blocks(). 425 * 3. We also increment this percpu seq counter when we successfully identify 426 * that the bb_prealloc_list is not empty and hence proceed for discarding 427 * of those PAs inside ext4_mb_discard_group_preallocations(). 
428 * 429 * Now to make sure that the regular fast path of block allocation is not 430 * affected, as a small optimization we only sample the percpu seq counter 431 * on that cpu. Only when the block allocation fails and when freed blocks 432 * found were 0, that is when we sample percpu seq counter for all cpus using 433 * below function ext4_get_discard_pa_seq_sum(). This happens after making 434 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. 435 */ 436 static DEFINE_PER_CPU(u64, discard_pa_seq); 437 static inline u64 ext4_get_discard_pa_seq_sum(void) 438 { 439 int __cpu; 440 u64 __seq = 0; 441 442 for_each_possible_cpu(__cpu) 443 __seq += per_cpu(discard_pa_seq, __cpu); 444 return __seq; 445 } 446 447 static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 448 { 449 #if BITS_PER_LONG == 64 450 *bit += ((unsigned long) addr & 7UL) << 3; 451 addr = (void *) ((unsigned long) addr & ~7UL); 452 #elif BITS_PER_LONG == 32 453 *bit += ((unsigned long) addr & 3UL) << 3; 454 addr = (void *) ((unsigned long) addr & ~3UL); 455 #else 456 #error "how many bits you are?!" 457 #endif 458 return addr; 459 } 460 461 static inline int mb_test_bit(int bit, void *addr) 462 { 463 /* 464 * ext4_test_bit on architecture like powerpc 465 * needs unsigned long aligned address 466 */ 467 addr = mb_correct_addr_and_bit(&bit, addr); 468 return ext4_test_bit(bit, addr); 469 } 470 471 static inline void mb_set_bit(int bit, void *addr) 472 { 473 addr = mb_correct_addr_and_bit(&bit, addr); 474 ext4_set_bit(bit, addr); 475 } 476 477 static inline void mb_clear_bit(int bit, void *addr) 478 { 479 addr = mb_correct_addr_and_bit(&bit, addr); 480 ext4_clear_bit(bit, addr); 481 } 482 483 static inline int mb_test_and_clear_bit(int bit, void *addr) 484 { 485 addr = mb_correct_addr_and_bit(&bit, addr); 486 return ext4_test_and_clear_bit(bit, addr); 487 } 488 489 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 490 { 491 int fix = 0, ret, tmpmax; 492 addr = mb_correct_addr_and_bit(&fix, addr); 493 tmpmax = max + fix; 494 start += fix; 495 496 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 497 if (ret > max) 498 return max; 499 return ret; 500 } 501 502 static inline int mb_find_next_bit(void *addr, int max, int start) 503 { 504 int fix = 0, ret, tmpmax; 505 addr = mb_correct_addr_and_bit(&fix, addr); 506 tmpmax = max + fix; 507 start += fix; 508 509 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 510 if (ret > max) 511 return max; 512 return ret; 513 } 514 515 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 516 { 517 char *bb; 518 519 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 520 BUG_ON(max == NULL); 521 522 if (order > e4b->bd_blkbits + 1) { 523 *max = 0; 524 return NULL; 525 } 526 527 /* at order 0 we see each particular block */ 528 if (order == 0) { 529 *max = 1 << (e4b->bd_blkbits + 3); 530 return e4b->bd_bitmap; 531 } 532 533 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 534 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 535 536 return bb; 537 } 538 539 #ifdef DOUBLE_CHECK 540 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 541 int first, int count) 542 { 543 int i; 544 struct super_block *sb = e4b->bd_sb; 545 546 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 547 return; 548 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 549 for (i = 0; i < count; i++) { 550 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 551 ext4_fsblk_t blocknr; 552 553 blocknr = 
ext4_group_first_block_no(sb, e4b->bd_group); 554 blocknr += EXT4_C2B(EXT4_SB(sb), first + i); 555 ext4_grp_locked_error(sb, e4b->bd_group, 556 inode ? inode->i_ino : 0, 557 blocknr, 558 "freeing block already freed " 559 "(bit %u)", 560 first + i); 561 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 562 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 563 } 564 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 565 } 566 } 567 568 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 569 { 570 int i; 571 572 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 573 return; 574 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 575 for (i = 0; i < count; i++) { 576 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 577 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 578 } 579 } 580 581 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 582 { 583 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 584 return; 585 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 586 unsigned char *b1, *b2; 587 int i; 588 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 589 b2 = (unsigned char *) bitmap; 590 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 591 if (b1[i] != b2[i]) { 592 ext4_msg(e4b->bd_sb, KERN_ERR, 593 "corruption in group %u " 594 "at byte %u(%u): %x in copy != %x " 595 "on disk/prealloc", 596 e4b->bd_group, i, i * 8, b1[i], b2[i]); 597 BUG(); 598 } 599 } 600 } 601 } 602 603 static void mb_group_bb_bitmap_alloc(struct super_block *sb, 604 struct ext4_group_info *grp, ext4_group_t group) 605 { 606 struct buffer_head *bh; 607 608 grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); 609 if (!grp->bb_bitmap) 610 return; 611 612 bh = ext4_read_block_bitmap(sb, group); 613 if (IS_ERR_OR_NULL(bh)) { 614 kfree(grp->bb_bitmap); 615 grp->bb_bitmap = NULL; 616 return; 617 } 618 619 memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); 620 put_bh(bh); 621 } 622 623 static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 624 { 625 kfree(grp->bb_bitmap); 626 } 627 628 #else 629 static inline void mb_free_blocks_double(struct inode *inode, 630 struct ext4_buddy *e4b, int first, int count) 631 { 632 return; 633 } 634 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 635 int first, int count) 636 { 637 return; 638 } 639 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 640 { 641 return; 642 } 643 644 static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, 645 struct ext4_group_info *grp, ext4_group_t group) 646 { 647 return; 648 } 649 650 static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) 651 { 652 return; 653 } 654 #endif 655 656 #ifdef AGGRESSIVE_CHECK 657 658 #define MB_CHECK_ASSERT(assert) \ 659 do { \ 660 if (!(assert)) { \ 661 printk(KERN_EMERG \ 662 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 663 function, file, line, # assert); \ 664 BUG(); \ 665 } \ 666 } while (0) 667 668 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 669 const char *function, int line) 670 { 671 struct super_block *sb = e4b->bd_sb; 672 int order = e4b->bd_blkbits + 1; 673 int max; 674 int max2; 675 int i; 676 int j; 677 int k; 678 int count; 679 struct ext4_group_info *grp; 680 int fragments = 0; 681 int fstart; 682 struct list_head *cur; 683 void *buddy; 684 void *buddy2; 685 686 if (e4b->bd_info->bb_check_counter++ % 10) 687 return 0; 688 689 while (order > 1) { 690 buddy = mb_find_buddy(e4b, order, &max); 691 MB_CHECK_ASSERT(buddy); 692 buddy2 = mb_find_buddy(e4b, order - 
1, &max2); 693 MB_CHECK_ASSERT(buddy2); 694 MB_CHECK_ASSERT(buddy != buddy2); 695 MB_CHECK_ASSERT(max * 2 == max2); 696 697 count = 0; 698 for (i = 0; i < max; i++) { 699 700 if (mb_test_bit(i, buddy)) { 701 /* only single bit in buddy2 may be 0 */ 702 if (!mb_test_bit(i << 1, buddy2)) { 703 MB_CHECK_ASSERT( 704 mb_test_bit((i<<1)+1, buddy2)); 705 } 706 continue; 707 } 708 709 /* both bits in buddy2 must be 1 */ 710 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 711 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 712 713 for (j = 0; j < (1 << order); j++) { 714 k = (i * (1 << order)) + j; 715 MB_CHECK_ASSERT( 716 !mb_test_bit(k, e4b->bd_bitmap)); 717 } 718 count++; 719 } 720 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 721 order--; 722 } 723 724 fstart = -1; 725 buddy = mb_find_buddy(e4b, 0, &max); 726 for (i = 0; i < max; i++) { 727 if (!mb_test_bit(i, buddy)) { 728 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 729 if (fstart == -1) { 730 fragments++; 731 fstart = i; 732 } 733 continue; 734 } 735 fstart = -1; 736 /* check used bits only */ 737 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 738 buddy2 = mb_find_buddy(e4b, j, &max2); 739 k = i >> j; 740 MB_CHECK_ASSERT(k < max2); 741 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 742 } 743 } 744 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 745 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 746 747 grp = ext4_get_group_info(sb, e4b->bd_group); 748 list_for_each(cur, &grp->bb_prealloc_list) { 749 ext4_group_t groupnr; 750 struct ext4_prealloc_space *pa; 751 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 752 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 753 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 754 for (i = 0; i < pa->pa_len; i++) 755 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 756 } 757 return 0; 758 } 759 #undef MB_CHECK_ASSERT 760 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 761 __FILE__, __func__, __LINE__) 762 #else 763 #define mb_check_buddy(e4b) 764 #endif 765 766 /* 767 * Divide blocks started from @first with length @len into 768 * smaller chunks with power of 2 blocks. 769 * Clear the bits in bitmap which the blocks of the chunk(s) covered, 770 * then increase bb_counters[] for corresponded chunk size. 771 */ 772 static void ext4_mb_mark_free_simple(struct super_block *sb, 773 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 774 struct ext4_group_info *grp) 775 { 776 struct ext4_sb_info *sbi = EXT4_SB(sb); 777 ext4_grpblk_t min; 778 ext4_grpblk_t max; 779 ext4_grpblk_t chunk; 780 unsigned int border; 781 782 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 783 784 border = 2 << sb->s_blocksize_bits; 785 786 while (len > 0) { 787 /* find how many blocks can be covered since this position */ 788 max = ffs(first | border) - 1; 789 790 /* find how many blocks of power 2 we need to mark */ 791 min = fls(len) - 1; 792 793 if (max < min) 794 min = max; 795 chunk = 1 << min; 796 797 /* mark multiblock chunks only */ 798 grp->bb_counters[min]++; 799 if (min > 0) 800 mb_clear_bit(first >> min, 801 buddy + sbi->s_mb_offsets[min]); 802 803 len -= chunk; 804 first += chunk; 805 } 806 } 807 808 static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) 809 { 810 int order; 811 812 /* 813 * We don't bother with a special lists groups with only 1 block free 814 * extents and for completely empty groups. 
815 */ 816 order = fls(len) - 2; 817 if (order < 0) 818 return 0; 819 if (order == MB_NUM_ORDERS(sb)) 820 order--; 821 return order; 822 } 823 824 /* Move group to appropriate avg_fragment_size list */ 825 static void 826 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 827 { 828 struct ext4_sb_info *sbi = EXT4_SB(sb); 829 int new_order; 830 831 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 832 return; 833 834 new_order = mb_avg_fragment_size_order(sb, 835 grp->bb_free / grp->bb_fragments); 836 if (new_order == grp->bb_avg_fragment_size_order) 837 return; 838 839 if (grp->bb_avg_fragment_size_order != -1) { 840 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 841 grp->bb_avg_fragment_size_order]); 842 list_del(&grp->bb_avg_fragment_size_node); 843 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 844 grp->bb_avg_fragment_size_order]); 845 } 846 grp->bb_avg_fragment_size_order = new_order; 847 write_lock(&sbi->s_mb_avg_fragment_size_locks[ 848 grp->bb_avg_fragment_size_order]); 849 list_add_tail(&grp->bb_avg_fragment_size_node, 850 &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); 851 write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 852 grp->bb_avg_fragment_size_order]); 853 } 854 855 /* 856 * Choose next group by traversing largest_free_order lists. Updates *new_cr if 857 * cr level needs an update. 858 */ 859 static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, 860 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 861 { 862 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 863 struct ext4_group_info *iter, *grp; 864 int i; 865 866 if (ac->ac_status == AC_STATUS_FOUND) 867 return; 868 869 if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) 870 atomic_inc(&sbi->s_bal_cr0_bad_suggestions); 871 872 grp = NULL; 873 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 874 if (list_empty(&sbi->s_mb_largest_free_orders[i])) 875 continue; 876 read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 877 if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 878 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 879 continue; 880 } 881 grp = NULL; 882 list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 883 bb_largest_free_order_node) { 884 if (sbi->s_mb_stats) 885 atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); 886 if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { 887 grp = iter; 888 break; 889 } 890 } 891 read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 892 if (grp) 893 break; 894 } 895 896 if (!grp) { 897 /* Increment cr and search again */ 898 *new_cr = 1; 899 } else { 900 *group = grp->bb_group; 901 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 902 } 903 } 904 905 /* 906 * Choose next group by traversing average fragment size list of suitable 907 * order. Updates *new_cr if cr level needs an update. 
908 */ 909 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 910 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 911 { 912 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 913 struct ext4_group_info *grp = NULL, *iter; 914 int i; 915 916 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 917 if (sbi->s_mb_stats) 918 atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 919 } 920 921 for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 922 i < MB_NUM_ORDERS(ac->ac_sb); i++) { 923 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) 924 continue; 925 read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); 926 if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { 927 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 928 continue; 929 } 930 list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], 931 bb_avg_fragment_size_node) { 932 if (sbi->s_mb_stats) 933 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 934 if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { 935 grp = iter; 936 break; 937 } 938 } 939 read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 940 if (grp) 941 break; 942 } 943 944 if (grp) { 945 *group = grp->bb_group; 946 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 947 } else { 948 *new_cr = 2; 949 } 950 } 951 952 static inline int should_optimize_scan(struct ext4_allocation_context *ac) 953 { 954 if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) 955 return 0; 956 if (ac->ac_criteria >= 2) 957 return 0; 958 if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) 959 return 0; 960 return 1; 961 } 962 963 /* 964 * Return next linear group for allocation. If linear traversal should not be 965 * performed, this function just returns the same group 966 */ 967 static int 968 next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) 969 { 970 if (!should_optimize_scan(ac)) 971 goto inc_and_return; 972 973 if (ac->ac_groups_linear_remaining) { 974 ac->ac_groups_linear_remaining--; 975 goto inc_and_return; 976 } 977 978 return group; 979 inc_and_return: 980 /* 981 * Artificially restricted ngroups for non-extent 982 * files makes group > ngroups possible on first loop. 983 */ 984 return group + 1 >= ngroups ? 0 : group + 1; 985 } 986 987 /* 988 * ext4_mb_choose_next_group: choose next group for allocation. 989 * 990 * @ac Allocation Context 991 * @new_cr This is an output parameter. If the there is no good group 992 * available at current CR level, this field is updated to indicate 993 * the new cr level that should be used. 994 * @group This is an input / output parameter. As an input it indicates the 995 * next group that the allocator intends to use for allocation. As 996 * output, this field indicates the next group that should be used as 997 * determined by the optimization functions. 998 * @ngroups Total number of groups 999 */ 1000 static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 1001 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 1002 { 1003 *new_cr = ac->ac_criteria; 1004 1005 if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { 1006 *group = next_linear_group(ac, *group, ngroups); 1007 return; 1008 } 1009 1010 if (*new_cr == 0) { 1011 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 1012 } else if (*new_cr == 1) { 1013 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1014 } else { 1015 /* 1016 * TODO: For CR=2, we can arrange groups in an rb tree sorted by 1017 * bb_free. But until that happens, we should never come here. 
1018 */ 1019 WARN_ON(1); 1020 } 1021 } 1022 1023 /* 1024 * Cache the order of the largest free extent we have available in this block 1025 * group. 1026 */ 1027 static void 1028 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1029 { 1030 struct ext4_sb_info *sbi = EXT4_SB(sb); 1031 int i; 1032 1033 for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) 1034 if (grp->bb_counters[i] > 0) 1035 break; 1036 /* No need to move between order lists? */ 1037 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || 1038 i == grp->bb_largest_free_order) { 1039 grp->bb_largest_free_order = i; 1040 return; 1041 } 1042 1043 if (grp->bb_largest_free_order >= 0) { 1044 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1045 grp->bb_largest_free_order]); 1046 list_del_init(&grp->bb_largest_free_order_node); 1047 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1048 grp->bb_largest_free_order]); 1049 } 1050 grp->bb_largest_free_order = i; 1051 if (grp->bb_largest_free_order >= 0 && grp->bb_free) { 1052 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1053 grp->bb_largest_free_order]); 1054 list_add_tail(&grp->bb_largest_free_order_node, 1055 &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 1056 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1057 grp->bb_largest_free_order]); 1058 } 1059 } 1060 1061 static noinline_for_stack 1062 void ext4_mb_generate_buddy(struct super_block *sb, 1063 void *buddy, void *bitmap, ext4_group_t group) 1064 { 1065 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1066 struct ext4_sb_info *sbi = EXT4_SB(sb); 1067 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 1068 ext4_grpblk_t i = 0; 1069 ext4_grpblk_t first; 1070 ext4_grpblk_t len; 1071 unsigned free = 0; 1072 unsigned fragments = 0; 1073 unsigned long long period = get_cycles(); 1074 1075 /* initialize buddy from bitmap which is aggregation 1076 * of on-disk bitmap and preallocations */ 1077 i = mb_find_next_zero_bit(bitmap, max, 0); 1078 grp->bb_first_free = i; 1079 while (i < max) { 1080 fragments++; 1081 first = i; 1082 i = mb_find_next_bit(bitmap, max, i); 1083 len = i - first; 1084 free += len; 1085 if (len > 1) 1086 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 1087 else 1088 grp->bb_counters[0]++; 1089 if (i < max) 1090 i = mb_find_next_zero_bit(bitmap, max, i); 1091 } 1092 grp->bb_fragments = fragments; 1093 1094 if (free != grp->bb_free) { 1095 ext4_grp_locked_error(sb, group, 0, 0, 1096 "block bitmap and bg descriptor " 1097 "inconsistent: %u vs %u free clusters", 1098 free, grp->bb_free); 1099 /* 1100 * If we intend to continue, we consider group descriptor 1101 * corrupt and update bb_free using bitmap value 1102 */ 1103 grp->bb_free = free; 1104 ext4_mark_group_bitmap_corrupted(sb, group, 1105 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1106 } 1107 mb_set_largest_free_order(sb, grp); 1108 mb_update_avg_fragment_size(sb, grp); 1109 1110 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1111 1112 period = get_cycles() - period; 1113 atomic_inc(&sbi->s_mb_buddies_generated); 1114 atomic64_add(period, &sbi->s_mb_generation_time); 1115 } 1116 1117 /* The buddy information is attached the buddy cache inode 1118 * for convenience. The information regarding each group 1119 * is loaded via ext4_mb_load_buddy. The information involve 1120 * block bitmap and buddy information. The information are 1121 * stored in the inode as 1122 * 1123 * { page } 1124 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 1125 * 1126 * 1127 * one block each for bitmap and buddy information. 
1128 * So for each group we take up 2 blocks. A page can 1129 * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. 1130 * So it can have information regarding groups_per_page which 1131 * is blocks_per_page/2 1132 * 1133 * Locking note: This routine takes the block group lock of all groups 1134 * for this page; do not hold this lock when calling this routine! 1135 */ 1136 1137 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) 1138 { 1139 ext4_group_t ngroups; 1140 int blocksize; 1141 int blocks_per_page; 1142 int groups_per_page; 1143 int err = 0; 1144 int i; 1145 ext4_group_t first_group, group; 1146 int first_block; 1147 struct super_block *sb; 1148 struct buffer_head *bhs; 1149 struct buffer_head **bh = NULL; 1150 struct inode *inode; 1151 char *data; 1152 char *bitmap; 1153 struct ext4_group_info *grinfo; 1154 1155 inode = page->mapping->host; 1156 sb = inode->i_sb; 1157 ngroups = ext4_get_groups_count(sb); 1158 blocksize = i_blocksize(inode); 1159 blocks_per_page = PAGE_SIZE / blocksize; 1160 1161 mb_debug(sb, "init page %lu\n", page->index); 1162 1163 groups_per_page = blocks_per_page >> 1; 1164 if (groups_per_page == 0) 1165 groups_per_page = 1; 1166 1167 /* allocate buffer_heads to read bitmaps */ 1168 if (groups_per_page > 1) { 1169 i = sizeof(struct buffer_head *) * groups_per_page; 1170 bh = kzalloc(i, gfp); 1171 if (bh == NULL) { 1172 err = -ENOMEM; 1173 goto out; 1174 } 1175 } else 1176 bh = &bhs; 1177 1178 first_group = page->index * blocks_per_page / 2; 1179 1180 /* read all groups the page covers into the cache */ 1181 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1182 if (group >= ngroups) 1183 break; 1184 1185 grinfo = ext4_get_group_info(sb, group); 1186 /* 1187 * If page is uptodate then we came here after online resize 1188 * which added some new uninitialized group info structs, so 1189 * we must skip all initialized uptodate buddies on the page, 1190 * which may be currently in use by an allocating task. 
1191 */ 1192 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { 1193 bh[i] = NULL; 1194 continue; 1195 } 1196 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); 1197 if (IS_ERR(bh[i])) { 1198 err = PTR_ERR(bh[i]); 1199 bh[i] = NULL; 1200 goto out; 1201 } 1202 mb_debug(sb, "read bitmap for group %u\n", group); 1203 } 1204 1205 /* wait for I/O completion */ 1206 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1207 int err2; 1208 1209 if (!bh[i]) 1210 continue; 1211 err2 = ext4_wait_block_bitmap(sb, group, bh[i]); 1212 if (!err) 1213 err = err2; 1214 } 1215 1216 first_block = page->index * blocks_per_page; 1217 for (i = 0; i < blocks_per_page; i++) { 1218 group = (first_block + i) >> 1; 1219 if (group >= ngroups) 1220 break; 1221 1222 if (!bh[group - first_group]) 1223 /* skip initialized uptodate buddy */ 1224 continue; 1225 1226 if (!buffer_verified(bh[group - first_group])) 1227 /* Skip faulty bitmaps */ 1228 continue; 1229 err = 0; 1230 1231 /* 1232 * data carry information regarding this 1233 * particular group in the format specified 1234 * above 1235 * 1236 */ 1237 data = page_address(page) + (i * blocksize); 1238 bitmap = bh[group - first_group]->b_data; 1239 1240 /* 1241 * We place the buddy block and bitmap block 1242 * close together 1243 */ 1244 if ((first_block + i) & 1) { 1245 /* this is block of buddy */ 1246 BUG_ON(incore == NULL); 1247 mb_debug(sb, "put buddy for group %u in page %lu/%x\n", 1248 group, page->index, i * blocksize); 1249 trace_ext4_mb_buddy_bitmap_load(sb, group); 1250 grinfo = ext4_get_group_info(sb, group); 1251 grinfo->bb_fragments = 0; 1252 memset(grinfo->bb_counters, 0, 1253 sizeof(*grinfo->bb_counters) * 1254 (MB_NUM_ORDERS(sb))); 1255 /* 1256 * incore got set to the group block bitmap below 1257 */ 1258 ext4_lock_group(sb, group); 1259 /* init the buddy */ 1260 memset(data, 0xff, blocksize); 1261 ext4_mb_generate_buddy(sb, data, incore, group); 1262 ext4_unlock_group(sb, group); 1263 incore = NULL; 1264 } else { 1265 /* this is block of bitmap */ 1266 BUG_ON(incore != NULL); 1267 mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", 1268 group, page->index, i * blocksize); 1269 trace_ext4_mb_bitmap_load(sb, group); 1270 1271 /* see comments in ext4_mb_put_pa() */ 1272 ext4_lock_group(sb, group); 1273 memcpy(data, bitmap, blocksize); 1274 1275 /* mark all preallocated blks used in in-core bitmap */ 1276 ext4_mb_generate_from_pa(sb, data, group); 1277 ext4_mb_generate_from_freelist(sb, data, group); 1278 ext4_unlock_group(sb, group); 1279 1280 /* set incore so that the buddy information can be 1281 * generated using this 1282 */ 1283 incore = data; 1284 } 1285 } 1286 SetPageUptodate(page); 1287 1288 out: 1289 if (bh) { 1290 for (i = 0; i < groups_per_page; i++) 1291 brelse(bh[i]); 1292 if (bh != &bhs) 1293 kfree(bh); 1294 } 1295 return err; 1296 } 1297 1298 /* 1299 * Lock the buddy and bitmap pages. This make sure other parallel init_group 1300 * on the same buddy page doesn't happen whild holding the buddy page lock. 1301 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 1302 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 
1303 */ 1304 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1305 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) 1306 { 1307 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1308 int block, pnum, poff; 1309 int blocks_per_page; 1310 struct page *page; 1311 1312 e4b->bd_buddy_page = NULL; 1313 e4b->bd_bitmap_page = NULL; 1314 1315 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1316 /* 1317 * the buddy cache inode stores the block bitmap 1318 * and buddy information in consecutive blocks. 1319 * So for each group we need two blocks. 1320 */ 1321 block = group * 2; 1322 pnum = block / blocks_per_page; 1323 poff = block % blocks_per_page; 1324 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1325 if (!page) 1326 return -ENOMEM; 1327 BUG_ON(page->mapping != inode->i_mapping); 1328 e4b->bd_bitmap_page = page; 1329 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1330 1331 if (blocks_per_page >= 2) { 1332 /* buddy and bitmap are on the same page */ 1333 return 0; 1334 } 1335 1336 block++; 1337 pnum = block / blocks_per_page; 1338 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1339 if (!page) 1340 return -ENOMEM; 1341 BUG_ON(page->mapping != inode->i_mapping); 1342 e4b->bd_buddy_page = page; 1343 return 0; 1344 } 1345 1346 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1347 { 1348 if (e4b->bd_bitmap_page) { 1349 unlock_page(e4b->bd_bitmap_page); 1350 put_page(e4b->bd_bitmap_page); 1351 } 1352 if (e4b->bd_buddy_page) { 1353 unlock_page(e4b->bd_buddy_page); 1354 put_page(e4b->bd_buddy_page); 1355 } 1356 } 1357 1358 /* 1359 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1360 * block group lock of all groups for this page; do not hold the BG lock when 1361 * calling this routine! 1362 */ 1363 static noinline_for_stack 1364 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) 1365 { 1366 1367 struct ext4_group_info *this_grp; 1368 struct ext4_buddy e4b; 1369 struct page *page; 1370 int ret = 0; 1371 1372 might_sleep(); 1373 mb_debug(sb, "init group %u\n", group); 1374 this_grp = ext4_get_group_info(sb, group); 1375 /* 1376 * This ensures that we don't reinit the buddy cache 1377 * page which map to the group from which we are already 1378 * allocating. If we are looking at the buddy cache we would 1379 * have taken a reference using ext4_mb_load_buddy and that 1380 * would have pinned buddy page to page cache. 1381 * The call to ext4_mb_get_buddy_page_lock will mark the 1382 * page accessed. 
1383 */ 1384 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); 1385 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1386 /* 1387 * somebody initialized the group 1388 * return without doing anything 1389 */ 1390 goto err; 1391 } 1392 1393 page = e4b.bd_bitmap_page; 1394 ret = ext4_mb_init_cache(page, NULL, gfp); 1395 if (ret) 1396 goto err; 1397 if (!PageUptodate(page)) { 1398 ret = -EIO; 1399 goto err; 1400 } 1401 1402 if (e4b.bd_buddy_page == NULL) { 1403 /* 1404 * If both the bitmap and buddy are in 1405 * the same page we don't need to force 1406 * init the buddy 1407 */ 1408 ret = 0; 1409 goto err; 1410 } 1411 /* init buddy cache */ 1412 page = e4b.bd_buddy_page; 1413 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); 1414 if (ret) 1415 goto err; 1416 if (!PageUptodate(page)) { 1417 ret = -EIO; 1418 goto err; 1419 } 1420 err: 1421 ext4_mb_put_buddy_page_lock(&e4b); 1422 return ret; 1423 } 1424 1425 /* 1426 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1427 * block group lock of all groups for this page; do not hold the BG lock when 1428 * calling this routine! 1429 */ 1430 static noinline_for_stack int 1431 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, 1432 struct ext4_buddy *e4b, gfp_t gfp) 1433 { 1434 int blocks_per_page; 1435 int block; 1436 int pnum; 1437 int poff; 1438 struct page *page; 1439 int ret; 1440 struct ext4_group_info *grp; 1441 struct ext4_sb_info *sbi = EXT4_SB(sb); 1442 struct inode *inode = sbi->s_buddy_cache; 1443 1444 might_sleep(); 1445 mb_debug(sb, "load group %u\n", group); 1446 1447 blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1448 grp = ext4_get_group_info(sb, group); 1449 1450 e4b->bd_blkbits = sb->s_blocksize_bits; 1451 e4b->bd_info = grp; 1452 e4b->bd_sb = sb; 1453 e4b->bd_group = group; 1454 e4b->bd_buddy_page = NULL; 1455 e4b->bd_bitmap_page = NULL; 1456 1457 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1458 /* 1459 * we need full data about the group 1460 * to make a good selection 1461 */ 1462 ret = ext4_mb_init_group(sb, group, gfp); 1463 if (ret) 1464 return ret; 1465 } 1466 1467 /* 1468 * the buddy cache inode stores the block bitmap 1469 * and buddy information in consecutive blocks. 1470 * So for each group we need two blocks. 1471 */ 1472 block = group * 2; 1473 pnum = block / blocks_per_page; 1474 poff = block % blocks_per_page; 1475 1476 /* we could use find_or_create_page(), but it locks page 1477 * what we'd like to avoid in fast path ... */ 1478 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1479 if (page == NULL || !PageUptodate(page)) { 1480 if (page) 1481 /* 1482 * drop the page reference and try 1483 * to get the page with lock. If we 1484 * are not uptodate that implies 1485 * somebody just created the page but 1486 * is yet to initialize the same. So 1487 * wait for it to initialize. 
1488 */ 1489 put_page(page); 1490 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1491 if (page) { 1492 BUG_ON(page->mapping != inode->i_mapping); 1493 if (!PageUptodate(page)) { 1494 ret = ext4_mb_init_cache(page, NULL, gfp); 1495 if (ret) { 1496 unlock_page(page); 1497 goto err; 1498 } 1499 mb_cmp_bitmaps(e4b, page_address(page) + 1500 (poff * sb->s_blocksize)); 1501 } 1502 unlock_page(page); 1503 } 1504 } 1505 if (page == NULL) { 1506 ret = -ENOMEM; 1507 goto err; 1508 } 1509 if (!PageUptodate(page)) { 1510 ret = -EIO; 1511 goto err; 1512 } 1513 1514 /* Pages marked accessed already */ 1515 e4b->bd_bitmap_page = page; 1516 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1517 1518 block++; 1519 pnum = block / blocks_per_page; 1520 poff = block % blocks_per_page; 1521 1522 page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 1523 if (page == NULL || !PageUptodate(page)) { 1524 if (page) 1525 put_page(page); 1526 page = find_or_create_page(inode->i_mapping, pnum, gfp); 1527 if (page) { 1528 BUG_ON(page->mapping != inode->i_mapping); 1529 if (!PageUptodate(page)) { 1530 ret = ext4_mb_init_cache(page, e4b->bd_bitmap, 1531 gfp); 1532 if (ret) { 1533 unlock_page(page); 1534 goto err; 1535 } 1536 } 1537 unlock_page(page); 1538 } 1539 } 1540 if (page == NULL) { 1541 ret = -ENOMEM; 1542 goto err; 1543 } 1544 if (!PageUptodate(page)) { 1545 ret = -EIO; 1546 goto err; 1547 } 1548 1549 /* Pages marked accessed already */ 1550 e4b->bd_buddy_page = page; 1551 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1552 1553 return 0; 1554 1555 err: 1556 if (page) 1557 put_page(page); 1558 if (e4b->bd_bitmap_page) 1559 put_page(e4b->bd_bitmap_page); 1560 if (e4b->bd_buddy_page) 1561 put_page(e4b->bd_buddy_page); 1562 e4b->bd_buddy = NULL; 1563 e4b->bd_bitmap = NULL; 1564 return ret; 1565 } 1566 1567 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1568 struct ext4_buddy *e4b) 1569 { 1570 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); 1571 } 1572 1573 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1574 { 1575 if (e4b->bd_bitmap_page) 1576 put_page(e4b->bd_bitmap_page); 1577 if (e4b->bd_buddy_page) 1578 put_page(e4b->bd_buddy_page); 1579 } 1580 1581 1582 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1583 { 1584 int order = 1, max; 1585 void *bb; 1586 1587 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1588 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1589 1590 while (order <= e4b->bd_blkbits + 1) { 1591 bb = mb_find_buddy(e4b, order, &max); 1592 if (!mb_test_bit(block >> order, bb)) { 1593 /* this block is part of buddy of order 'order' */ 1594 return order; 1595 } 1596 order++; 1597 } 1598 return 0; 1599 } 1600 1601 static void mb_clear_bits(void *bm, int cur, int len) 1602 { 1603 __u32 *addr; 1604 1605 len = cur + len; 1606 while (cur < len) { 1607 if ((cur & 31) == 0 && (len - cur) >= 32) { 1608 /* fast path: clear whole word at once */ 1609 addr = bm + (cur >> 3); 1610 *addr = 0; 1611 cur += 32; 1612 continue; 1613 } 1614 mb_clear_bit(cur, bm); 1615 cur++; 1616 } 1617 } 1618 1619 /* clear bits in given range 1620 * will return first found zero bit if any, -1 otherwise 1621 */ 1622 static int mb_test_and_clear_bits(void *bm, int cur, int len) 1623 { 1624 __u32 *addr; 1625 int zero_bit = -1; 1626 1627 len = cur + len; 1628 while (cur < len) { 1629 if ((cur & 31) == 0 && (len - cur) >= 32) { 1630 /* fast path: clear whole word at once */ 1631 addr = bm + (cur >> 3); 1632 if (*addr != (__u32)(-1) 
&& zero_bit == -1) 1633 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 1634 *addr = 0; 1635 cur += 32; 1636 continue; 1637 } 1638 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) 1639 zero_bit = cur; 1640 cur++; 1641 } 1642 1643 return zero_bit; 1644 } 1645 1646 void mb_set_bits(void *bm, int cur, int len) 1647 { 1648 __u32 *addr; 1649 1650 len = cur + len; 1651 while (cur < len) { 1652 if ((cur & 31) == 0 && (len - cur) >= 32) { 1653 /* fast path: set whole word at once */ 1654 addr = bm + (cur >> 3); 1655 *addr = 0xffffffff; 1656 cur += 32; 1657 continue; 1658 } 1659 mb_set_bit(cur, bm); 1660 cur++; 1661 } 1662 } 1663 1664 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) 1665 { 1666 if (mb_test_bit(*bit + side, bitmap)) { 1667 mb_clear_bit(*bit, bitmap); 1668 (*bit) -= side; 1669 return 1; 1670 } 1671 else { 1672 (*bit) += side; 1673 mb_set_bit(*bit, bitmap); 1674 return -1; 1675 } 1676 } 1677 1678 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) 1679 { 1680 int max; 1681 int order = 1; 1682 void *buddy = mb_find_buddy(e4b, order, &max); 1683 1684 while (buddy) { 1685 void *buddy2; 1686 1687 /* Bits in range [first; last] are known to be set since 1688 * corresponding blocks were allocated. Bits in range 1689 * (first; last) will stay set because they form buddies on 1690 * upper layer. We just deal with borders if they don't 1691 * align with upper layer and then go up. 1692 * Releasing entire group is all about clearing 1693 * single bit of highest order buddy. 1694 */ 1695 1696 /* Example: 1697 * --------------------------------- 1698 * | 1 | 1 | 1 | 1 | 1699 * --------------------------------- 1700 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1701 * --------------------------------- 1702 * 0 1 2 3 4 5 6 7 1703 * \_____________________/ 1704 * 1705 * Neither [1] nor [6] is aligned to above layer. 1706 * Left neighbour [0] is free, so mark it busy, 1707 * decrease bb_counters and extend range to 1708 * [0; 6] 1709 * Right neighbour [7] is busy. It can't be coaleasced with [6], so 1710 * mark [6] free, increase bb_counters and shrink range to 1711 * [0; 5]. 1712 * Then shift range to [0; 2], go up and do the same. 1713 */ 1714 1715 1716 if (first & 1) 1717 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 1718 if (!(last & 1)) 1719 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 1720 if (first > last) 1721 break; 1722 order++; 1723 1724 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { 1725 mb_clear_bits(buddy, first, last - first + 1); 1726 e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1727 break; 1728 } 1729 first >>= 1; 1730 last >>= 1; 1731 buddy = buddy2; 1732 } 1733 } 1734 1735 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1736 int first, int count) 1737 { 1738 int left_is_free = 0; 1739 int right_is_free = 0; 1740 int block; 1741 int last = first + count - 1; 1742 struct super_block *sb = e4b->bd_sb; 1743 1744 if (WARN_ON(count == 0)) 1745 return; 1746 BUG_ON(last >= (sb->s_blocksize << 3)); 1747 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1748 /* Don't bother if the block group is corrupt. 
*/ 1749 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) 1750 return; 1751 1752 mb_check_buddy(e4b); 1753 mb_free_blocks_double(inode, e4b, first, count); 1754 1755 this_cpu_inc(discard_pa_seq); 1756 e4b->bd_info->bb_free += count; 1757 if (first < e4b->bd_info->bb_first_free) 1758 e4b->bd_info->bb_first_free = first; 1759 1760 /* access memory sequentially: check left neighbour, 1761 * clear range and then check right neighbour 1762 */ 1763 if (first != 0) 1764 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 1765 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 1766 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 1767 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1768 1769 if (unlikely(block != -1)) { 1770 struct ext4_sb_info *sbi = EXT4_SB(sb); 1771 ext4_fsblk_t blocknr; 1772 1773 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1774 blocknr += EXT4_C2B(sbi, block); 1775 if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { 1776 ext4_grp_locked_error(sb, e4b->bd_group, 1777 inode ? inode->i_ino : 0, 1778 blocknr, 1779 "freeing already freed block (bit %u); block bitmap corrupt.", 1780 block); 1781 ext4_mark_group_bitmap_corrupted( 1782 sb, e4b->bd_group, 1783 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1784 } 1785 goto done; 1786 } 1787 1788 /* let's maintain fragments counter */ 1789 if (left_is_free && right_is_free) 1790 e4b->bd_info->bb_fragments--; 1791 else if (!left_is_free && !right_is_free) 1792 e4b->bd_info->bb_fragments++; 1793 1794 /* buddy[0] == bd_bitmap is a special case, so handle 1795 * it right away and let mb_buddy_mark_free stay free of 1796 * zero order checks. 1797 * Check if neighbours are to be coaleasced, 1798 * adjust bitmap bb_counters and borders appropriately. 1799 */ 1800 if (first & 1) { 1801 first += !left_is_free; 1802 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; 1803 } 1804 if (!(last & 1)) { 1805 last -= !right_is_free; 1806 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; 1807 } 1808 1809 if (first <= last) 1810 mb_buddy_mark_free(e4b, first >> 1, last >> 1); 1811 1812 done: 1813 mb_set_largest_free_order(sb, e4b->bd_info); 1814 mb_update_avg_fragment_size(sb, e4b->bd_info); 1815 mb_check_buddy(e4b); 1816 } 1817 1818 static int mb_find_extent(struct ext4_buddy *e4b, int block, 1819 int needed, struct ext4_free_extent *ex) 1820 { 1821 int next = block; 1822 int max, order; 1823 void *buddy; 1824 1825 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1826 BUG_ON(ex == NULL); 1827 1828 buddy = mb_find_buddy(e4b, 0, &max); 1829 BUG_ON(buddy == NULL); 1830 BUG_ON(block >= max); 1831 if (mb_test_bit(block, buddy)) { 1832 ex->fe_len = 0; 1833 ex->fe_start = 0; 1834 ex->fe_group = 0; 1835 return 0; 1836 } 1837 1838 /* find actual order */ 1839 order = mb_find_order_for_block(e4b, block); 1840 block = block >> order; 1841 1842 ex->fe_len = 1 << order; 1843 ex->fe_start = block << order; 1844 ex->fe_group = e4b->bd_group; 1845 1846 /* calc difference from given start */ 1847 next = next - ex->fe_start; 1848 ex->fe_len -= next; 1849 ex->fe_start += next; 1850 1851 while (needed > ex->fe_len && 1852 mb_find_buddy(e4b, order, &max)) { 1853 1854 if (block + 1 >= max) 1855 break; 1856 1857 next = (block + 1) * (1 << order); 1858 if (mb_test_bit(next, e4b->bd_bitmap)) 1859 break; 1860 1861 order = mb_find_order_for_block(e4b, next); 1862 1863 block = next >> order; 1864 ex->fe_len += 1 << order; 1865 } 1866 1867 if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { 1868 /* Should never happen! 
(but apparently sometimes does?!?) */ 1869 WARN_ON(1); 1870 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, 1871 "corruption or bug in mb_find_extent " 1872 "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", 1873 block, order, needed, ex->fe_group, ex->fe_start, 1874 ex->fe_len, ex->fe_logical); 1875 ex->fe_len = 0; 1876 ex->fe_start = 0; 1877 ex->fe_group = 0; 1878 } 1879 return ex->fe_len; 1880 } 1881 1882 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1883 { 1884 int ord; 1885 int mlen = 0; 1886 int max = 0; 1887 int cur; 1888 int start = ex->fe_start; 1889 int len = ex->fe_len; 1890 unsigned ret = 0; 1891 int len0 = len; 1892 void *buddy; 1893 bool split = false; 1894 1895 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1896 BUG_ON(e4b->bd_group != ex->fe_group); 1897 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1898 mb_check_buddy(e4b); 1899 mb_mark_used_double(e4b, start, len); 1900 1901 this_cpu_inc(discard_pa_seq); 1902 e4b->bd_info->bb_free -= len; 1903 if (e4b->bd_info->bb_first_free == start) 1904 e4b->bd_info->bb_first_free += len; 1905 1906 /* let's maintain fragments counter */ 1907 if (start != 0) 1908 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 1909 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1910 max = !mb_test_bit(start + len, e4b->bd_bitmap); 1911 if (mlen && max) 1912 e4b->bd_info->bb_fragments++; 1913 else if (!mlen && !max) 1914 e4b->bd_info->bb_fragments--; 1915 1916 /* let's maintain buddy itself */ 1917 while (len) { 1918 if (!split) 1919 ord = mb_find_order_for_block(e4b, start); 1920 1921 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1922 /* the whole chunk may be allocated at once! */ 1923 mlen = 1 << ord; 1924 if (!split) 1925 buddy = mb_find_buddy(e4b, ord, &max); 1926 else 1927 split = false; 1928 BUG_ON((start >> ord) >= max); 1929 mb_set_bit(start >> ord, buddy); 1930 e4b->bd_info->bb_counters[ord]--; 1931 start += mlen; 1932 len -= mlen; 1933 BUG_ON(len < 0); 1934 continue; 1935 } 1936 1937 /* store for history */ 1938 if (ret == 0) 1939 ret = len | (ord << 16); 1940 1941 /* we have to split large buddy */ 1942 BUG_ON(ord <= 0); 1943 buddy = mb_find_buddy(e4b, ord, &max); 1944 mb_set_bit(start >> ord, buddy); 1945 e4b->bd_info->bb_counters[ord]--; 1946 1947 ord--; 1948 cur = (start >> ord) & ~1U; 1949 buddy = mb_find_buddy(e4b, ord, &max); 1950 mb_clear_bit(cur, buddy); 1951 mb_clear_bit(cur + 1, buddy); 1952 e4b->bd_info->bb_counters[ord]++; 1953 e4b->bd_info->bb_counters[ord]++; 1954 split = true; 1955 } 1956 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1957 1958 mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); 1959 mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1960 mb_check_buddy(e4b); 1961 1962 return ret; 1963 } 1964 1965 /* 1966 * Must be called under group lock! 
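 *
 * Note on the mb_mark_used() return value consumed below: it packs two
 * fields for statistics -- the low 16 bits carry the length that was
 * left when the first buddy split happened and the high 16 bits carry
 * the order that had to be split (mb_mark_used() stores
 * "ret = len | (ord << 16)" the first time it splits).  Hence:
 *
 *	ac->ac_tail  = ret & 0xffff;
 *	ac->ac_buddy = ret >> 16;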
1967 */ 1968 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1969 struct ext4_buddy *e4b) 1970 { 1971 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1972 int ret; 1973 1974 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1975 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1976 1977 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1978 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1979 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1980 1981 /* preallocation can change ac_b_ex, thus we store actually 1982 * allocated blocks for history */ 1983 ac->ac_f_ex = ac->ac_b_ex; 1984 1985 ac->ac_status = AC_STATUS_FOUND; 1986 ac->ac_tail = ret & 0xffff; 1987 ac->ac_buddy = ret >> 16; 1988 1989 /* 1990 * take the page reference. We want the page to be pinned 1991 * so that we don't get a ext4_mb_init_cache_call for this 1992 * group until we update the bitmap. That would mean we 1993 * double allocate blocks. The reference is dropped 1994 * in ext4_mb_release_context 1995 */ 1996 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1997 get_page(ac->ac_bitmap_page); 1998 ac->ac_buddy_page = e4b->bd_buddy_page; 1999 get_page(ac->ac_buddy_page); 2000 /* store last allocated for subsequent stream allocation */ 2001 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2002 spin_lock(&sbi->s_md_lock); 2003 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 2004 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 2005 spin_unlock(&sbi->s_md_lock); 2006 } 2007 /* 2008 * As we've just preallocated more space than 2009 * user requested originally, we store allocated 2010 * space in a special descriptor. 2011 */ 2012 if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 2013 ext4_mb_new_preallocation(ac); 2014 2015 } 2016 2017 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 2018 struct ext4_buddy *e4b, 2019 int finish_group) 2020 { 2021 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2022 struct ext4_free_extent *bex = &ac->ac_b_ex; 2023 struct ext4_free_extent *gex = &ac->ac_g_ex; 2024 struct ext4_free_extent ex; 2025 int max; 2026 2027 if (ac->ac_status == AC_STATUS_FOUND) 2028 return; 2029 /* 2030 * We don't want to scan for a whole year 2031 */ 2032 if (ac->ac_found > sbi->s_mb_max_to_scan && 2033 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2034 ac->ac_status = AC_STATUS_BREAK; 2035 return; 2036 } 2037 2038 /* 2039 * Haven't found good chunk so far, let's continue 2040 */ 2041 if (bex->fe_len < gex->fe_len) 2042 return; 2043 2044 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 2045 && bex->fe_group == e4b->bd_group) { 2046 /* recheck chunk's availability - we don't know 2047 * when it was found (within this lock-unlock 2048 * period or not) */ 2049 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); 2050 if (max >= gex->fe_len) { 2051 ext4_mb_use_best_found(ac, e4b); 2052 return; 2053 } 2054 } 2055 } 2056 2057 /* 2058 * The routine checks whether found extent is good enough. If it is, 2059 * then the extent gets marked used and flag is set to the context 2060 * to stop scanning. Otherwise, the extent is compared with the 2061 * previous found extent and if new one is better, then it's stored 2062 * in the context. Later, the best found extent will be used, if 2063 * mballoc can't find good enough extent. 2064 * 2065 * FIXME: real allocation policy is to be designed yet! 
2066 */ 2067 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 2068 struct ext4_free_extent *ex, 2069 struct ext4_buddy *e4b) 2070 { 2071 struct ext4_free_extent *bex = &ac->ac_b_ex; 2072 struct ext4_free_extent *gex = &ac->ac_g_ex; 2073 2074 BUG_ON(ex->fe_len <= 0); 2075 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2076 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 2077 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 2078 2079 ac->ac_found++; 2080 2081 /* 2082 * The special case - take what you catch first 2083 */ 2084 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2085 *bex = *ex; 2086 ext4_mb_use_best_found(ac, e4b); 2087 return; 2088 } 2089 2090 /* 2091 * Let's check whether the chuck is good enough 2092 */ 2093 if (ex->fe_len == gex->fe_len) { 2094 *bex = *ex; 2095 ext4_mb_use_best_found(ac, e4b); 2096 return; 2097 } 2098 2099 /* 2100 * If this is first found extent, just store it in the context 2101 */ 2102 if (bex->fe_len == 0) { 2103 *bex = *ex; 2104 return; 2105 } 2106 2107 /* 2108 * If new found extent is better, store it in the context 2109 */ 2110 if (bex->fe_len < gex->fe_len) { 2111 /* if the request isn't satisfied, any found extent 2112 * larger than previous best one is better */ 2113 if (ex->fe_len > bex->fe_len) 2114 *bex = *ex; 2115 } else if (ex->fe_len > gex->fe_len) { 2116 /* if the request is satisfied, then we try to find 2117 * an extent that still satisfy the request, but is 2118 * smaller than previous one */ 2119 if (ex->fe_len < bex->fe_len) 2120 *bex = *ex; 2121 } 2122 2123 ext4_mb_check_limits(ac, e4b, 0); 2124 } 2125 2126 static noinline_for_stack 2127 int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 2128 struct ext4_buddy *e4b) 2129 { 2130 struct ext4_free_extent ex = ac->ac_b_ex; 2131 ext4_group_t group = ex.fe_group; 2132 int max; 2133 int err; 2134 2135 BUG_ON(ex.fe_len <= 0); 2136 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2137 if (err) 2138 return err; 2139 2140 ext4_lock_group(ac->ac_sb, group); 2141 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); 2142 2143 if (max > 0) { 2144 ac->ac_b_ex = ex; 2145 ext4_mb_use_best_found(ac, e4b); 2146 } 2147 2148 ext4_unlock_group(ac->ac_sb, group); 2149 ext4_mb_unload_buddy(e4b); 2150 2151 return 0; 2152 } 2153 2154 static noinline_for_stack 2155 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 2156 struct ext4_buddy *e4b) 2157 { 2158 ext4_group_t group = ac->ac_g_ex.fe_group; 2159 int max; 2160 int err; 2161 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2162 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2163 struct ext4_free_extent ex; 2164 2165 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 2166 return 0; 2167 if (grp->bb_free == 0) 2168 return 0; 2169 2170 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 2171 if (err) 2172 return err; 2173 2174 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { 2175 ext4_mb_unload_buddy(e4b); 2176 return 0; 2177 } 2178 2179 ext4_lock_group(ac->ac_sb, group); 2180 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 2181 ac->ac_g_ex.fe_len, &ex); 2182 ex.fe_logical = 0xDEADFA11; /* debug value */ 2183 2184 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 2185 ext4_fsblk_t start; 2186 2187 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + 2188 ex.fe_start; 2189 /* use do_div to get remainder (would be 64-bit modulo) */ 2190 if (do_div(start, sbi->s_stripe) == 0) { 2191 ac->ac_found++; 2192 ac->ac_b_ex = ex; 2193 
ext4_mb_use_best_found(ac, e4b); 2194 } 2195 } else if (max >= ac->ac_g_ex.fe_len) { 2196 BUG_ON(ex.fe_len <= 0); 2197 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2198 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2199 ac->ac_found++; 2200 ac->ac_b_ex = ex; 2201 ext4_mb_use_best_found(ac, e4b); 2202 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 2203 /* Sometimes, caller may want to merge even small 2204 * number of blocks to an existing extent */ 2205 BUG_ON(ex.fe_len <= 0); 2206 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 2207 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 2208 ac->ac_found++; 2209 ac->ac_b_ex = ex; 2210 ext4_mb_use_best_found(ac, e4b); 2211 } 2212 ext4_unlock_group(ac->ac_sb, group); 2213 ext4_mb_unload_buddy(e4b); 2214 2215 return 0; 2216 } 2217 2218 /* 2219 * The routine scans buddy structures (not bitmap!) from given order 2220 * to max order and tries to find big enough chunk to satisfy the req 2221 */ 2222 static noinline_for_stack 2223 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 2224 struct ext4_buddy *e4b) 2225 { 2226 struct super_block *sb = ac->ac_sb; 2227 struct ext4_group_info *grp = e4b->bd_info; 2228 void *buddy; 2229 int i; 2230 int k; 2231 int max; 2232 2233 BUG_ON(ac->ac_2order <= 0); 2234 for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { 2235 if (grp->bb_counters[i] == 0) 2236 continue; 2237 2238 buddy = mb_find_buddy(e4b, i, &max); 2239 BUG_ON(buddy == NULL); 2240 2241 k = mb_find_next_zero_bit(buddy, max, 0); 2242 if (k >= max) { 2243 ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, 2244 "%d free clusters of order %d. But found 0", 2245 grp->bb_counters[i], i); 2246 ext4_mark_group_bitmap_corrupted(ac->ac_sb, 2247 e4b->bd_group, 2248 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2249 break; 2250 } 2251 ac->ac_found++; 2252 2253 ac->ac_b_ex.fe_len = 1 << i; 2254 ac->ac_b_ex.fe_start = k << i; 2255 ac->ac_b_ex.fe_group = e4b->bd_group; 2256 2257 ext4_mb_use_best_found(ac, e4b); 2258 2259 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); 2260 2261 if (EXT4_SB(sb)->s_mb_stats) 2262 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 2263 2264 break; 2265 } 2266 } 2267 2268 /* 2269 * The routine scans the group and measures all found extents. 2270 * In order to optimize scanning, caller must pass number of 2271 * free blocks in the group, so the routine can know upper limit. 2272 */ 2273 static noinline_for_stack 2274 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 2275 struct ext4_buddy *e4b) 2276 { 2277 struct super_block *sb = ac->ac_sb; 2278 void *bitmap = e4b->bd_bitmap; 2279 struct ext4_free_extent ex; 2280 int i; 2281 int free; 2282 2283 free = e4b->bd_info->bb_free; 2284 if (WARN_ON(free <= 0)) 2285 return; 2286 2287 i = e4b->bd_info->bb_first_free; 2288 2289 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 2290 i = mb_find_next_zero_bit(bitmap, 2291 EXT4_CLUSTERS_PER_GROUP(sb), i); 2292 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { 2293 /* 2294 * IF we have corrupt bitmap, we won't find any 2295 * free blocks even though group info says we 2296 * have free blocks 2297 */ 2298 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2299 "%d free clusters as per " 2300 "group info. 
But bitmap says 0", 2301 free); 2302 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2303 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2304 break; 2305 } 2306 2307 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); 2308 if (WARN_ON(ex.fe_len <= 0)) 2309 break; 2310 if (free < ex.fe_len) { 2311 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 2312 "%d free clusters as per " 2313 "group info. But got %d blocks", 2314 free, ex.fe_len); 2315 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, 2316 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 2317 /* 2318 * The number of free blocks differs. This mostly 2319 * indicate that the bitmap is corrupt. So exit 2320 * without claiming the space. 2321 */ 2322 break; 2323 } 2324 ex.fe_logical = 0xDEADC0DE; /* debug value */ 2325 ext4_mb_measure_extent(ac, &ex, e4b); 2326 2327 i += ex.fe_len; 2328 free -= ex.fe_len; 2329 } 2330 2331 ext4_mb_check_limits(ac, e4b, 1); 2332 } 2333 2334 /* 2335 * This is a special case for storages like raid5 2336 * we try to find stripe-aligned chunks for stripe-size-multiple requests 2337 */ 2338 static noinline_for_stack 2339 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 2340 struct ext4_buddy *e4b) 2341 { 2342 struct super_block *sb = ac->ac_sb; 2343 struct ext4_sb_info *sbi = EXT4_SB(sb); 2344 void *bitmap = e4b->bd_bitmap; 2345 struct ext4_free_extent ex; 2346 ext4_fsblk_t first_group_block; 2347 ext4_fsblk_t a; 2348 ext4_grpblk_t i; 2349 int max; 2350 2351 BUG_ON(sbi->s_stripe == 0); 2352 2353 /* find first stripe-aligned block in group */ 2354 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); 2355 2356 a = first_group_block + sbi->s_stripe - 1; 2357 do_div(a, sbi->s_stripe); 2358 i = (a * sbi->s_stripe) - first_group_block; 2359 2360 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 2361 if (!mb_test_bit(i, bitmap)) { 2362 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 2363 if (max >= sbi->s_stripe) { 2364 ac->ac_found++; 2365 ex.fe_logical = 0xDEADF00D; /* debug value */ 2366 ac->ac_b_ex = ex; 2367 ext4_mb_use_best_found(ac, e4b); 2368 break; 2369 } 2370 } 2371 i += sbi->s_stripe; 2372 } 2373 } 2374 2375 /* 2376 * This is also called BEFORE we load the buddy bitmap. 2377 * Returns either 1 or 0 indicating that the group is either suitable 2378 * for the allocation or not. 
2379 */ 2380 static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 2381 ext4_group_t group, int cr) 2382 { 2383 ext4_grpblk_t free, fragments; 2384 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 2385 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2386 2387 BUG_ON(cr < 0 || cr >= 4); 2388 2389 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 2390 return false; 2391 2392 free = grp->bb_free; 2393 if (free == 0) 2394 return false; 2395 2396 fragments = grp->bb_fragments; 2397 if (fragments == 0) 2398 return false; 2399 2400 switch (cr) { 2401 case 0: 2402 BUG_ON(ac->ac_2order == 0); 2403 2404 /* Avoid using the first bg of a flexgroup for data files */ 2405 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 2406 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 2407 ((group % flex_size) == 0)) 2408 return false; 2409 2410 if (free < ac->ac_g_ex.fe_len) 2411 return false; 2412 2413 if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) 2414 return true; 2415 2416 if (grp->bb_largest_free_order < ac->ac_2order) 2417 return false; 2418 2419 return true; 2420 case 1: 2421 if ((free / fragments) >= ac->ac_g_ex.fe_len) 2422 return true; 2423 break; 2424 case 2: 2425 if (free >= ac->ac_g_ex.fe_len) 2426 return true; 2427 break; 2428 case 3: 2429 return true; 2430 default: 2431 BUG(); 2432 } 2433 2434 return false; 2435 } 2436 2437 /* 2438 * This could return negative error code if something goes wrong 2439 * during ext4_mb_init_group(). This should not be called with 2440 * ext4_lock_group() held. 2441 * 2442 * Note: because we are conditionally operating with the group lock in 2443 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this 2444 * function using __acquire and __release. This means we need to be 2445 * super careful before messing with the error path handling via "goto 2446 * out"! 2447 */ 2448 static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, 2449 ext4_group_t group, int cr) 2450 { 2451 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 2452 struct super_block *sb = ac->ac_sb; 2453 struct ext4_sb_info *sbi = EXT4_SB(sb); 2454 bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; 2455 ext4_grpblk_t free; 2456 int ret = 0; 2457 2458 if (sbi->s_mb_stats) 2459 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); 2460 if (should_lock) { 2461 ext4_lock_group(sb, group); 2462 __release(ext4_group_lock_ptr(sb, group)); 2463 } 2464 free = grp->bb_free; 2465 if (free == 0) 2466 goto out; 2467 if (cr <= 2 && free < ac->ac_g_ex.fe_len) 2468 goto out; 2469 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 2470 goto out; 2471 if (should_lock) { 2472 __acquire(ext4_group_lock_ptr(sb, group)); 2473 ext4_unlock_group(sb, group); 2474 } 2475 2476 /* We only do this if the grp has never been initialized */ 2477 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2478 struct ext4_group_desc *gdp = 2479 ext4_get_group_desc(sb, group, NULL); 2480 int ret; 2481 2482 /* cr=0/1 is a very optimistic search to find large 2483 * good chunks almost for free. If buddy data is not 2484 * ready, then this optimization makes no sense. But 2485 * we never skip the first block group in a flex_bg, 2486 * since this gets used for metadata block allocation, 2487 * and we want to make sure we locate metadata blocks 2488 * in the first block group in the flex_bg if possible. 
2489 */ 2490 if (cr < 2 && 2491 (!sbi->s_log_groups_per_flex || 2492 ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && 2493 !(ext4_has_group_desc_csum(sb) && 2494 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) 2495 return 0; 2496 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 2497 if (ret) 2498 return ret; 2499 } 2500 2501 if (should_lock) { 2502 ext4_lock_group(sb, group); 2503 __release(ext4_group_lock_ptr(sb, group)); 2504 } 2505 ret = ext4_mb_good_group(ac, group, cr); 2506 out: 2507 if (should_lock) { 2508 __acquire(ext4_group_lock_ptr(sb, group)); 2509 ext4_unlock_group(sb, group); 2510 } 2511 return ret; 2512 } 2513 2514 /* 2515 * Start prefetching @nr block bitmaps starting at @group. 2516 * Return the next group which needs to be prefetched. 2517 */ 2518 ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, 2519 unsigned int nr, int *cnt) 2520 { 2521 ext4_group_t ngroups = ext4_get_groups_count(sb); 2522 struct buffer_head *bh; 2523 struct blk_plug plug; 2524 2525 blk_start_plug(&plug); 2526 while (nr-- > 0) { 2527 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, 2528 NULL); 2529 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 2530 2531 /* 2532 * Prefetch block groups with free blocks; but don't 2533 * bother if it is marked uninitialized on disk, since 2534 * it won't require I/O to read. Also only try to 2535 * prefetch once, so we avoid getblk() call, which can 2536 * be expensive. 2537 */ 2538 if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && 2539 EXT4_MB_GRP_NEED_INIT(grp) && 2540 ext4_free_group_clusters(sb, gdp) > 0 && 2541 !(ext4_has_group_desc_csum(sb) && 2542 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { 2543 bh = ext4_read_block_bitmap_nowait(sb, group, true); 2544 if (bh && !IS_ERR(bh)) { 2545 if (!buffer_uptodate(bh) && cnt) 2546 (*cnt)++; 2547 brelse(bh); 2548 } 2549 } 2550 if (++group >= ngroups) 2551 group = 0; 2552 } 2553 blk_finish_plug(&plug); 2554 return group; 2555 } 2556 2557 /* 2558 * Prefetching reads the block bitmap into the buffer cache; but we 2559 * need to make sure that the buddy bitmap in the page cache has been 2560 * initialized. Note that ext4_mb_init_group() will block if the I/O 2561 * is not yet completed, or indeed if it was not initiated by 2562 * ext4_mb_prefetch did not start the I/O. 2563 * 2564 * TODO: We should actually kick off the buddy bitmap setup in a work 2565 * queue when the buffer I/O is completed, so that we don't block 2566 * waiting for the block allocation bitmap read to finish when 2567 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). 
2568 */ 2569 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, 2570 unsigned int nr) 2571 { 2572 while (nr-- > 0) { 2573 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, 2574 NULL); 2575 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 2576 2577 if (!group) 2578 group = ext4_get_groups_count(sb); 2579 group--; 2580 grp = ext4_get_group_info(sb, group); 2581 2582 if (EXT4_MB_GRP_NEED_INIT(grp) && 2583 ext4_free_group_clusters(sb, gdp) > 0 && 2584 !(ext4_has_group_desc_csum(sb) && 2585 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { 2586 if (ext4_mb_init_group(sb, group, GFP_NOFS)) 2587 break; 2588 } 2589 } 2590 } 2591 2592 static noinline_for_stack int 2593 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2594 { 2595 ext4_group_t prefetch_grp = 0, ngroups, group, i; 2596 int cr = -1, new_cr; 2597 int err = 0, first_err = 0; 2598 unsigned int nr = 0, prefetch_ios = 0; 2599 struct ext4_sb_info *sbi; 2600 struct super_block *sb; 2601 struct ext4_buddy e4b; 2602 int lost; 2603 2604 sb = ac->ac_sb; 2605 sbi = EXT4_SB(sb); 2606 ngroups = ext4_get_groups_count(sb); 2607 /* non-extent files are limited to low blocks/groups */ 2608 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 2609 ngroups = sbi->s_blockfile_groups; 2610 2611 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2612 2613 /* first, try the goal */ 2614 err = ext4_mb_find_by_goal(ac, &e4b); 2615 if (err || ac->ac_status == AC_STATUS_FOUND) 2616 goto out; 2617 2618 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2619 goto out; 2620 2621 /* 2622 * ac->ac_2order is set only if the fe_len is a power of 2 2623 * if ac->ac_2order is set we also set criteria to 0 so that we 2624 * try exact allocation using buddy. 2625 */ 2626 i = fls(ac->ac_g_ex.fe_len); 2627 ac->ac_2order = 0; 2628 /* 2629 * We search using buddy data only if the order of the request 2630 * is greater than equal to the sbi_s_mb_order2_reqs 2631 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 2632 * We also support searching for power-of-two requests only for 2633 * requests upto maximum buddy size we have constructed. 2634 */ 2635 if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { 2636 /* 2637 * This should tell if fe_len is exactly power of 2 2638 */ 2639 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2640 ac->ac_2order = array_index_nospec(i - 1, 2641 MB_NUM_ORDERS(sb)); 2642 } 2643 2644 /* if stream allocation is enabled, use global goal */ 2645 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2646 /* TBD: may be hot point */ 2647 spin_lock(&sbi->s_md_lock); 2648 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2649 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2650 spin_unlock(&sbi->s_md_lock); 2651 } 2652 2653 /* Let's just scan groups to find more-less suitable blocks */ 2654 cr = ac->ac_2order ? 
0 : 1; 2655 /* 2656 * cr == 0 try to get exact allocation, 2657 * cr == 3 try to get anything 2658 */ 2659 repeat: 2660 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2661 ac->ac_criteria = cr; 2662 /* 2663 * searching for the right group start 2664 * from the goal value specified 2665 */ 2666 group = ac->ac_g_ex.fe_group; 2667 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2668 prefetch_grp = group; 2669 2670 for (i = 0, new_cr = cr; i < ngroups; i++, 2671 ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 2672 int ret = 0; 2673 2674 cond_resched(); 2675 if (new_cr != cr) { 2676 cr = new_cr; 2677 goto repeat; 2678 } 2679 2680 /* 2681 * Batch reads of the block allocation bitmaps 2682 * to get multiple READs in flight; limit 2683 * prefetching at cr=0/1, otherwise mballoc can 2684 * spend a lot of time loading imperfect groups 2685 */ 2686 if ((prefetch_grp == group) && 2687 (cr > 1 || 2688 prefetch_ios < sbi->s_mb_prefetch_limit)) { 2689 unsigned int curr_ios = prefetch_ios; 2690 2691 nr = sbi->s_mb_prefetch; 2692 if (ext4_has_feature_flex_bg(sb)) { 2693 nr = 1 << sbi->s_log_groups_per_flex; 2694 nr -= group & (nr - 1); 2695 nr = min(nr, sbi->s_mb_prefetch); 2696 } 2697 prefetch_grp = ext4_mb_prefetch(sb, group, 2698 nr, &prefetch_ios); 2699 if (prefetch_ios == curr_ios) 2700 nr = 0; 2701 } 2702 2703 /* This now checks without needing the buddy page */ 2704 ret = ext4_mb_good_group_nolock(ac, group, cr); 2705 if (ret <= 0) { 2706 if (!first_err) 2707 first_err = ret; 2708 continue; 2709 } 2710 2711 err = ext4_mb_load_buddy(sb, group, &e4b); 2712 if (err) 2713 goto out; 2714 2715 ext4_lock_group(sb, group); 2716 2717 /* 2718 * We need to check again after locking the 2719 * block group 2720 */ 2721 ret = ext4_mb_good_group(ac, group, cr); 2722 if (ret == 0) { 2723 ext4_unlock_group(sb, group); 2724 ext4_mb_unload_buddy(&e4b); 2725 continue; 2726 } 2727 2728 ac->ac_groups_scanned++; 2729 if (cr == 0) 2730 ext4_mb_simple_scan_group(ac, &e4b); 2731 else if (cr == 1 && sbi->s_stripe && 2732 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2733 ext4_mb_scan_aligned(ac, &e4b); 2734 else 2735 ext4_mb_complex_scan_group(ac, &e4b); 2736 2737 ext4_unlock_group(sb, group); 2738 ext4_mb_unload_buddy(&e4b); 2739 2740 if (ac->ac_status != AC_STATUS_CONTINUE) 2741 break; 2742 } 2743 /* Processed all groups and haven't found blocks */ 2744 if (sbi->s_mb_stats && i == ngroups) 2745 atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2746 } 2747 2748 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2749 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2750 /* 2751 * We've been searching too long. Let's try to allocate 2752 * the best chunk we've found so far 2753 */ 2754 ext4_mb_try_best_found(ac, &e4b); 2755 if (ac->ac_status != AC_STATUS_FOUND) { 2756 /* 2757 * Someone more lucky has already allocated it. 
2758 * The only thing we can do is just take first 2759 * found block(s) 2760 */ 2761 lost = atomic_inc_return(&sbi->s_mb_lost_chunks); 2762 mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", 2763 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, 2764 ac->ac_b_ex.fe_len, lost); 2765 2766 ac->ac_b_ex.fe_group = 0; 2767 ac->ac_b_ex.fe_start = 0; 2768 ac->ac_b_ex.fe_len = 0; 2769 ac->ac_status = AC_STATUS_CONTINUE; 2770 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2771 cr = 3; 2772 goto repeat; 2773 } 2774 } 2775 2776 if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2777 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2778 out: 2779 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 2780 err = first_err; 2781 2782 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", 2783 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, 2784 ac->ac_flags, cr, err); 2785 2786 if (nr) 2787 ext4_mb_prefetch_fini(sb, prefetch_grp, nr); 2788 2789 return err; 2790 } 2791 2792 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2793 { 2794 struct super_block *sb = pde_data(file_inode(seq->file)); 2795 ext4_group_t group; 2796 2797 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2798 return NULL; 2799 group = *pos + 1; 2800 return (void *) ((unsigned long) group); 2801 } 2802 2803 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2804 { 2805 struct super_block *sb = pde_data(file_inode(seq->file)); 2806 ext4_group_t group; 2807 2808 ++*pos; 2809 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2810 return NULL; 2811 group = *pos + 1; 2812 return (void *) ((unsigned long) group); 2813 } 2814 2815 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2816 { 2817 struct super_block *sb = pde_data(file_inode(seq->file)); 2818 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2819 int i; 2820 int err, buddy_loaded = 0; 2821 struct ext4_buddy e4b; 2822 struct ext4_group_info *grinfo; 2823 unsigned char blocksize_bits = min_t(unsigned char, 2824 sb->s_blocksize_bits, 2825 EXT4_MAX_BLOCK_LOG_SIZE); 2826 struct sg { 2827 struct ext4_group_info info; 2828 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; 2829 } sg; 2830 2831 group--; 2832 if (group == 0) 2833 seq_puts(seq, "#group: free frags first [" 2834 " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " 2835 " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); 2836 2837 i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2838 sizeof(struct ext4_group_info); 2839 2840 grinfo = ext4_get_group_info(sb, group); 2841 /* Load the group info in memory only if not already loaded. */ 2842 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { 2843 err = ext4_mb_load_buddy(sb, group, &e4b); 2844 if (err) { 2845 seq_printf(seq, "#%-5u: I/O error\n", group); 2846 return 0; 2847 } 2848 buddy_loaded = 1; 2849 } 2850 2851 memcpy(&sg, ext4_get_group_info(sb, group), i); 2852 2853 if (buddy_loaded) 2854 ext4_mb_unload_buddy(&e4b); 2855 2856 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2857 sg.info.bb_fragments, sg.info.bb_first_free); 2858 for (i = 0; i <= 13; i++) 2859 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
2860 sg.info.bb_counters[i] : 0); 2861 seq_puts(seq, " ]\n"); 2862 2863 return 0; 2864 } 2865 2866 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2867 { 2868 } 2869 2870 const struct seq_operations ext4_mb_seq_groups_ops = { 2871 .start = ext4_mb_seq_groups_start, 2872 .next = ext4_mb_seq_groups_next, 2873 .stop = ext4_mb_seq_groups_stop, 2874 .show = ext4_mb_seq_groups_show, 2875 }; 2876 2877 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) 2878 { 2879 struct super_block *sb = seq->private; 2880 struct ext4_sb_info *sbi = EXT4_SB(sb); 2881 2882 seq_puts(seq, "mballoc:\n"); 2883 if (!sbi->s_mb_stats) { 2884 seq_puts(seq, "\tmb stats collection turned off.\n"); 2885 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); 2886 return 0; 2887 } 2888 seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); 2889 seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); 2890 2891 seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); 2892 2893 seq_puts(seq, "\tcr0_stats:\n"); 2894 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); 2895 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2896 atomic64_read(&sbi->s_bal_cX_groups_considered[0])); 2897 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2898 atomic64_read(&sbi->s_bal_cX_failed[0])); 2899 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2900 atomic_read(&sbi->s_bal_cr0_bad_suggestions)); 2901 2902 seq_puts(seq, "\tcr1_stats:\n"); 2903 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); 2904 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2905 atomic64_read(&sbi->s_bal_cX_groups_considered[1])); 2906 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2907 atomic64_read(&sbi->s_bal_cX_failed[1])); 2908 seq_printf(seq, "\t\tbad_suggestions: %u\n", 2909 atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2910 2911 seq_puts(seq, "\tcr2_stats:\n"); 2912 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); 2913 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2914 atomic64_read(&sbi->s_bal_cX_groups_considered[2])); 2915 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2916 atomic64_read(&sbi->s_bal_cX_failed[2])); 2917 2918 seq_puts(seq, "\tcr3_stats:\n"); 2919 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); 2920 seq_printf(seq, "\t\tgroups_considered: %llu\n", 2921 atomic64_read(&sbi->s_bal_cX_groups_considered[3])); 2922 seq_printf(seq, "\t\tuseless_loops: %llu\n", 2923 atomic64_read(&sbi->s_bal_cX_failed[3])); 2924 seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); 2925 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 2926 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); 2927 seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); 2928 seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); 2929 2930 seq_printf(seq, "\tbuddies_generated: %u/%u\n", 2931 atomic_read(&sbi->s_mb_buddies_generated), 2932 ext4_get_groups_count(sb)); 2933 seq_printf(seq, "\tbuddies_time_used: %llu\n", 2934 atomic64_read(&sbi->s_mb_generation_time)); 2935 seq_printf(seq, "\tpreallocated: %u\n", 2936 atomic_read(&sbi->s_mb_preallocated)); 2937 seq_printf(seq, "\tdiscarded: %u\n", 2938 atomic_read(&sbi->s_mb_discarded)); 2939 return 0; 2940 } 2941 2942 static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) 2943 __acquires(&EXT4_SB(sb)->s_mb_rb_lock) 2944 { 2945 
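	/*
	 * seq_file stops iterating as soon as ->start()/->next() return
	 * NULL, so the position handed to ->show() is biased by one here:
	 * *pos 0 .. MB_NUM_ORDERS(sb)-1 selects the largest-free-order
	 * lists and *pos MB_NUM_ORDERS(sb) .. 2*MB_NUM_ORDERS(sb)-1 the
	 * average-fragment-size lists (note the position-- in ->show()).
	 */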
struct super_block *sb = pde_data(file_inode(seq->file)); 2946 unsigned long position; 2947 2948 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2949 return NULL; 2950 position = *pos + 1; 2951 return (void *) ((unsigned long) position); 2952 } 2953 2954 static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) 2955 { 2956 struct super_block *sb = pde_data(file_inode(seq->file)); 2957 unsigned long position; 2958 2959 ++*pos; 2960 if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2961 return NULL; 2962 position = *pos + 1; 2963 return (void *) ((unsigned long) position); 2964 } 2965 2966 static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) 2967 { 2968 struct super_block *sb = pde_data(file_inode(seq->file)); 2969 struct ext4_sb_info *sbi = EXT4_SB(sb); 2970 unsigned long position = ((unsigned long) v); 2971 struct ext4_group_info *grp; 2972 unsigned int count; 2973 2974 position--; 2975 if (position >= MB_NUM_ORDERS(sb)) { 2976 position -= MB_NUM_ORDERS(sb); 2977 if (position == 0) 2978 seq_puts(seq, "avg_fragment_size_lists:\n"); 2979 2980 count = 0; 2981 read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 2982 list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 2983 bb_avg_fragment_size_node) 2984 count++; 2985 read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 2986 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 2987 (unsigned int)position, count); 2988 return 0; 2989 } 2990 2991 if (position == 0) { 2992 seq_printf(seq, "optimize_scan: %d\n", 2993 test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); 2994 seq_puts(seq, "max_free_order_lists:\n"); 2995 } 2996 count = 0; 2997 read_lock(&sbi->s_mb_largest_free_orders_locks[position]); 2998 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 2999 bb_largest_free_order_node) 3000 count++; 3001 read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); 3002 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3003 (unsigned int)position, count); 3004 3005 return 0; 3006 } 3007 3008 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 3009 { 3010 } 3011 3012 const struct seq_operations ext4_mb_seq_structs_summary_ops = { 3013 .start = ext4_mb_seq_structs_summary_start, 3014 .next = ext4_mb_seq_structs_summary_next, 3015 .stop = ext4_mb_seq_structs_summary_stop, 3016 .show = ext4_mb_seq_structs_summary_show, 3017 }; 3018 3019 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 3020 { 3021 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3022 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 3023 3024 BUG_ON(!cachep); 3025 return cachep; 3026 } 3027 3028 /* 3029 * Allocate the top-level s_group_info array for the specified number 3030 * of groups 3031 */ 3032 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) 3033 { 3034 struct ext4_sb_info *sbi = EXT4_SB(sb); 3035 unsigned size; 3036 struct ext4_group_info ***old_groupinfo, ***new_groupinfo; 3037 3038 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> 3039 EXT4_DESC_PER_BLOCK_BITS(sb); 3040 if (size <= sbi->s_group_info_size) 3041 return 0; 3042 3043 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); 3044 new_groupinfo = kvzalloc(size, GFP_KERNEL); 3045 if (!new_groupinfo) { 3046 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 3047 return -ENOMEM; 3048 } 3049 rcu_read_lock(); 3050 old_groupinfo = rcu_dereference(sbi->s_group_info); 3051 if (old_groupinfo) 3052 memcpy(new_groupinfo, old_groupinfo, 3053 
sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 3054 rcu_read_unlock(); 3055 rcu_assign_pointer(sbi->s_group_info, new_groupinfo); 3056 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 3057 if (old_groupinfo) 3058 ext4_kvfree_array_rcu(old_groupinfo); 3059 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 3060 sbi->s_group_info_size); 3061 return 0; 3062 } 3063 3064 /* Create and initialize ext4_group_info data for the given group. */ 3065 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 3066 struct ext4_group_desc *desc) 3067 { 3068 int i; 3069 int metalen = 0; 3070 int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); 3071 struct ext4_sb_info *sbi = EXT4_SB(sb); 3072 struct ext4_group_info **meta_group_info; 3073 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3074 3075 /* 3076 * First check if this group is the first of a reserved block. 3077 * If it's true, we have to allocate a new table of pointers 3078 * to ext4_group_info structures 3079 */ 3080 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3081 metalen = sizeof(*meta_group_info) << 3082 EXT4_DESC_PER_BLOCK_BITS(sb); 3083 meta_group_info = kmalloc(metalen, GFP_NOFS); 3084 if (meta_group_info == NULL) { 3085 ext4_msg(sb, KERN_ERR, "can't allocate mem " 3086 "for a buddy group"); 3087 goto exit_meta_group_info; 3088 } 3089 rcu_read_lock(); 3090 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; 3091 rcu_read_unlock(); 3092 } 3093 3094 meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); 3095 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 3096 3097 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); 3098 if (meta_group_info[i] == NULL) { 3099 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 3100 goto exit_group_info; 3101 } 3102 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 3103 &(meta_group_info[i]->bb_state)); 3104 3105 /* 3106 * initialize bb_free to be able to skip 3107 * empty groups without initialization 3108 */ 3109 if (ext4_has_group_desc_csum(sb) && 3110 (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3111 meta_group_info[i]->bb_free = 3112 ext4_free_clusters_after_init(sb, group, desc); 3113 } else { 3114 meta_group_info[i]->bb_free = 3115 ext4_free_group_clusters(sb, desc); 3116 } 3117 3118 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3119 init_rwsem(&meta_group_info[i]->alloc_sem); 3120 meta_group_info[i]->bb_free_root = RB_ROOT; 3121 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 3122 INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); 3123 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 3124 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ 3125 meta_group_info[i]->bb_group = group; 3126 3127 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); 3128 return 0; 3129 3130 exit_group_info: 3131 /* If a meta_group_info table has been allocated, release it now */ 3132 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 3133 struct ext4_group_info ***group_info; 3134 3135 rcu_read_lock(); 3136 group_info = rcu_dereference(sbi->s_group_info); 3137 kfree(group_info[idx]); 3138 group_info[idx] = NULL; 3139 rcu_read_unlock(); 3140 } 3141 exit_meta_group_info: 3142 return -ENOMEM; 3143 } /* ext4_mb_add_groupinfo */ 3144 3145 static int ext4_mb_init_backend(struct super_block *sb) 3146 { 3147 ext4_group_t ngroups = ext4_get_groups_count(sb); 3148 ext4_group_t i; 3149 struct ext4_sb_info *sbi = EXT4_SB(sb); 3150 int err; 3151 struct ext4_group_desc *desc; 3152 struct ext4_group_info 
***group_info;
	struct kmem_cache *cachep;

	err = ext4_mb_alloc_groupinfo(sb, ngroups);
	if (err)
		return err;

	sbi->s_buddy_cache = new_inode(sb);
	if (sbi->s_buddy_cache == NULL) {
		ext4_msg(sb, KERN_ERR, "can't get new inode");
		goto err_freesgi;
	}
	/* To avoid potentially colliding with a valid on-disk inode number,
	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
	 * not in the inode hash, so it should never be found by iget(), but
	 * this will avoid confusion if it ever shows up during debugging. */
	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
	for (i = 0; i < ngroups; i++) {
		cond_resched();
		desc = ext4_get_group_desc(sb, i, NULL);
		if (desc == NULL) {
			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
			goto err_freebuddy;
		}
		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
			goto err_freebuddy;
	}

	if (ext4_has_feature_flex_bg(sb)) {
		/* a single flex group is supposed to be read by a single IO.
		 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
		 * an unsigned integer, so the maximum shift is 32.
		 */
		if (sbi->s_es->s_log_groups_per_flex >= 32) {
			ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
			goto err_freebuddy;
		}
		sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
			BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
	} else {
		sbi->s_mb_prefetch = 32;
	}
	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
	/* How many real IOs to prefetch within a single allocation at cr=0:
	 * given that cr=0 is a CPU-related optimization we shouldn't try to
	 * load too many groups; at some point we should start to use what
	 * we've got in memory.
3202 * with an average random access time 5ms, it'd take a second to get 3203 * 200 groups (* N with flex_bg), so let's make this limit 4 3204 */ 3205 sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; 3206 if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) 3207 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); 3208 3209 return 0; 3210 3211 err_freebuddy: 3212 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3213 while (i-- > 0) 3214 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 3215 i = sbi->s_group_info_size; 3216 rcu_read_lock(); 3217 group_info = rcu_dereference(sbi->s_group_info); 3218 while (i-- > 0) 3219 kfree(group_info[i]); 3220 rcu_read_unlock(); 3221 iput(sbi->s_buddy_cache); 3222 err_freesgi: 3223 rcu_read_lock(); 3224 kvfree(rcu_dereference(sbi->s_group_info)); 3225 rcu_read_unlock(); 3226 return -ENOMEM; 3227 } 3228 3229 static void ext4_groupinfo_destroy_slabs(void) 3230 { 3231 int i; 3232 3233 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 3234 kmem_cache_destroy(ext4_groupinfo_caches[i]); 3235 ext4_groupinfo_caches[i] = NULL; 3236 } 3237 } 3238 3239 static int ext4_groupinfo_create_slab(size_t size) 3240 { 3241 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 3242 int slab_size; 3243 int blocksize_bits = order_base_2(size); 3244 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 3245 struct kmem_cache *cachep; 3246 3247 if (cache_index >= NR_GRPINFO_CACHES) 3248 return -EINVAL; 3249 3250 if (unlikely(cache_index < 0)) 3251 cache_index = 0; 3252 3253 mutex_lock(&ext4_grpinfo_slab_create_mutex); 3254 if (ext4_groupinfo_caches[cache_index]) { 3255 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3256 return 0; /* Already created */ 3257 } 3258 3259 slab_size = offsetof(struct ext4_group_info, 3260 bb_counters[blocksize_bits + 2]); 3261 3262 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 3263 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 3264 NULL); 3265 3266 ext4_groupinfo_caches[cache_index] = cachep; 3267 3268 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 3269 if (!cachep) { 3270 printk(KERN_EMERG 3271 "EXT4-fs: no memory for groupinfo slab cache\n"); 3272 return -ENOMEM; 3273 } 3274 3275 return 0; 3276 } 3277 3278 static void ext4_discard_work(struct work_struct *work) 3279 { 3280 struct ext4_sb_info *sbi = container_of(work, 3281 struct ext4_sb_info, s_discard_work); 3282 struct super_block *sb = sbi->s_sb; 3283 struct ext4_free_data *fd, *nfd; 3284 struct ext4_buddy e4b; 3285 struct list_head discard_list; 3286 ext4_group_t grp, load_grp; 3287 int err = 0; 3288 3289 INIT_LIST_HEAD(&discard_list); 3290 spin_lock(&sbi->s_md_lock); 3291 list_splice_init(&sbi->s_discard_list, &discard_list); 3292 spin_unlock(&sbi->s_md_lock); 3293 3294 load_grp = UINT_MAX; 3295 list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { 3296 /* 3297 * If filesystem is umounting or no memory or suffering 3298 * from no space, give up the discard 3299 */ 3300 if ((sb->s_flags & SB_ACTIVE) && !err && 3301 !atomic_read(&sbi->s_retry_alloc_pending)) { 3302 grp = fd->efd_group; 3303 if (grp != load_grp) { 3304 if (load_grp != UINT_MAX) 3305 ext4_mb_unload_buddy(&e4b); 3306 3307 err = ext4_mb_load_buddy(sb, grp, &e4b); 3308 if (err) { 3309 kmem_cache_free(ext4_free_data_cachep, fd); 3310 load_grp = UINT_MAX; 3311 continue; 3312 } else { 3313 load_grp = grp; 3314 } 3315 } 3316 3317 ext4_lock_group(sb, grp); 3318 ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, 3319 fd->efd_start_cluster + fd->efd_count - 1, 1); 3320 ext4_unlock_group(sb, grp); 3321 } 
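		/*
		 * The blocks were already returned to the buddy by
		 * ext4_free_data_in_buddy() at commit time; this work item
		 * only issues the optional discard, so the descriptor is
		 * freed below whether or not the trim above was attempted.
		 */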
3322 kmem_cache_free(ext4_free_data_cachep, fd); 3323 } 3324 3325 if (load_grp != UINT_MAX) 3326 ext4_mb_unload_buddy(&e4b); 3327 } 3328 3329 int ext4_mb_init(struct super_block *sb) 3330 { 3331 struct ext4_sb_info *sbi = EXT4_SB(sb); 3332 unsigned i, j; 3333 unsigned offset, offset_incr; 3334 unsigned max; 3335 int ret; 3336 3337 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); 3338 3339 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 3340 if (sbi->s_mb_offsets == NULL) { 3341 ret = -ENOMEM; 3342 goto out; 3343 } 3344 3345 i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); 3346 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 3347 if (sbi->s_mb_maxs == NULL) { 3348 ret = -ENOMEM; 3349 goto out; 3350 } 3351 3352 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 3353 if (ret < 0) 3354 goto out; 3355 3356 /* order 0 is regular bitmap */ 3357 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 3358 sbi->s_mb_offsets[0] = 0; 3359 3360 i = 1; 3361 offset = 0; 3362 offset_incr = 1 << (sb->s_blocksize_bits - 1); 3363 max = sb->s_blocksize << 2; 3364 do { 3365 sbi->s_mb_offsets[i] = offset; 3366 sbi->s_mb_maxs[i] = max; 3367 offset += offset_incr; 3368 offset_incr = offset_incr >> 1; 3369 max = max >> 1; 3370 i++; 3371 } while (i < MB_NUM_ORDERS(sb)); 3372 3373 sbi->s_mb_avg_fragment_size = 3374 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3375 GFP_KERNEL); 3376 if (!sbi->s_mb_avg_fragment_size) { 3377 ret = -ENOMEM; 3378 goto out; 3379 } 3380 sbi->s_mb_avg_fragment_size_locks = 3381 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3382 GFP_KERNEL); 3383 if (!sbi->s_mb_avg_fragment_size_locks) { 3384 ret = -ENOMEM; 3385 goto out; 3386 } 3387 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3388 INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 3389 rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); 3390 } 3391 sbi->s_mb_largest_free_orders = 3392 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3393 GFP_KERNEL); 3394 if (!sbi->s_mb_largest_free_orders) { 3395 ret = -ENOMEM; 3396 goto out; 3397 } 3398 sbi->s_mb_largest_free_orders_locks = 3399 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3400 GFP_KERNEL); 3401 if (!sbi->s_mb_largest_free_orders_locks) { 3402 ret = -ENOMEM; 3403 goto out; 3404 } 3405 for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3406 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 3407 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 3408 } 3409 3410 spin_lock_init(&sbi->s_md_lock); 3411 sbi->s_mb_free_pending = 0; 3412 INIT_LIST_HEAD(&sbi->s_freed_data_list); 3413 INIT_LIST_HEAD(&sbi->s_discard_list); 3414 INIT_WORK(&sbi->s_discard_work, ext4_discard_work); 3415 atomic_set(&sbi->s_retry_alloc_pending, 0); 3416 3417 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 3418 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 3419 sbi->s_mb_stats = MB_DEFAULT_STATS; 3420 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 3421 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 3422 sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; 3423 /* 3424 * The default group preallocation is 512, which for 4k block 3425 * sizes translates to 2 megabytes. However for bigalloc file 3426 * systems, this is probably too big (i.e, if the cluster size 3427 * is 1 megabyte, then group preallocation size becomes half a 3428 * gigabyte!). As a default, we will keep a two megabyte 3429 * group pralloc size for cluster sizes up to 64k, and after 3430 * that, we will force a minimum group preallocation size of 3431 * 32 clusters. 
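	 * (Worked example, assuming 4k blocks: a 256k cluster means
	 * s_cluster_bits = 6, so 512 >> 6 = 8 clusters, which is below the
	 * floor and gets bumped to 32 clusters by the max() below.)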
This translates to 8 megs when the cluster 3432 * size is 256k, and 32 megs when the cluster size is 1 meg, 3433 * which seems reasonable as a default. 3434 */ 3435 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> 3436 sbi->s_cluster_bits, 32); 3437 /* 3438 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 3439 * to the lowest multiple of s_stripe which is bigger than 3440 * the s_mb_group_prealloc as determined above. We want 3441 * the preallocation size to be an exact multiple of the 3442 * RAID stripe size so that preallocations don't fragment 3443 * the stripes. 3444 */ 3445 if (sbi->s_stripe > 1) { 3446 sbi->s_mb_group_prealloc = roundup( 3447 sbi->s_mb_group_prealloc, sbi->s_stripe); 3448 } 3449 3450 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 3451 if (sbi->s_locality_groups == NULL) { 3452 ret = -ENOMEM; 3453 goto out; 3454 } 3455 for_each_possible_cpu(i) { 3456 struct ext4_locality_group *lg; 3457 lg = per_cpu_ptr(sbi->s_locality_groups, i); 3458 mutex_init(&lg->lg_mutex); 3459 for (j = 0; j < PREALLOC_TB_SIZE; j++) 3460 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 3461 spin_lock_init(&lg->lg_prealloc_lock); 3462 } 3463 3464 if (bdev_nonrot(sb->s_bdev)) 3465 sbi->s_mb_max_linear_groups = 0; 3466 else 3467 sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; 3468 /* init file for buddy data */ 3469 ret = ext4_mb_init_backend(sb); 3470 if (ret != 0) 3471 goto out_free_locality_groups; 3472 3473 return 0; 3474 3475 out_free_locality_groups: 3476 free_percpu(sbi->s_locality_groups); 3477 sbi->s_locality_groups = NULL; 3478 out: 3479 kfree(sbi->s_mb_avg_fragment_size); 3480 kfree(sbi->s_mb_avg_fragment_size_locks); 3481 kfree(sbi->s_mb_largest_free_orders); 3482 kfree(sbi->s_mb_largest_free_orders_locks); 3483 kfree(sbi->s_mb_offsets); 3484 sbi->s_mb_offsets = NULL; 3485 kfree(sbi->s_mb_maxs); 3486 sbi->s_mb_maxs = NULL; 3487 return ret; 3488 } 3489 3490 /* need to called with the ext4 group lock held */ 3491 static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) 3492 { 3493 struct ext4_prealloc_space *pa; 3494 struct list_head *cur, *tmp; 3495 int count = 0; 3496 3497 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 3498 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3499 list_del(&pa->pa_group_list); 3500 count++; 3501 kmem_cache_free(ext4_pspace_cachep, pa); 3502 } 3503 return count; 3504 } 3505 3506 int ext4_mb_release(struct super_block *sb) 3507 { 3508 ext4_group_t ngroups = ext4_get_groups_count(sb); 3509 ext4_group_t i; 3510 int num_meta_group_infos; 3511 struct ext4_group_info *grinfo, ***group_info; 3512 struct ext4_sb_info *sbi = EXT4_SB(sb); 3513 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 3514 int count; 3515 3516 if (test_opt(sb, DISCARD)) { 3517 /* 3518 * wait the discard work to drain all of ext4_free_data 3519 */ 3520 flush_work(&sbi->s_discard_work); 3521 WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); 3522 } 3523 3524 if (sbi->s_group_info) { 3525 for (i = 0; i < ngroups; i++) { 3526 cond_resched(); 3527 grinfo = ext4_get_group_info(sb, i); 3528 mb_group_bb_bitmap_free(grinfo); 3529 ext4_lock_group(sb, i); 3530 count = ext4_mb_cleanup_pa(grinfo); 3531 if (count) 3532 mb_debug(sb, "mballoc: %d PAs left\n", 3533 count); 3534 ext4_unlock_group(sb, i); 3535 kmem_cache_free(cachep, grinfo); 3536 } 3537 num_meta_group_infos = (ngroups + 3538 EXT4_DESC_PER_BLOCK(sb) - 1) >> 3539 EXT4_DESC_PER_BLOCK_BITS(sb); 3540 rcu_read_lock(); 3541 group_info = 
rcu_dereference(sbi->s_group_info); 3542 for (i = 0; i < num_meta_group_infos; i++) 3543 kfree(group_info[i]); 3544 kvfree(group_info); 3545 rcu_read_unlock(); 3546 } 3547 kfree(sbi->s_mb_avg_fragment_size); 3548 kfree(sbi->s_mb_avg_fragment_size_locks); 3549 kfree(sbi->s_mb_largest_free_orders); 3550 kfree(sbi->s_mb_largest_free_orders_locks); 3551 kfree(sbi->s_mb_offsets); 3552 kfree(sbi->s_mb_maxs); 3553 iput(sbi->s_buddy_cache); 3554 if (sbi->s_mb_stats) { 3555 ext4_msg(sb, KERN_INFO, 3556 "mballoc: %u blocks %u reqs (%u success)", 3557 atomic_read(&sbi->s_bal_allocated), 3558 atomic_read(&sbi->s_bal_reqs), 3559 atomic_read(&sbi->s_bal_success)); 3560 ext4_msg(sb, KERN_INFO, 3561 "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " 3562 "%u 2^N hits, %u breaks, %u lost", 3563 atomic_read(&sbi->s_bal_ex_scanned), 3564 atomic_read(&sbi->s_bal_groups_scanned), 3565 atomic_read(&sbi->s_bal_goals), 3566 atomic_read(&sbi->s_bal_2orders), 3567 atomic_read(&sbi->s_bal_breaks), 3568 atomic_read(&sbi->s_mb_lost_chunks)); 3569 ext4_msg(sb, KERN_INFO, 3570 "mballoc: %u generated and it took %llu", 3571 atomic_read(&sbi->s_mb_buddies_generated), 3572 atomic64_read(&sbi->s_mb_generation_time)); 3573 ext4_msg(sb, KERN_INFO, 3574 "mballoc: %u preallocated, %u discarded", 3575 atomic_read(&sbi->s_mb_preallocated), 3576 atomic_read(&sbi->s_mb_discarded)); 3577 } 3578 3579 free_percpu(sbi->s_locality_groups); 3580 3581 return 0; 3582 } 3583 3584 static inline int ext4_issue_discard(struct super_block *sb, 3585 ext4_group_t block_group, ext4_grpblk_t cluster, int count, 3586 struct bio **biop) 3587 { 3588 ext4_fsblk_t discard_block; 3589 3590 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + 3591 ext4_group_first_block_no(sb, block_group)); 3592 count = EXT4_C2B(EXT4_SB(sb), count); 3593 trace_ext4_discard_blocks(sb, 3594 (unsigned long long) discard_block, count); 3595 if (biop) { 3596 return __blkdev_issue_discard(sb->s_bdev, 3597 (sector_t)discard_block << (sb->s_blocksize_bits - 9), 3598 (sector_t)count << (sb->s_blocksize_bits - 9), 3599 GFP_NOFS, biop); 3600 } else 3601 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 3602 } 3603 3604 static void ext4_free_data_in_buddy(struct super_block *sb, 3605 struct ext4_free_data *entry) 3606 { 3607 struct ext4_buddy e4b; 3608 struct ext4_group_info *db; 3609 int err, count = 0, count2 = 0; 3610 3611 mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", 3612 entry->efd_count, entry->efd_group, entry); 3613 3614 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 3615 /* we expect to find existing buddy because it's pinned */ 3616 BUG_ON(err != 0); 3617 3618 spin_lock(&EXT4_SB(sb)->s_md_lock); 3619 EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; 3620 spin_unlock(&EXT4_SB(sb)->s_md_lock); 3621 3622 db = e4b.bd_info; 3623 /* there are blocks to put in buddy to make them really free */ 3624 count += entry->efd_count; 3625 count2++; 3626 ext4_lock_group(sb, entry->efd_group); 3627 /* Take it out of per group rb tree */ 3628 rb_erase(&entry->efd_node, &(db->bb_free_root)); 3629 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); 3630 3631 /* 3632 * Clear the trimmed flag for the group so that the next 3633 * ext4_trim_fs can trim it. 3634 * If the volume is mounted with -o discard, online discard 3635 * is supported and the free blocks will be trimmed online. 
3636 */ 3637 if (!test_opt(sb, DISCARD)) 3638 EXT4_MB_GRP_CLEAR_TRIMMED(db); 3639 3640 if (!db->bb_free_root.rb_node) { 3641 /* No more items in the per group rb tree 3642 * balance refcounts from ext4_mb_free_metadata() 3643 */ 3644 put_page(e4b.bd_buddy_page); 3645 put_page(e4b.bd_bitmap_page); 3646 } 3647 ext4_unlock_group(sb, entry->efd_group); 3648 ext4_mb_unload_buddy(&e4b); 3649 3650 mb_debug(sb, "freed %d blocks in %d structures\n", count, 3651 count2); 3652 } 3653 3654 /* 3655 * This function is called by the jbd2 layer once the commit has finished, 3656 * so we know we can free the blocks that were released with that commit. 3657 */ 3658 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) 3659 { 3660 struct ext4_sb_info *sbi = EXT4_SB(sb); 3661 struct ext4_free_data *entry, *tmp; 3662 struct list_head freed_data_list; 3663 struct list_head *cut_pos = NULL; 3664 bool wake; 3665 3666 INIT_LIST_HEAD(&freed_data_list); 3667 3668 spin_lock(&sbi->s_md_lock); 3669 list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { 3670 if (entry->efd_tid != commit_tid) 3671 break; 3672 cut_pos = &entry->efd_list; 3673 } 3674 if (cut_pos) 3675 list_cut_position(&freed_data_list, &sbi->s_freed_data_list, 3676 cut_pos); 3677 spin_unlock(&sbi->s_md_lock); 3678 3679 list_for_each_entry(entry, &freed_data_list, efd_list) 3680 ext4_free_data_in_buddy(sb, entry); 3681 3682 if (test_opt(sb, DISCARD)) { 3683 spin_lock(&sbi->s_md_lock); 3684 wake = list_empty(&sbi->s_discard_list); 3685 list_splice_tail(&freed_data_list, &sbi->s_discard_list); 3686 spin_unlock(&sbi->s_md_lock); 3687 if (wake) 3688 queue_work(system_unbound_wq, &sbi->s_discard_work); 3689 } else { 3690 list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) 3691 kmem_cache_free(ext4_free_data_cachep, entry); 3692 } 3693 } 3694 3695 int __init ext4_init_mballoc(void) 3696 { 3697 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 3698 SLAB_RECLAIM_ACCOUNT); 3699 if (ext4_pspace_cachep == NULL) 3700 goto out; 3701 3702 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 3703 SLAB_RECLAIM_ACCOUNT); 3704 if (ext4_ac_cachep == NULL) 3705 goto out_pa_free; 3706 3707 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, 3708 SLAB_RECLAIM_ACCOUNT); 3709 if (ext4_free_data_cachep == NULL) 3710 goto out_ac_free; 3711 3712 return 0; 3713 3714 out_ac_free: 3715 kmem_cache_destroy(ext4_ac_cachep); 3716 out_pa_free: 3717 kmem_cache_destroy(ext4_pspace_cachep); 3718 out: 3719 return -ENOMEM; 3720 } 3721 3722 void ext4_exit_mballoc(void) 3723 { 3724 /* 3725 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 3726 * before destroying the slab cache. 
3727 */ 3728 rcu_barrier(); 3729 kmem_cache_destroy(ext4_pspace_cachep); 3730 kmem_cache_destroy(ext4_ac_cachep); 3731 kmem_cache_destroy(ext4_free_data_cachep); 3732 ext4_groupinfo_destroy_slabs(); 3733 } 3734 3735 3736 /* 3737 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 3738 * Returns 0 if success or error code 3739 */ 3740 static noinline_for_stack int 3741 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 3742 handle_t *handle, unsigned int reserv_clstrs) 3743 { 3744 struct buffer_head *bitmap_bh = NULL; 3745 struct ext4_group_desc *gdp; 3746 struct buffer_head *gdp_bh; 3747 struct ext4_sb_info *sbi; 3748 struct super_block *sb; 3749 ext4_fsblk_t block; 3750 int err, len; 3751 3752 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3753 BUG_ON(ac->ac_b_ex.fe_len <= 0); 3754 3755 sb = ac->ac_sb; 3756 sbi = EXT4_SB(sb); 3757 3758 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 3759 if (IS_ERR(bitmap_bh)) { 3760 err = PTR_ERR(bitmap_bh); 3761 bitmap_bh = NULL; 3762 goto out_err; 3763 } 3764 3765 BUFFER_TRACE(bitmap_bh, "getting write access"); 3766 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 3767 EXT4_JTR_NONE); 3768 if (err) 3769 goto out_err; 3770 3771 err = -EIO; 3772 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 3773 if (!gdp) 3774 goto out_err; 3775 3776 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3777 ext4_free_group_clusters(sb, gdp)); 3778 3779 BUFFER_TRACE(gdp_bh, "get_write_access"); 3780 err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); 3781 if (err) 3782 goto out_err; 3783 3784 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3785 3786 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 3787 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { 3788 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 3789 "fs metadata", block, block+len); 3790 /* File system mounted not to panic on error 3791 * Fix the bitmap and return EFSCORRUPTED 3792 * We leak some of the blocks here. 3793 */ 3794 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3795 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3796 ac->ac_b_ex.fe_len); 3797 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3798 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3799 if (!err) 3800 err = -EFSCORRUPTED; 3801 goto out_err; 3802 } 3803 3804 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3805 #ifdef AGGRESSIVE_CHECK 3806 { 3807 int i; 3808 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 3809 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 3810 bitmap_bh->b_data)); 3811 } 3812 } 3813 #endif 3814 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3815 ac->ac_b_ex.fe_len); 3816 if (ext4_has_group_desc_csum(sb) && 3817 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3818 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3819 ext4_free_group_clusters_set(sb, gdp, 3820 ext4_free_clusters_after_init(sb, 3821 ac->ac_b_ex.fe_group, gdp)); 3822 } 3823 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 3824 ext4_free_group_clusters_set(sb, gdp, len); 3825 ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); 3826 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 3827 3828 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3829 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 3830 /* 3831 * Now reduce the dirty block count also. 
Should not go negative 3832 */ 3833 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 3834 /* release all the reserved blocks if non delalloc */ 3835 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 3836 reserv_clstrs); 3837 3838 if (sbi->s_log_groups_per_flex) { 3839 ext4_group_t flex_group = ext4_flex_group(sbi, 3840 ac->ac_b_ex.fe_group); 3841 atomic64_sub(ac->ac_b_ex.fe_len, 3842 &sbi_array_rcu_deref(sbi, s_flex_groups, 3843 flex_group)->free_clusters); 3844 } 3845 3846 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 3847 if (err) 3848 goto out_err; 3849 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 3850 3851 out_err: 3852 brelse(bitmap_bh); 3853 return err; 3854 } 3855 3856 /* 3857 * Idempotent helper for Ext4 fast commit replay path to set the state of 3858 * blocks in bitmaps and update counters. 3859 */ 3860 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, 3861 int len, int state) 3862 { 3863 struct buffer_head *bitmap_bh = NULL; 3864 struct ext4_group_desc *gdp; 3865 struct buffer_head *gdp_bh; 3866 struct ext4_sb_info *sbi = EXT4_SB(sb); 3867 ext4_group_t group; 3868 ext4_grpblk_t blkoff; 3869 int i, err; 3870 int already; 3871 unsigned int clen, clen_changed, thisgrp_len; 3872 3873 while (len > 0) { 3874 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 3875 3876 /* 3877 * Check to see if we are freeing blocks across a group 3878 * boundary. 3879 * In case of flex_bg, this can happen that (block, len) may 3880 * span across more than one group. In that case we need to 3881 * get the corresponding group metadata to work with. 3882 * For this we have goto again loop. 3883 */ 3884 thisgrp_len = min_t(unsigned int, (unsigned int)len, 3885 EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); 3886 clen = EXT4_NUM_B2C(sbi, thisgrp_len); 3887 3888 if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { 3889 ext4_error(sb, "Marking blocks in system zone - " 3890 "Block = %llu, len = %u", 3891 block, thisgrp_len); 3892 bitmap_bh = NULL; 3893 break; 3894 } 3895 3896 bitmap_bh = ext4_read_block_bitmap(sb, group); 3897 if (IS_ERR(bitmap_bh)) { 3898 err = PTR_ERR(bitmap_bh); 3899 bitmap_bh = NULL; 3900 break; 3901 } 3902 3903 err = -EIO; 3904 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 3905 if (!gdp) 3906 break; 3907 3908 ext4_lock_group(sb, group); 3909 already = 0; 3910 for (i = 0; i < clen; i++) 3911 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == 3912 !state) 3913 already++; 3914 3915 clen_changed = clen - already; 3916 if (state) 3917 mb_set_bits(bitmap_bh->b_data, blkoff, clen); 3918 else 3919 mb_clear_bits(bitmap_bh->b_data, blkoff, clen); 3920 if (ext4_has_group_desc_csum(sb) && 3921 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 3922 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3923 ext4_free_group_clusters_set(sb, gdp, 3924 ext4_free_clusters_after_init(sb, group, gdp)); 3925 } 3926 if (state) 3927 clen = ext4_free_group_clusters(sb, gdp) - clen_changed; 3928 else 3929 clen = ext4_free_group_clusters(sb, gdp) + clen_changed; 3930 3931 ext4_free_group_clusters_set(sb, gdp, clen); 3932 ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); 3933 ext4_group_desc_csum_set(sb, group, gdp); 3934 3935 ext4_unlock_group(sb, group); 3936 3937 if (sbi->s_log_groups_per_flex) { 3938 ext4_group_t flex_group = ext4_flex_group(sbi, group); 3939 struct flex_groups *fg = sbi_array_rcu_deref(sbi, 3940 s_flex_groups, flex_group); 3941 3942 if (state) 3943 atomic64_sub(clen_changed, &fg->free_clusters); 3944 else 3945 
				atomic64_add(clen_changed, &fg->free_clusters);

		}

		err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
		if (err)
			break;
		sync_dirty_buffer(bitmap_bh);
		err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
		sync_dirty_buffer(gdp_bh);
		if (err)
			break;

		block += thisgrp_len;
		len -= thisgrp_len;
		brelse(bitmap_bh);
		BUG_ON(len < 0);
	}

	if (err)
		brelse(bitmap_bh);
}

/*
 * Here we normalize the request for a locality group.
 * Group requests are normalized to s_mb_group_prealloc, which is derived
 * from sbi->s_stripe when the stripe= mount option is set.
 * s_mb_group_prealloc can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;

	BUG_ON(lg == NULL);
	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
	mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}

/*
 * Normalization means making the request better in terms of
 * size and alignment
 */
static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int bsbits, max;
	ext4_lblk_t end;
	loff_t size, start_off;
	loff_t orig_size __maybe_unused;
	ext4_lblk_t start;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_prealloc_space *pa;

	/* only normalize data requests; metadata requests
	   do not need preallocation */
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	/* sometimes the caller may want exact blocks */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	/* caller may indicate that preallocation isn't
	 * required (it's a tail, for example) */
	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
		return;

	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
		ext4_mb_normalize_group_request(ac);
		return;
	}

	bsbits = ac->ac_sb->s_blocksize_bits;

	/* first, let's learn the actual file size
	 * assuming the current request is allocated */
	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
	size = size << bsbits;
	if (size < i_size_read(ac->ac_inode))
		size = i_size_read(ac->ac_inode);
	orig_size = size;

	/* max size of free chunks */
	max = 2 << bsbits;

#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
		(req <= (size) || max <= (chunk_size))

	/* first, try to predict the file size */
	/* XXX: should this table be tunable?
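	 * For reference, the net effect of the table below: requests up to
	 * 1MB are padded to the next bucket (16k, 32k, ..., 512k, 1MB);
	 * between roughly 1MB and 8MB the request becomes a 2MB/4MB/8MB
	 * chunk whose start_off is aligned to that chunk size (subject to
	 * the free-chunk limit checked by NRL_CHECK_SIZE); anything larger
	 * is left as-is.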
*/ 4041 start_off = 0; 4042 if (size <= 16 * 1024) { 4043 size = 16 * 1024; 4044 } else if (size <= 32 * 1024) { 4045 size = 32 * 1024; 4046 } else if (size <= 64 * 1024) { 4047 size = 64 * 1024; 4048 } else if (size <= 128 * 1024) { 4049 size = 128 * 1024; 4050 } else if (size <= 256 * 1024) { 4051 size = 256 * 1024; 4052 } else if (size <= 512 * 1024) { 4053 size = 512 * 1024; 4054 } else if (size <= 1024 * 1024) { 4055 size = 1024 * 1024; 4056 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 4057 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4058 (21 - bsbits)) << 21; 4059 size = 2 * 1024 * 1024; 4060 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 4061 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4062 (22 - bsbits)) << 22; 4063 size = 4 * 1024 * 1024; 4064 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 4065 (8<<20)>>bsbits, max, 8 * 1024)) { 4066 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4067 (23 - bsbits)) << 23; 4068 size = 8 * 1024 * 1024; 4069 } else { 4070 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; 4071 size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), 4072 ac->ac_o_ex.fe_len) << bsbits; 4073 } 4074 size = size >> bsbits; 4075 start = start_off >> bsbits; 4076 4077 /* 4078 * For tiny groups (smaller than 8MB) the chosen allocation 4079 * alignment may be larger than group size. Make sure the 4080 * alignment does not move allocation to a different group which 4081 * makes mballoc fail assertions later. 4082 */ 4083 start = max(start, rounddown(ac->ac_o_ex.fe_logical, 4084 (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); 4085 4086 /* don't cover already allocated blocks in selected range */ 4087 if (ar->pleft && start <= ar->lleft) { 4088 size -= ar->lleft + 1 - start; 4089 start = ar->lleft + 1; 4090 } 4091 if (ar->pright && start + size - 1 >= ar->lright) 4092 size -= start + size - ar->lright; 4093 4094 /* 4095 * Trim allocation request for filesystems with artificially small 4096 * groups. 
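	 * (without this, the normalized size could exceed
	 * EXT4_BLOCKS_PER_GROUP() and the goal extent could never fit
	 * inside a single block group)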
	 */
	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);

	end = start + size;

	/* check we don't cross already preallocated blocks */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
		ext4_lblk_t pa_end;

		if (pa->pa_deleted)
			continue;
		spin_lock(&pa->pa_lock);
		if (pa->pa_deleted) {
			spin_unlock(&pa->pa_lock);
			continue;
		}

		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
						  pa->pa_len);

		/* PA must not overlap original request */
		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
			ac->ac_o_ex.fe_logical < pa->pa_lstart));

		/* skip PAs this normalized request doesn't overlap with */
		if (pa->pa_lstart >= end || pa_end <= start) {
			spin_unlock(&pa->pa_lock);
			continue;
		}
		BUG_ON(pa->pa_lstart <= start && pa_end >= end);

		/* adjust start or end to be adjacent to this pa */
		if (pa_end <= ac->ac_o_ex.fe_logical) {
			BUG_ON(pa_end < start);
			start = pa_end;
		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
			BUG_ON(pa->pa_lstart > end);
			end = pa->pa_lstart;
		}
		spin_unlock(&pa->pa_lock);
	}
	rcu_read_unlock();
	size = end - start;

	/* XXX: extra loop to check we really don't overlap preallocations */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
		ext4_lblk_t pa_end;

		spin_lock(&pa->pa_lock);
		if (pa->pa_deleted == 0) {
			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
							  pa->pa_len);
			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
		}
		spin_unlock(&pa->pa_lock);
	}
	rcu_read_unlock();

	/*
	 * In this function "start" and "size" are normalized for better
	 * alignment and length such that we could preallocate more blocks.
	 * The normalization guarantees that the original request, i.e.
	 * ac->ac_o_ex.fe_logical and fe_len, always lies within the "start"
	 * and "size" boundaries.
	 * (Note that fe_len can be relaxed since the block allocation API
	 * does not guarantee the number of contiguous blocks allocated;
	 * that depends on the free space left, etc.)
	 * In case of an inode pa, we later use the allocated blocks
	 * [pa_start + fe_logical - pa_lstart, fe_len/size] from the
	 * preallocated range of goal/best blocks [start, size] to put them
	 * at the ac_o_ex.fe_logical extent of this inode.
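	 * As a purely illustrative example: with fe_logical = 1000 and a
	 * normalized range of start = 960, size = 256, the original block
	 * lies inside [960, 1216), which is exactly what the checks below
	 * enforce.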
4171 * (See ext4_mb_use_inode_pa() for more details) 4172 */ 4173 if (start + size <= ac->ac_o_ex.fe_logical || 4174 start > ac->ac_o_ex.fe_logical) { 4175 ext4_msg(ac->ac_sb, KERN_ERR, 4176 "start %lu, size %lu, fe_logical %lu", 4177 (unsigned long) start, (unsigned long) size, 4178 (unsigned long) ac->ac_o_ex.fe_logical); 4179 BUG(); 4180 } 4181 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 4182 4183 /* now prepare goal request */ 4184 4185 /* XXX: is it better to align blocks WRT to logical 4186 * placement or satisfy big request as is */ 4187 ac->ac_g_ex.fe_logical = start; 4188 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 4189 4190 /* define goal start in order to merge */ 4191 if (ar->pright && (ar->lright == (start + size))) { 4192 /* merge to the right */ 4193 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 4194 &ac->ac_f_ex.fe_group, 4195 &ac->ac_f_ex.fe_start); 4196 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4197 } 4198 if (ar->pleft && (ar->lleft + 1 == start)) { 4199 /* merge to the left */ 4200 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 4201 &ac->ac_f_ex.fe_group, 4202 &ac->ac_f_ex.fe_start); 4203 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 4204 } 4205 4206 mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, 4207 orig_size, start); 4208 } 4209 4210 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 4211 { 4212 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4213 4214 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { 4215 atomic_inc(&sbi->s_bal_reqs); 4216 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 4217 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 4218 atomic_inc(&sbi->s_bal_success); 4219 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 4220 atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); 4221 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4222 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4223 atomic_inc(&sbi->s_bal_goals); 4224 if (ac->ac_found > sbi->s_mb_max_to_scan) 4225 atomic_inc(&sbi->s_bal_breaks); 4226 } 4227 4228 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 4229 trace_ext4_mballoc_alloc(ac); 4230 else 4231 trace_ext4_mballoc_prealloc(ac); 4232 } 4233 4234 /* 4235 * Called on failure; free up any blocks from the inode PA for this 4236 * context. We don't need this for MB_GROUP_PA because we only change 4237 * pa_free in ext4_mb_release_context(), but on failure, we've already 4238 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 4239 */ 4240 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 4241 { 4242 struct ext4_prealloc_space *pa = ac->ac_pa; 4243 struct ext4_buddy e4b; 4244 int err; 4245 4246 if (pa == NULL) { 4247 if (ac->ac_f_ex.fe_len == 0) 4248 return; 4249 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 4250 if (err) { 4251 /* 4252 * This should never happen since we pin the 4253 * pages in the ext4_allocation_context so 4254 * ext4_mb_load_buddy() should never fail. 
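			 * (ac_bitmap_page and ac_buddy_page keep references
			 * on the buddy cache pages until
			 * ext4_mb_release_context() drops them)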
4255 */ 4256 WARN(1, "mb_load_buddy failed (%d)", err); 4257 return; 4258 } 4259 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4260 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 4261 ac->ac_f_ex.fe_len); 4262 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 4263 ext4_mb_unload_buddy(&e4b); 4264 return; 4265 } 4266 if (pa->pa_type == MB_INODE_PA) 4267 pa->pa_free += ac->ac_b_ex.fe_len; 4268 } 4269 4270 /* 4271 * use blocks preallocated to inode 4272 */ 4273 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 4274 struct ext4_prealloc_space *pa) 4275 { 4276 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4277 ext4_fsblk_t start; 4278 ext4_fsblk_t end; 4279 int len; 4280 4281 /* found preallocated blocks, use them */ 4282 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 4283 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), 4284 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); 4285 len = EXT4_NUM_B2C(sbi, end - start); 4286 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 4287 &ac->ac_b_ex.fe_start); 4288 ac->ac_b_ex.fe_len = len; 4289 ac->ac_status = AC_STATUS_FOUND; 4290 ac->ac_pa = pa; 4291 4292 BUG_ON(start < pa->pa_pstart); 4293 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 4294 BUG_ON(pa->pa_free < len); 4295 pa->pa_free -= len; 4296 4297 mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); 4298 } 4299 4300 /* 4301 * use blocks preallocated to locality group 4302 */ 4303 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 4304 struct ext4_prealloc_space *pa) 4305 { 4306 unsigned int len = ac->ac_o_ex.fe_len; 4307 4308 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 4309 &ac->ac_b_ex.fe_group, 4310 &ac->ac_b_ex.fe_start); 4311 ac->ac_b_ex.fe_len = len; 4312 ac->ac_status = AC_STATUS_FOUND; 4313 ac->ac_pa = pa; 4314 4315 /* we don't correct pa_pstart or pa_plen here to avoid 4316 * possible race when the group is being loaded concurrently 4317 * instead we correct pa later, after blocks are marked 4318 * in on-disk bitmap -- see ext4_mb_release_context() 4319 * Other CPUs are prevented from allocating from this pa by lg_mutex 4320 */ 4321 mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", 4322 pa->pa_lstart-len, len, pa); 4323 } 4324 4325 /* 4326 * Return the prealloc space that have minimal distance 4327 * from the goal block. @cpa is the prealloc 4328 * space that is having currently known minimal distance 4329 * from the goal block. 
4330 */ 4331 static struct ext4_prealloc_space * 4332 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 4333 struct ext4_prealloc_space *pa, 4334 struct ext4_prealloc_space *cpa) 4335 { 4336 ext4_fsblk_t cur_distance, new_distance; 4337 4338 if (cpa == NULL) { 4339 atomic_inc(&pa->pa_count); 4340 return pa; 4341 } 4342 cur_distance = abs(goal_block - cpa->pa_pstart); 4343 new_distance = abs(goal_block - pa->pa_pstart); 4344 4345 if (cur_distance <= new_distance) 4346 return cpa; 4347 4348 /* drop the previous reference */ 4349 atomic_dec(&cpa->pa_count); 4350 atomic_inc(&pa->pa_count); 4351 return pa; 4352 } 4353 4354 /* 4355 * search goal blocks in preallocated space 4356 */ 4357 static noinline_for_stack bool 4358 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 4359 { 4360 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4361 int order, i; 4362 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 4363 struct ext4_locality_group *lg; 4364 struct ext4_prealloc_space *pa, *cpa = NULL; 4365 ext4_fsblk_t goal_block; 4366 4367 /* only data can be preallocated */ 4368 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4369 return false; 4370 4371 /* first, try per-file preallocation */ 4372 rcu_read_lock(); 4373 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 4374 4375 /* all fields in this condition don't change, 4376 * so we can skip locking for them */ 4377 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 4378 ac->ac_o_ex.fe_logical >= (pa->pa_lstart + 4379 EXT4_C2B(sbi, pa->pa_len))) 4380 continue; 4381 4382 /* non-extent files can't have physical blocks past 2^32 */ 4383 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 4384 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > 4385 EXT4_MAX_BLOCK_FILE_PHYS)) 4386 continue; 4387 4388 /* found preallocated blocks, use them */ 4389 spin_lock(&pa->pa_lock); 4390 if (pa->pa_deleted == 0 && pa->pa_free) { 4391 atomic_inc(&pa->pa_count); 4392 ext4_mb_use_inode_pa(ac, pa); 4393 spin_unlock(&pa->pa_lock); 4394 ac->ac_criteria = 10; 4395 rcu_read_unlock(); 4396 return true; 4397 } 4398 spin_unlock(&pa->pa_lock); 4399 } 4400 rcu_read_unlock(); 4401 4402 /* can we use group allocation? */ 4403 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 4404 return false; 4405 4406 /* inode may have no locality group for some reason */ 4407 lg = ac->ac_lg; 4408 if (lg == NULL) 4409 return false; 4410 order = fls(ac->ac_o_ex.fe_len) - 1; 4411 if (order > PREALLOC_TB_SIZE - 1) 4412 /* The max size of hash table is PREALLOC_TB_SIZE */ 4413 order = PREALLOC_TB_SIZE - 1; 4414 4415 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 4416 /* 4417 * search for the prealloc space that is having 4418 * minimal distance from the goal block. 4419 */ 4420 for (i = order; i < PREALLOC_TB_SIZE; i++) { 4421 rcu_read_lock(); 4422 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 4423 pa_inode_list) { 4424 spin_lock(&pa->pa_lock); 4425 if (pa->pa_deleted == 0 && 4426 pa->pa_free >= ac->ac_o_ex.fe_len) { 4427 4428 cpa = ext4_mb_check_group_pa(goal_block, 4429 pa, cpa); 4430 } 4431 spin_unlock(&pa->pa_lock); 4432 } 4433 rcu_read_unlock(); 4434 } 4435 if (cpa) { 4436 ext4_mb_use_group_pa(ac, cpa); 4437 ac->ac_criteria = 20; 4438 return true; 4439 } 4440 return false; 4441 } 4442 4443 /* 4444 * the function goes through all block freed in the group 4445 * but not yet committed and marks them used in in-core bitmap. 
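 * (such extents are held in grp->bb_free_root until the transaction
 * commits and ext4_process_freed_data() releases them);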
4446 * buddy must be generated from this bitmap 4447 * Need to be called with the ext4 group lock held 4448 */ 4449 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 4450 ext4_group_t group) 4451 { 4452 struct rb_node *n; 4453 struct ext4_group_info *grp; 4454 struct ext4_free_data *entry; 4455 4456 grp = ext4_get_group_info(sb, group); 4457 n = rb_first(&(grp->bb_free_root)); 4458 4459 while (n) { 4460 entry = rb_entry(n, struct ext4_free_data, efd_node); 4461 mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); 4462 n = rb_next(n); 4463 } 4464 return; 4465 } 4466 4467 /* 4468 * the function goes through all preallocation in this group and marks them 4469 * used in in-core bitmap. buddy must be generated from this bitmap 4470 * Need to be called with ext4 group lock held 4471 */ 4472 static noinline_for_stack 4473 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 4474 ext4_group_t group) 4475 { 4476 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 4477 struct ext4_prealloc_space *pa; 4478 struct list_head *cur; 4479 ext4_group_t groupnr; 4480 ext4_grpblk_t start; 4481 int preallocated = 0; 4482 int len; 4483 4484 /* all form of preallocation discards first load group, 4485 * so the only competing code is preallocation use. 4486 * we don't need any locking here 4487 * notice we do NOT ignore preallocations with pa_deleted 4488 * otherwise we could leave used blocks available for 4489 * allocation in buddy when concurrent ext4_mb_put_pa() 4490 * is dropping preallocation 4491 */ 4492 list_for_each(cur, &grp->bb_prealloc_list) { 4493 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 4494 spin_lock(&pa->pa_lock); 4495 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4496 &groupnr, &start); 4497 len = pa->pa_len; 4498 spin_unlock(&pa->pa_lock); 4499 if (unlikely(len == 0)) 4500 continue; 4501 BUG_ON(groupnr != group); 4502 mb_set_bits(bitmap, start, len); 4503 preallocated += len; 4504 } 4505 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); 4506 } 4507 4508 static void ext4_mb_mark_pa_deleted(struct super_block *sb, 4509 struct ext4_prealloc_space *pa) 4510 { 4511 struct ext4_inode_info *ei; 4512 4513 if (pa->pa_deleted) { 4514 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", 4515 pa->pa_type, pa->pa_pstart, pa->pa_lstart, 4516 pa->pa_len); 4517 return; 4518 } 4519 4520 pa->pa_deleted = 1; 4521 4522 if (pa->pa_type == MB_INODE_PA) { 4523 ei = EXT4_I(pa->pa_inode); 4524 atomic_dec(&ei->i_prealloc_active); 4525 } 4526 } 4527 4528 static void ext4_mb_pa_callback(struct rcu_head *head) 4529 { 4530 struct ext4_prealloc_space *pa; 4531 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 4532 4533 BUG_ON(atomic_read(&pa->pa_count)); 4534 BUG_ON(pa->pa_deleted == 0); 4535 kmem_cache_free(ext4_pspace_cachep, pa); 4536 } 4537 4538 /* 4539 * drops a reference to preallocated space descriptor 4540 * if this was the last reference and the space is consumed 4541 */ 4542 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 4543 struct super_block *sb, struct ext4_prealloc_space *pa) 4544 { 4545 ext4_group_t grp; 4546 ext4_fsblk_t grp_blk; 4547 4548 /* in this short window concurrent discard can set pa_deleted */ 4549 spin_lock(&pa->pa_lock); 4550 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { 4551 spin_unlock(&pa->pa_lock); 4552 return; 4553 } 4554 4555 if (pa->pa_deleted == 1) { 4556 spin_unlock(&pa->pa_lock); 4557 return; 4558 } 4559 4560 
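	/*
	 * pa_count dropped to zero and the space is fully consumed:
	 * mark it deleted, unlink it from the group and owner lists
	 * below, and free it via RCU.
	 */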
	ext4_mb_mark_pa_deleted(sb, pa);
	spin_unlock(&pa->pa_lock);

	grp_blk = pa->pa_pstart;
	/*
	 * If doing group-based preallocation, pa_pstart may be in the
	 * next group when pa is used up
	 */
	if (pa->pa_type == MB_GROUP_PA)
		grp_blk--;

	grp = ext4_get_group_number(sb, grp_blk);

	/*
	 * possible race:
	 *
	 *  P1 (buddy init)			P2 (regular allocation)
	 *					find block B in PA
	 *  copy on-disk bitmap to buddy
	 *					mark B in on-disk bitmap
	 *					drop PA from group
	 *  mark all PAs in buddy
	 *
	 * thus, P1 initializes buddy with B available. to prevent this
	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
	 * against that pair
	 */
	ext4_lock_group(sb, grp);
	list_del(&pa->pa_group_list);
	ext4_unlock_group(sb, grp);

	spin_lock(pa->pa_obj_lock);
	list_del_rcu(&pa->pa_inode_list);
	spin_unlock(pa->pa_obj_lock);

	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

/*
 * creates new preallocated space for the given inode
 */
static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_prealloc_space *pa;
	struct ext4_group_info *grp;
	struct ext4_inode_info *ei;

	/* preallocate only when the found space is larger than requested */
	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
	BUG_ON(ac->ac_pa == NULL);

	pa = ac->ac_pa;

	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
		int winl;
		int wins;
		int win;
		int offs;

		/* we can't allocate as much as normalizer wants.
4625 * so, found space must get proper lstart 4626 * to cover original request */ 4627 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 4628 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 4629 4630 /* we're limited by original request in that 4631 * logical block must be covered any way 4632 * winl is window we can move our chunk within */ 4633 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 4634 4635 /* also, we should cover whole original request */ 4636 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); 4637 4638 /* the smallest one defines real window */ 4639 win = min(winl, wins); 4640 4641 offs = ac->ac_o_ex.fe_logical % 4642 EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4643 if (offs && offs < win) 4644 win = offs; 4645 4646 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - 4647 EXT4_NUM_B2C(sbi, win); 4648 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 4649 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 4650 } 4651 4652 /* preallocation can change ac_b_ex, thus we store actually 4653 * allocated blocks for history */ 4654 ac->ac_f_ex = ac->ac_b_ex; 4655 4656 pa->pa_lstart = ac->ac_b_ex.fe_logical; 4657 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4658 pa->pa_len = ac->ac_b_ex.fe_len; 4659 pa->pa_free = pa->pa_len; 4660 spin_lock_init(&pa->pa_lock); 4661 INIT_LIST_HEAD(&pa->pa_inode_list); 4662 INIT_LIST_HEAD(&pa->pa_group_list); 4663 pa->pa_deleted = 0; 4664 pa->pa_type = MB_INODE_PA; 4665 4666 mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4667 pa->pa_len, pa->pa_lstart); 4668 trace_ext4_mb_new_inode_pa(ac, pa); 4669 4670 ext4_mb_use_inode_pa(ac, pa); 4671 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 4672 4673 ei = EXT4_I(ac->ac_inode); 4674 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 4675 4676 pa->pa_obj_lock = &ei->i_prealloc_lock; 4677 pa->pa_inode = ac->ac_inode; 4678 4679 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4680 4681 spin_lock(pa->pa_obj_lock); 4682 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 4683 spin_unlock(pa->pa_obj_lock); 4684 atomic_inc(&ei->i_prealloc_active); 4685 } 4686 4687 /* 4688 * creates new preallocated space for locality group inodes belongs to 4689 */ 4690 static noinline_for_stack void 4691 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 4692 { 4693 struct super_block *sb = ac->ac_sb; 4694 struct ext4_locality_group *lg; 4695 struct ext4_prealloc_space *pa; 4696 struct ext4_group_info *grp; 4697 4698 /* preallocate only when found space is larger then requested */ 4699 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 4700 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4701 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 4702 BUG_ON(ac->ac_pa == NULL); 4703 4704 pa = ac->ac_pa; 4705 4706 /* preallocation can change ac_b_ex, thus we store actually 4707 * allocated blocks for history */ 4708 ac->ac_f_ex = ac->ac_b_ex; 4709 4710 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4711 pa->pa_lstart = pa->pa_pstart; 4712 pa->pa_len = ac->ac_b_ex.fe_len; 4713 pa->pa_free = pa->pa_len; 4714 spin_lock_init(&pa->pa_lock); 4715 INIT_LIST_HEAD(&pa->pa_inode_list); 4716 INIT_LIST_HEAD(&pa->pa_group_list); 4717 pa->pa_deleted = 0; 4718 pa->pa_type = MB_GROUP_PA; 4719 4720 mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, 4721 pa->pa_len, pa->pa_lstart); 4722 trace_ext4_mb_new_group_pa(ac, pa); 4723 4724 ext4_mb_use_group_pa(ac, pa); 4725 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 4726 4727 grp = ext4_get_group_info(sb, 
ac->ac_b_ex.fe_group); 4728 lg = ac->ac_lg; 4729 BUG_ON(lg == NULL); 4730 4731 pa->pa_obj_lock = &lg->lg_prealloc_lock; 4732 pa->pa_inode = NULL; 4733 4734 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 4735 4736 /* 4737 * We will later add the new pa to the right bucket 4738 * after updating the pa_free in ext4_mb_release_context 4739 */ 4740 } 4741 4742 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 4743 { 4744 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4745 ext4_mb_new_group_pa(ac); 4746 else 4747 ext4_mb_new_inode_pa(ac); 4748 } 4749 4750 /* 4751 * finds all unused blocks in on-disk bitmap, frees them in 4752 * in-core bitmap and buddy. 4753 * @pa must be unlinked from inode and group lists, so that 4754 * nobody else can find/use it. 4755 * the caller MUST hold group/inode locks. 4756 * TODO: optimize the case when there are no in-core structures yet 4757 */ 4758 static noinline_for_stack int 4759 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 4760 struct ext4_prealloc_space *pa) 4761 { 4762 struct super_block *sb = e4b->bd_sb; 4763 struct ext4_sb_info *sbi = EXT4_SB(sb); 4764 unsigned int end; 4765 unsigned int next; 4766 ext4_group_t group; 4767 ext4_grpblk_t bit; 4768 unsigned long long grp_blk_start; 4769 int free = 0; 4770 4771 BUG_ON(pa->pa_deleted == 0); 4772 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 4773 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 4774 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 4775 end = bit + pa->pa_len; 4776 4777 while (bit < end) { 4778 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 4779 if (bit >= end) 4780 break; 4781 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 4782 mb_debug(sb, "free preallocated %u/%u in group %u\n", 4783 (unsigned) ext4_group_first_block_no(sb, group) + bit, 4784 (unsigned) next - bit, (unsigned) group); 4785 free += next - bit; 4786 4787 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 4788 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + 4789 EXT4_C2B(sbi, bit)), 4790 next - bit); 4791 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 4792 bit = next + 1; 4793 } 4794 if (free != pa->pa_free) { 4795 ext4_msg(e4b->bd_sb, KERN_CRIT, 4796 "pa %p: logic %lu, phys. %lu, len %d", 4797 pa, (unsigned long) pa->pa_lstart, 4798 (unsigned long) pa->pa_pstart, 4799 pa->pa_len); 4800 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 4801 free, pa->pa_free); 4802 /* 4803 * pa is already deleted so we use the value obtained 4804 * from the bitmap and continue. 
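		 * (the inconsistency itself has already been reported via
		 * ext4_grp_locked_error() above)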
4805 */ 4806 } 4807 atomic_add(free, &sbi->s_mb_discarded); 4808 4809 return 0; 4810 } 4811 4812 static noinline_for_stack int 4813 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 4814 struct ext4_prealloc_space *pa) 4815 { 4816 struct super_block *sb = e4b->bd_sb; 4817 ext4_group_t group; 4818 ext4_grpblk_t bit; 4819 4820 trace_ext4_mb_release_group_pa(sb, pa); 4821 BUG_ON(pa->pa_deleted == 0); 4822 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 4823 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 4824 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 4825 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 4826 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 4827 4828 return 0; 4829 } 4830 4831 /* 4832 * releases all preallocations in given group 4833 * 4834 * first, we need to decide discard policy: 4835 * - when do we discard 4836 * 1) ENOSPC 4837 * - how many do we discard 4838 * 1) how many requested 4839 */ 4840 static noinline_for_stack int 4841 ext4_mb_discard_group_preallocations(struct super_block *sb, 4842 ext4_group_t group, int *busy) 4843 { 4844 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 4845 struct buffer_head *bitmap_bh = NULL; 4846 struct ext4_prealloc_space *pa, *tmp; 4847 struct list_head list; 4848 struct ext4_buddy e4b; 4849 int err; 4850 int free = 0; 4851 4852 mb_debug(sb, "discard preallocation for group %u\n", group); 4853 if (list_empty(&grp->bb_prealloc_list)) 4854 goto out_dbg; 4855 4856 bitmap_bh = ext4_read_block_bitmap(sb, group); 4857 if (IS_ERR(bitmap_bh)) { 4858 err = PTR_ERR(bitmap_bh); 4859 ext4_error_err(sb, -err, 4860 "Error %d reading block bitmap for %u", 4861 err, group); 4862 goto out_dbg; 4863 } 4864 4865 err = ext4_mb_load_buddy(sb, group, &e4b); 4866 if (err) { 4867 ext4_warning(sb, "Error %d loading buddy information for %u", 4868 err, group); 4869 put_bh(bitmap_bh); 4870 goto out_dbg; 4871 } 4872 4873 INIT_LIST_HEAD(&list); 4874 ext4_lock_group(sb, group); 4875 list_for_each_entry_safe(pa, tmp, 4876 &grp->bb_prealloc_list, pa_group_list) { 4877 spin_lock(&pa->pa_lock); 4878 if (atomic_read(&pa->pa_count)) { 4879 spin_unlock(&pa->pa_lock); 4880 *busy = 1; 4881 continue; 4882 } 4883 if (pa->pa_deleted) { 4884 spin_unlock(&pa->pa_lock); 4885 continue; 4886 } 4887 4888 /* seems this one can be freed ... */ 4889 ext4_mb_mark_pa_deleted(sb, pa); 4890 4891 if (!free) 4892 this_cpu_inc(discard_pa_seq); 4893 4894 /* we can trust pa_free ... 
*/ 4895 free += pa->pa_free; 4896 4897 spin_unlock(&pa->pa_lock); 4898 4899 list_del(&pa->pa_group_list); 4900 list_add(&pa->u.pa_tmp_list, &list); 4901 } 4902 4903 /* now free all selected PAs */ 4904 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4905 4906 /* remove from object (inode or locality group) */ 4907 spin_lock(pa->pa_obj_lock); 4908 list_del_rcu(&pa->pa_inode_list); 4909 spin_unlock(pa->pa_obj_lock); 4910 4911 if (pa->pa_type == MB_GROUP_PA) 4912 ext4_mb_release_group_pa(&e4b, pa); 4913 else 4914 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 4915 4916 list_del(&pa->u.pa_tmp_list); 4917 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4918 } 4919 4920 ext4_unlock_group(sb, group); 4921 ext4_mb_unload_buddy(&e4b); 4922 put_bh(bitmap_bh); 4923 out_dbg: 4924 mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", 4925 free, group, grp->bb_free); 4926 return free; 4927 } 4928 4929 /* 4930 * releases all non-used preallocated blocks for given inode 4931 * 4932 * It's important to discard preallocations under i_data_sem 4933 * We don't want another block to be served from the prealloc 4934 * space when we are discarding the inode prealloc space. 4935 * 4936 * FIXME!! Make sure it is valid at all the call sites 4937 */ 4938 void ext4_discard_preallocations(struct inode *inode, unsigned int needed) 4939 { 4940 struct ext4_inode_info *ei = EXT4_I(inode); 4941 struct super_block *sb = inode->i_sb; 4942 struct buffer_head *bitmap_bh = NULL; 4943 struct ext4_prealloc_space *pa, *tmp; 4944 ext4_group_t group = 0; 4945 struct list_head list; 4946 struct ext4_buddy e4b; 4947 int err; 4948 4949 if (!S_ISREG(inode->i_mode)) { 4950 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 4951 return; 4952 } 4953 4954 if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) 4955 return; 4956 4957 mb_debug(sb, "discard preallocation for inode %lu\n", 4958 inode->i_ino); 4959 trace_ext4_discard_preallocations(inode, 4960 atomic_read(&ei->i_prealloc_active), needed); 4961 4962 INIT_LIST_HEAD(&list); 4963 4964 if (needed == 0) 4965 needed = UINT_MAX; 4966 4967 repeat: 4968 /* first, collect all pa's in the inode */ 4969 spin_lock(&ei->i_prealloc_lock); 4970 while (!list_empty(&ei->i_prealloc_list) && needed) { 4971 pa = list_entry(ei->i_prealloc_list.prev, 4972 struct ext4_prealloc_space, pa_inode_list); 4973 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 4974 spin_lock(&pa->pa_lock); 4975 if (atomic_read(&pa->pa_count)) { 4976 /* this shouldn't happen often - nobody should 4977 * use preallocation while we're discarding it */ 4978 spin_unlock(&pa->pa_lock); 4979 spin_unlock(&ei->i_prealloc_lock); 4980 ext4_msg(sb, KERN_ERR, 4981 "uh-oh! used pa while discarding"); 4982 WARN_ON(1); 4983 schedule_timeout_uninterruptible(HZ); 4984 goto repeat; 4985 4986 } 4987 if (pa->pa_deleted == 0) { 4988 ext4_mb_mark_pa_deleted(sb, pa); 4989 spin_unlock(&pa->pa_lock); 4990 list_del_rcu(&pa->pa_inode_list); 4991 list_add(&pa->u.pa_tmp_list, &list); 4992 needed--; 4993 continue; 4994 } 4995 4996 /* someone is deleting pa right now */ 4997 spin_unlock(&pa->pa_lock); 4998 spin_unlock(&ei->i_prealloc_lock); 4999 5000 /* we have to wait here because pa_deleted 5001 * doesn't mean pa is already unlinked from 5002 * the list. 
as we might be called from 5003 * ->clear_inode() the inode will get freed 5004 * and concurrent thread which is unlinking 5005 * pa from inode's list may access already 5006 * freed memory, bad-bad-bad */ 5007 5008 /* XXX: if this happens too often, we can 5009 * add a flag to force wait only in case 5010 * of ->clear_inode(), but not in case of 5011 * regular truncate */ 5012 schedule_timeout_uninterruptible(HZ); 5013 goto repeat; 5014 } 5015 spin_unlock(&ei->i_prealloc_lock); 5016 5017 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 5018 BUG_ON(pa->pa_type != MB_INODE_PA); 5019 group = ext4_get_group_number(sb, pa->pa_pstart); 5020 5021 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5022 GFP_NOFS|__GFP_NOFAIL); 5023 if (err) { 5024 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5025 err, group); 5026 continue; 5027 } 5028 5029 bitmap_bh = ext4_read_block_bitmap(sb, group); 5030 if (IS_ERR(bitmap_bh)) { 5031 err = PTR_ERR(bitmap_bh); 5032 ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", 5033 err, group); 5034 ext4_mb_unload_buddy(&e4b); 5035 continue; 5036 } 5037 5038 ext4_lock_group(sb, group); 5039 list_del(&pa->pa_group_list); 5040 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 5041 ext4_unlock_group(sb, group); 5042 5043 ext4_mb_unload_buddy(&e4b); 5044 put_bh(bitmap_bh); 5045 5046 list_del(&pa->u.pa_tmp_list); 5047 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5048 } 5049 } 5050 5051 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) 5052 { 5053 struct ext4_prealloc_space *pa; 5054 5055 BUG_ON(ext4_pspace_cachep == NULL); 5056 pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); 5057 if (!pa) 5058 return -ENOMEM; 5059 atomic_set(&pa->pa_count, 1); 5060 ac->ac_pa = pa; 5061 return 0; 5062 } 5063 5064 static void ext4_mb_pa_free(struct ext4_allocation_context *ac) 5065 { 5066 struct ext4_prealloc_space *pa = ac->ac_pa; 5067 5068 BUG_ON(!pa); 5069 ac->ac_pa = NULL; 5070 WARN_ON(!atomic_dec_and_test(&pa->pa_count)); 5071 kmem_cache_free(ext4_pspace_cachep, pa); 5072 } 5073 5074 #ifdef CONFIG_EXT4_DEBUG 5075 static inline void ext4_mb_show_pa(struct super_block *sb) 5076 { 5077 ext4_group_t i, ngroups; 5078 5079 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5080 return; 5081 5082 ngroups = ext4_get_groups_count(sb); 5083 mb_debug(sb, "groups: "); 5084 for (i = 0; i < ngroups; i++) { 5085 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 5086 struct ext4_prealloc_space *pa; 5087 ext4_grpblk_t start; 5088 struct list_head *cur; 5089 ext4_lock_group(sb, i); 5090 list_for_each(cur, &grp->bb_prealloc_list) { 5091 pa = list_entry(cur, struct ext4_prealloc_space, 5092 pa_group_list); 5093 spin_lock(&pa->pa_lock); 5094 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 5095 NULL, &start); 5096 spin_unlock(&pa->pa_lock); 5097 mb_debug(sb, "PA:%u:%d:%d\n", i, start, 5098 pa->pa_len); 5099 } 5100 ext4_unlock_group(sb, i); 5101 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, 5102 grp->bb_fragments); 5103 } 5104 } 5105 5106 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5107 { 5108 struct super_block *sb = ac->ac_sb; 5109 5110 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5111 return; 5112 5113 mb_debug(sb, "Can't allocate:" 5114 " Allocation context details:"); 5115 mb_debug(sb, "status %u flags 0x%x", 5116 ac->ac_status, ac->ac_flags); 5117 mb_debug(sb, "orig %lu/%lu/%lu@%lu, " 5118 "goal %lu/%lu/%lu@%lu, " 5119 "best %lu/%lu/%lu@%lu cr %d", 5120 (unsigned long)ac->ac_o_ex.fe_group, 5121 (unsigned 
long)ac->ac_o_ex.fe_start, 5122 (unsigned long)ac->ac_o_ex.fe_len, 5123 (unsigned long)ac->ac_o_ex.fe_logical, 5124 (unsigned long)ac->ac_g_ex.fe_group, 5125 (unsigned long)ac->ac_g_ex.fe_start, 5126 (unsigned long)ac->ac_g_ex.fe_len, 5127 (unsigned long)ac->ac_g_ex.fe_logical, 5128 (unsigned long)ac->ac_b_ex.fe_group, 5129 (unsigned long)ac->ac_b_ex.fe_start, 5130 (unsigned long)ac->ac_b_ex.fe_len, 5131 (unsigned long)ac->ac_b_ex.fe_logical, 5132 (int)ac->ac_criteria); 5133 mb_debug(sb, "%u found", ac->ac_found); 5134 ext4_mb_show_pa(sb); 5135 } 5136 #else 5137 static inline void ext4_mb_show_pa(struct super_block *sb) 5138 { 5139 return; 5140 } 5141 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 5142 { 5143 ext4_mb_show_pa(ac->ac_sb); 5144 return; 5145 } 5146 #endif 5147 5148 /* 5149 * We use locality group preallocation for small size file. The size of the 5150 * file is determined by the current size or the resulting size after 5151 * allocation which ever is larger 5152 * 5153 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 5154 */ 5155 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 5156 { 5157 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5158 int bsbits = ac->ac_sb->s_blocksize_bits; 5159 loff_t size, isize; 5160 bool inode_pa_eligible, group_pa_eligible; 5161 5162 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 5163 return; 5164 5165 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 5166 return; 5167 5168 group_pa_eligible = sbi->s_mb_group_prealloc > 0; 5169 inode_pa_eligible = true; 5170 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 5171 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 5172 >> bsbits; 5173 5174 /* No point in using inode preallocation for closed files */ 5175 if ((size == isize) && !ext4_fs_is_busy(sbi) && 5176 !inode_is_open_for_write(ac->ac_inode)) 5177 inode_pa_eligible = false; 5178 5179 size = max(size, isize); 5180 /* Don't use group allocation for large files */ 5181 if (size > sbi->s_mb_stream_request) 5182 group_pa_eligible = false; 5183 5184 if (!group_pa_eligible) { 5185 if (inode_pa_eligible) 5186 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5187 else 5188 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5189 return; 5190 } 5191 5192 BUG_ON(ac->ac_lg != NULL); 5193 /* 5194 * locality group prealloc space are per cpu. The reason for having 5195 * per cpu locality group is to reduce the contention between block 5196 * request from multiple CPUs. 
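	 * Allocations from one locality group are still serialized by
	 * lg_mutex, taken just below.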
5197 */ 5198 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); 5199 5200 /* we're going to use group allocation */ 5201 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 5202 5203 /* serialize all allocations in the group */ 5204 mutex_lock(&ac->ac_lg->lg_mutex); 5205 } 5206 5207 static noinline_for_stack int 5208 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 5209 struct ext4_allocation_request *ar) 5210 { 5211 struct super_block *sb = ar->inode->i_sb; 5212 struct ext4_sb_info *sbi = EXT4_SB(sb); 5213 struct ext4_super_block *es = sbi->s_es; 5214 ext4_group_t group; 5215 unsigned int len; 5216 ext4_fsblk_t goal; 5217 ext4_grpblk_t block; 5218 5219 /* we can't allocate > group size */ 5220 len = ar->len; 5221 5222 /* just a dirty hack to filter too big requests */ 5223 if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) 5224 len = EXT4_CLUSTERS_PER_GROUP(sb); 5225 5226 /* start searching from the goal */ 5227 goal = ar->goal; 5228 if (goal < le32_to_cpu(es->s_first_data_block) || 5229 goal >= ext4_blocks_count(es)) 5230 goal = le32_to_cpu(es->s_first_data_block); 5231 ext4_get_group_no_and_offset(sb, goal, &group, &block); 5232 5233 /* set up allocation goals */ 5234 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); 5235 ac->ac_status = AC_STATUS_CONTINUE; 5236 ac->ac_sb = sb; 5237 ac->ac_inode = ar->inode; 5238 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; 5239 ac->ac_o_ex.fe_group = group; 5240 ac->ac_o_ex.fe_start = block; 5241 ac->ac_o_ex.fe_len = len; 5242 ac->ac_g_ex = ac->ac_o_ex; 5243 ac->ac_flags = ar->flags; 5244 5245 /* we have to define context: we'll work with a file or 5246 * locality group. this is a policy, actually */ 5247 ext4_mb_group_or_file(ac); 5248 5249 mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " 5250 "left: %u/%u, right %u/%u to %swritable\n", 5251 (unsigned) ar->len, (unsigned) ar->logical, 5252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 5253 (unsigned) ar->lleft, (unsigned) ar->pleft, 5254 (unsigned) ar->lright, (unsigned) ar->pright, 5255 inode_is_open_for_write(ar->inode) ? "" : "non-"); 5256 return 0; 5257 5258 } 5259 5260 static noinline_for_stack void 5261 ext4_mb_discard_lg_preallocations(struct super_block *sb, 5262 struct ext4_locality_group *lg, 5263 int order, int total_entries) 5264 { 5265 ext4_group_t group = 0; 5266 struct ext4_buddy e4b; 5267 struct list_head discard_list; 5268 struct ext4_prealloc_space *pa, *tmp; 5269 5270 mb_debug(sb, "discard locality group preallocation\n"); 5271 5272 INIT_LIST_HEAD(&discard_list); 5273 5274 spin_lock(&lg->lg_prealloc_lock); 5275 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 5276 pa_inode_list, 5277 lockdep_is_held(&lg->lg_prealloc_lock)) { 5278 spin_lock(&pa->pa_lock); 5279 if (atomic_read(&pa->pa_count)) { 5280 /* 5281 * This is the pa that we just used 5282 * for block allocation. So don't 5283 * free that 5284 */ 5285 spin_unlock(&pa->pa_lock); 5286 continue; 5287 } 5288 if (pa->pa_deleted) { 5289 spin_unlock(&pa->pa_lock); 5290 continue; 5291 } 5292 /* only lg prealloc space */ 5293 BUG_ON(pa->pa_type != MB_GROUP_PA); 5294 5295 /* seems this one can be freed ... */ 5296 ext4_mb_mark_pa_deleted(sb, pa); 5297 spin_unlock(&pa->pa_lock); 5298 5299 list_del_rcu(&pa->pa_inode_list); 5300 list_add(&pa->u.pa_tmp_list, &discard_list); 5301 5302 total_entries--; 5303 if (total_entries <= 5) { 5304 /* 5305 * we want to keep only 5 entries 5306 * allowing it to grow to 8. This 5307 * mak sure we don't call discard 5308 * soon for this list. 
5309 */ 5310 break; 5311 } 5312 } 5313 spin_unlock(&lg->lg_prealloc_lock); 5314 5315 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 5316 int err; 5317 5318 group = ext4_get_group_number(sb, pa->pa_pstart); 5319 err = ext4_mb_load_buddy_gfp(sb, group, &e4b, 5320 GFP_NOFS|__GFP_NOFAIL); 5321 if (err) { 5322 ext4_error_err(sb, -err, "Error %d loading buddy information for %u", 5323 err, group); 5324 continue; 5325 } 5326 ext4_lock_group(sb, group); 5327 list_del(&pa->pa_group_list); 5328 ext4_mb_release_group_pa(&e4b, pa); 5329 ext4_unlock_group(sb, group); 5330 5331 ext4_mb_unload_buddy(&e4b); 5332 list_del(&pa->u.pa_tmp_list); 5333 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 5334 } 5335 } 5336 5337 /* 5338 * We have incremented pa_count. So it cannot be freed at this 5339 * point. Also we hold lg_mutex. So no parallel allocation is 5340 * possible from this lg. That means pa_free cannot be updated. 5341 * 5342 * A parallel ext4_mb_discard_group_preallocations is possible. 5343 * which can cause the lg_prealloc_list to be updated. 5344 */ 5345 5346 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 5347 { 5348 int order, added = 0, lg_prealloc_count = 1; 5349 struct super_block *sb = ac->ac_sb; 5350 struct ext4_locality_group *lg = ac->ac_lg; 5351 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 5352 5353 order = fls(pa->pa_free) - 1; 5354 if (order > PREALLOC_TB_SIZE - 1) 5355 /* The max size of hash table is PREALLOC_TB_SIZE */ 5356 order = PREALLOC_TB_SIZE - 1; 5357 /* Add the prealloc space to lg */ 5358 spin_lock(&lg->lg_prealloc_lock); 5359 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 5360 pa_inode_list, 5361 lockdep_is_held(&lg->lg_prealloc_lock)) { 5362 spin_lock(&tmp_pa->pa_lock); 5363 if (tmp_pa->pa_deleted) { 5364 spin_unlock(&tmp_pa->pa_lock); 5365 continue; 5366 } 5367 if (!added && pa->pa_free < tmp_pa->pa_free) { 5368 /* Add to the tail of the previous entry */ 5369 list_add_tail_rcu(&pa->pa_inode_list, 5370 &tmp_pa->pa_inode_list); 5371 added = 1; 5372 /* 5373 * we want to count the total 5374 * number of entries in the list 5375 */ 5376 } 5377 spin_unlock(&tmp_pa->pa_lock); 5378 lg_prealloc_count++; 5379 } 5380 if (!added) 5381 list_add_tail_rcu(&pa->pa_inode_list, 5382 &lg->lg_prealloc_list[order]); 5383 spin_unlock(&lg->lg_prealloc_lock); 5384 5385 /* Now trim the list to be not more than 8 elements */ 5386 if (lg_prealloc_count > 8) { 5387 ext4_mb_discard_lg_preallocations(sb, lg, 5388 order, lg_prealloc_count); 5389 return; 5390 } 5391 return ; 5392 } 5393 5394 /* 5395 * if per-inode prealloc list is too long, trim some PA 5396 */ 5397 static void ext4_mb_trim_inode_pa(struct inode *inode) 5398 { 5399 struct ext4_inode_info *ei = EXT4_I(inode); 5400 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5401 int count, delta; 5402 5403 count = atomic_read(&ei->i_prealloc_active); 5404 delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; 5405 if (count > sbi->s_mb_max_inode_prealloc + delta) { 5406 count -= sbi->s_mb_max_inode_prealloc; 5407 ext4_discard_preallocations(inode, count); 5408 } 5409 } 5410 5411 /* 5412 * release all resource we used in allocation 5413 */ 5414 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 5415 { 5416 struct inode *inode = ac->ac_inode; 5417 struct ext4_inode_info *ei = EXT4_I(inode); 5418 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5419 struct ext4_prealloc_space *pa = ac->ac_pa; 5420 if (pa) { 5421 if (pa->pa_type == MB_GROUP_PA) { 5422 /* see comment in 
ext4_mb_use_group_pa() */ 5423 spin_lock(&pa->pa_lock); 5424 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5425 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5426 pa->pa_free -= ac->ac_b_ex.fe_len; 5427 pa->pa_len -= ac->ac_b_ex.fe_len; 5428 spin_unlock(&pa->pa_lock); 5429 5430 /* 5431 * We want to add the pa to the right bucket. 5432 * Remove it from the list and while adding 5433 * make sure the list to which we are adding 5434 * doesn't grow big. 5435 */ 5436 if (likely(pa->pa_free)) { 5437 spin_lock(pa->pa_obj_lock); 5438 list_del_rcu(&pa->pa_inode_list); 5439 spin_unlock(pa->pa_obj_lock); 5440 ext4_mb_add_n_trim(ac); 5441 } 5442 } 5443 5444 if (pa->pa_type == MB_INODE_PA) { 5445 /* 5446 * treat per-inode prealloc list as a lru list, then try 5447 * to trim the least recently used PA. 5448 */ 5449 spin_lock(pa->pa_obj_lock); 5450 list_move(&pa->pa_inode_list, &ei->i_prealloc_list); 5451 spin_unlock(pa->pa_obj_lock); 5452 } 5453 5454 ext4_mb_put_pa(ac, ac->ac_sb, pa); 5455 } 5456 if (ac->ac_bitmap_page) 5457 put_page(ac->ac_bitmap_page); 5458 if (ac->ac_buddy_page) 5459 put_page(ac->ac_buddy_page); 5460 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 5461 mutex_unlock(&ac->ac_lg->lg_mutex); 5462 ext4_mb_collect_stats(ac); 5463 ext4_mb_trim_inode_pa(inode); 5464 return 0; 5465 } 5466 5467 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 5468 { 5469 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 5470 int ret; 5471 int freed = 0, busy = 0; 5472 int retry = 0; 5473 5474 trace_ext4_mb_discard_preallocations(sb, needed); 5475 5476 if (needed == 0) 5477 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 5478 repeat: 5479 for (i = 0; i < ngroups && needed > 0; i++) { 5480 ret = ext4_mb_discard_group_preallocations(sb, i, &busy); 5481 freed += ret; 5482 needed -= ret; 5483 cond_resched(); 5484 } 5485 5486 if (needed > 0 && busy && ++retry < 3) { 5487 busy = 0; 5488 goto repeat; 5489 } 5490 5491 return freed; 5492 } 5493 5494 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, 5495 struct ext4_allocation_context *ac, u64 *seq) 5496 { 5497 int freed; 5498 u64 seq_retry = 0; 5499 bool ret = false; 5500 5501 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 5502 if (freed) { 5503 ret = true; 5504 goto out_dbg; 5505 } 5506 seq_retry = ext4_get_discard_pa_seq_sum(); 5507 if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { 5508 ac->ac_flags |= EXT4_MB_STRICT_CHECK; 5509 *seq = seq_retry; 5510 ret = true; 5511 } 5512 5513 out_dbg: 5514 mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? 
"yes" : "no"); 5515 return ret; 5516 } 5517 5518 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5519 struct ext4_allocation_request *ar, int *errp); 5520 5521 /* 5522 * Main entry point into mballoc to allocate blocks 5523 * it tries to use preallocation first, then falls back 5524 * to usual allocation 5525 */ 5526 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 5527 struct ext4_allocation_request *ar, int *errp) 5528 { 5529 struct ext4_allocation_context *ac = NULL; 5530 struct ext4_sb_info *sbi; 5531 struct super_block *sb; 5532 ext4_fsblk_t block = 0; 5533 unsigned int inquota = 0; 5534 unsigned int reserv_clstrs = 0; 5535 int retries = 0; 5536 u64 seq; 5537 5538 might_sleep(); 5539 sb = ar->inode->i_sb; 5540 sbi = EXT4_SB(sb); 5541 5542 trace_ext4_request_blocks(ar); 5543 if (sbi->s_mount_state & EXT4_FC_REPLAY) 5544 return ext4_mb_new_blocks_simple(handle, ar, errp); 5545 5546 /* Allow to use superuser reservation for quota file */ 5547 if (ext4_is_quota_file(ar->inode)) 5548 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 5549 5550 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { 5551 /* Without delayed allocation we need to verify 5552 * there is enough free blocks to do block allocation 5553 * and verify allocation doesn't exceed the quota limits. 5554 */ 5555 while (ar->len && 5556 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { 5557 5558 /* let others to free the space */ 5559 cond_resched(); 5560 ar->len = ar->len >> 1; 5561 } 5562 if (!ar->len) { 5563 ext4_mb_show_pa(sb); 5564 *errp = -ENOSPC; 5565 return 0; 5566 } 5567 reserv_clstrs = ar->len; 5568 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 5569 dquot_alloc_block_nofail(ar->inode, 5570 EXT4_C2B(sbi, ar->len)); 5571 } else { 5572 while (ar->len && 5573 dquot_alloc_block(ar->inode, 5574 EXT4_C2B(sbi, ar->len))) { 5575 5576 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 5577 ar->len--; 5578 } 5579 } 5580 inquota = ar->len; 5581 if (ar->len == 0) { 5582 *errp = -EDQUOT; 5583 goto out; 5584 } 5585 } 5586 5587 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); 5588 if (!ac) { 5589 ar->len = 0; 5590 *errp = -ENOMEM; 5591 goto out; 5592 } 5593 5594 *errp = ext4_mb_initialize_context(ac, ar); 5595 if (*errp) { 5596 ar->len = 0; 5597 goto out; 5598 } 5599 5600 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 5601 seq = this_cpu_read(discard_pa_seq); 5602 if (!ext4_mb_use_preallocated(ac)) { 5603 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 5604 ext4_mb_normalize_request(ac, ar); 5605 5606 *errp = ext4_mb_pa_alloc(ac); 5607 if (*errp) 5608 goto errout; 5609 repeat: 5610 /* allocate space in core */ 5611 *errp = ext4_mb_regular_allocator(ac); 5612 /* 5613 * pa allocated above is added to grp->bb_prealloc_list only 5614 * when we were able to allocate some block i.e. when 5615 * ac->ac_status == AC_STATUS_FOUND. 5616 * And error from above mean ac->ac_status != AC_STATUS_FOUND 5617 * So we have to free this pa here itself. 
5618 */ 5619 if (*errp) { 5620 ext4_mb_pa_free(ac); 5621 ext4_discard_allocated_blocks(ac); 5622 goto errout; 5623 } 5624 if (ac->ac_status == AC_STATUS_FOUND && 5625 ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) 5626 ext4_mb_pa_free(ac); 5627 } 5628 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 5629 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 5630 if (*errp) { 5631 ext4_discard_allocated_blocks(ac); 5632 goto errout; 5633 } else { 5634 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 5635 ar->len = ac->ac_b_ex.fe_len; 5636 } 5637 } else { 5638 if (++retries < 3 && 5639 ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) 5640 goto repeat; 5641 /* 5642 * If block allocation fails then the pa allocated above 5643 * needs to be freed here itself. 5644 */ 5645 ext4_mb_pa_free(ac); 5646 *errp = -ENOSPC; 5647 } 5648 5649 errout: 5650 if (*errp) { 5651 ac->ac_b_ex.fe_len = 0; 5652 ar->len = 0; 5653 ext4_mb_show_ac(ac); 5654 } 5655 ext4_mb_release_context(ac); 5656 out: 5657 if (ac) 5658 kmem_cache_free(ext4_ac_cachep, ac); 5659 if (inquota && ar->len < inquota) 5660 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 5661 if (!ar->len) { 5662 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) 5663 /* release all the reserved blocks if non delalloc */ 5664 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 5665 reserv_clstrs); 5666 } 5667 5668 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 5669 5670 return block; 5671 } 5672 5673 /* 5674 * We can merge two free data extents only if the physical blocks 5675 * are contiguous, AND the extents were freed by the same transaction, 5676 * AND the blocks are associated with the same group. 5677 */ 5678 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, 5679 struct ext4_free_data *entry, 5680 struct ext4_free_data *new_entry, 5681 struct rb_root *entry_rb_root) 5682 { 5683 if ((entry->efd_tid != new_entry->efd_tid) || 5684 (entry->efd_group != new_entry->efd_group)) 5685 return; 5686 if (entry->efd_start_cluster + entry->efd_count == 5687 new_entry->efd_start_cluster) { 5688 new_entry->efd_start_cluster = entry->efd_start_cluster; 5689 new_entry->efd_count += entry->efd_count; 5690 } else if (new_entry->efd_start_cluster + new_entry->efd_count == 5691 entry->efd_start_cluster) { 5692 new_entry->efd_count += entry->efd_count; 5693 } else 5694 return; 5695 spin_lock(&sbi->s_md_lock); 5696 list_del(&entry->efd_list); 5697 spin_unlock(&sbi->s_md_lock); 5698 rb_erase(&entry->efd_node, entry_rb_root); 5699 kmem_cache_free(ext4_free_data_cachep, entry); 5700 } 5701 5702 static noinline_for_stack int 5703 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 5704 struct ext4_free_data *new_entry) 5705 { 5706 ext4_group_t group = e4b->bd_group; 5707 ext4_grpblk_t cluster; 5708 ext4_grpblk_t clusters = new_entry->efd_count; 5709 struct ext4_free_data *entry; 5710 struct ext4_group_info *db = e4b->bd_info; 5711 struct super_block *sb = e4b->bd_sb; 5712 struct ext4_sb_info *sbi = EXT4_SB(sb); 5713 struct rb_node **n = &db->bb_free_root.rb_node, *node; 5714 struct rb_node *parent = NULL, *new_node; 5715 5716 BUG_ON(!ext4_handle_valid(handle)); 5717 BUG_ON(e4b->bd_bitmap_page == NULL); 5718 BUG_ON(e4b->bd_buddy_page == NULL); 5719 5720 new_node = &new_entry->efd_node; 5721 cluster = new_entry->efd_start_cluster; 5722 5723 if (!*n) { 5724 /* first free block exent. 
We need to 5725 protect buddy cache from being freed, 5726 * otherwise we'll refresh it from 5727 * on-disk bitmap and lose not-yet-available 5728 * blocks */ 5729 get_page(e4b->bd_buddy_page); 5730 get_page(e4b->bd_bitmap_page); 5731 } 5732 while (*n) { 5733 parent = *n; 5734 entry = rb_entry(parent, struct ext4_free_data, efd_node); 5735 if (cluster < entry->efd_start_cluster) 5736 n = &(*n)->rb_left; 5737 else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) 5738 n = &(*n)->rb_right; 5739 else { 5740 ext4_grp_locked_error(sb, group, 0, 5741 ext4_group_first_block_no(sb, group) + 5742 EXT4_C2B(sbi, cluster), 5743 "Block already on to-be-freed list"); 5744 kmem_cache_free(ext4_free_data_cachep, new_entry); 5745 return 0; 5746 } 5747 } 5748 5749 rb_link_node(new_node, parent, n); 5750 rb_insert_color(new_node, &db->bb_free_root); 5751 5752 /* Now try to see the extent can be merged to left and right */ 5753 node = rb_prev(new_node); 5754 if (node) { 5755 entry = rb_entry(node, struct ext4_free_data, efd_node); 5756 ext4_try_merge_freed_extent(sbi, entry, new_entry, 5757 &(db->bb_free_root)); 5758 } 5759 5760 node = rb_next(new_node); 5761 if (node) { 5762 entry = rb_entry(node, struct ext4_free_data, efd_node); 5763 ext4_try_merge_freed_extent(sbi, entry, new_entry, 5764 &(db->bb_free_root)); 5765 } 5766 5767 spin_lock(&sbi->s_md_lock); 5768 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); 5769 sbi->s_mb_free_pending += clusters; 5770 spin_unlock(&sbi->s_md_lock); 5771 return 0; 5772 } 5773 5774 /* 5775 * Simple allocator for Ext4 fast commit replay path. It searches for blocks 5776 * linearly starting at the goal block and also excludes the blocks which 5777 * are going to be in use after fast commit replay. 5778 */ 5779 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, 5780 struct ext4_allocation_request *ar, int *errp) 5781 { 5782 struct buffer_head *bitmap_bh; 5783 struct super_block *sb = ar->inode->i_sb; 5784 ext4_group_t group; 5785 ext4_grpblk_t blkoff; 5786 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 5787 ext4_grpblk_t i = 0; 5788 ext4_fsblk_t goal, block; 5789 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5790 5791 goal = ar->goal; 5792 if (goal < le32_to_cpu(es->s_first_data_block) || 5793 goal >= ext4_blocks_count(es)) 5794 goal = le32_to_cpu(es->s_first_data_block); 5795 5796 ar->len = 0; 5797 ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); 5798 for (; group < ext4_get_groups_count(sb); group++) { 5799 bitmap_bh = ext4_read_block_bitmap(sb, group); 5800 if (IS_ERR(bitmap_bh)) { 5801 *errp = PTR_ERR(bitmap_bh); 5802 pr_warn("Failed to read block bitmap\n"); 5803 return 0; 5804 } 5805 5806 ext4_get_group_no_and_offset(sb, 5807 max(ext4_group_first_block_no(sb, group), goal), 5808 NULL, &blkoff); 5809 while (1) { 5810 i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 5811 blkoff); 5812 if (i >= max) 5813 break; 5814 if (ext4_fc_replay_check_excluded(sb, 5815 ext4_group_first_block_no(sb, group) + i)) { 5816 blkoff = i + 1; 5817 } else 5818 break; 5819 } 5820 brelse(bitmap_bh); 5821 if (i < max) 5822 break; 5823 } 5824 5825 if (group >= ext4_get_groups_count(sb) || i >= max) { 5826 *errp = -ENOSPC; 5827 return 0; 5828 } 5829 5830 block = ext4_group_first_block_no(sb, group) + i; 5831 ext4_mb_mark_bb(sb, block, 1, 1); 5832 ar->len = 1; 5833 5834 return block; 5835 } 5836 5837 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, 5838 unsigned long count) 5839 { 5840 struct buffer_head *bitmap_bh; 5841 
struct super_block *sb = inode->i_sb; 5842 struct ext4_group_desc *gdp; 5843 struct buffer_head *gdp_bh; 5844 ext4_group_t group; 5845 ext4_grpblk_t blkoff; 5846 int already_freed = 0, err, i; 5847 5848 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 5849 bitmap_bh = ext4_read_block_bitmap(sb, group); 5850 if (IS_ERR(bitmap_bh)) { 5851 err = PTR_ERR(bitmap_bh); 5852 pr_warn("Failed to read block bitmap\n"); 5853 return; 5854 } 5855 gdp = ext4_get_group_desc(sb, group, &gdp_bh); 5856 if (!gdp) 5857 return; 5858 5859 for (i = 0; i < count; i++) { 5860 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) 5861 already_freed++; 5862 } 5863 mb_clear_bits(bitmap_bh->b_data, blkoff, count); 5864 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 5865 if (err) 5866 return; 5867 ext4_free_group_clusters_set( 5868 sb, gdp, ext4_free_group_clusters(sb, gdp) + 5869 count - already_freed); 5870 ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); 5871 ext4_group_desc_csum_set(sb, group, gdp); 5872 ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 5873 sync_dirty_buffer(bitmap_bh); 5874 sync_dirty_buffer(gdp_bh); 5875 brelse(bitmap_bh); 5876 } 5877 5878 /** 5879 * ext4_mb_clear_bb() -- helper function for freeing blocks. 5880 * Used by ext4_free_blocks() 5881 * @handle: handle for this transaction 5882 * @inode: inode 5883 * @block: starting physical block to be freed 5884 * @count: number of blocks to be freed 5885 * @flags: flags used by ext4_free_blocks 5886 */ 5887 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, 5888 ext4_fsblk_t block, unsigned long count, 5889 int flags) 5890 { 5891 struct buffer_head *bitmap_bh = NULL; 5892 struct super_block *sb = inode->i_sb; 5893 struct ext4_group_desc *gdp; 5894 unsigned int overflow; 5895 ext4_grpblk_t bit; 5896 struct buffer_head *gd_bh; 5897 ext4_group_t block_group; 5898 struct ext4_sb_info *sbi; 5899 struct ext4_buddy e4b; 5900 unsigned int count_clusters; 5901 int err = 0; 5902 int ret; 5903 5904 sbi = EXT4_SB(sb); 5905 5906 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 5907 !ext4_inode_block_valid(inode, block, count)) { 5908 ext4_error(sb, "Freeing blocks in system zone - " 5909 "Block = %llu, count = %lu", block, count); 5910 /* err = 0. ext4_std_error should be a no op */ 5911 goto error_return; 5912 } 5913 flags |= EXT4_FREE_BLOCKS_VALIDATED; 5914 5915 do_more: 5916 overflow = 0; 5917 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 5918 5919 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( 5920 ext4_get_group_info(sb, block_group)))) 5921 return; 5922 5923 /* 5924 * Check to see if we are freeing blocks across a group 5925 * boundary. 5926 */ 5927 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 5928 overflow = EXT4_C2B(sbi, bit) + count - 5929 EXT4_BLOCKS_PER_GROUP(sb); 5930 count -= overflow; 5931 /* The range changed so it's no longer validated */ 5932 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 5933 } 5934 count_clusters = EXT4_NUM_B2C(sbi, count); 5935 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 5936 if (IS_ERR(bitmap_bh)) { 5937 err = PTR_ERR(bitmap_bh); 5938 bitmap_bh = NULL; 5939 goto error_return; 5940 } 5941 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 5942 if (!gdp) { 5943 err = -EIO; 5944 goto error_return; 5945 } 5946 5947 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 5948 !ext4_inode_block_valid(inode, block, count)) { 5949 ext4_error(sb, "Freeing blocks in system zone - " 5950 "Block = %llu, count = %lu", block, count); 5951 /* err = 0. 
ext4_std_error should be a no op */ 5952 goto error_return; 5953 } 5954 5955 BUFFER_TRACE(bitmap_bh, "getting write access"); 5956 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 5957 EXT4_JTR_NONE); 5958 if (err) 5959 goto error_return; 5960 5961 /* 5962 * We are about to modify some metadata. Call the journal APIs 5963 * to unshare ->b_data if a currently-committing transaction is 5964 * using it 5965 */ 5966 BUFFER_TRACE(gd_bh, "get_write_access"); 5967 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 5968 if (err) 5969 goto error_return; 5970 #ifdef AGGRESSIVE_CHECK 5971 { 5972 int i; 5973 for (i = 0; i < count_clusters; i++) 5974 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 5975 } 5976 #endif 5977 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 5978 5979 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 5980 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, 5981 GFP_NOFS|__GFP_NOFAIL); 5982 if (err) 5983 goto error_return; 5984 5985 /* 5986 * We need to make sure we don't reuse the freed block until after the 5987 * transaction is committed. We make an exception if the inode is to be 5988 * written in writeback mode since writeback mode has weak data 5989 * consistency guarantees. 5990 */ 5991 if (ext4_handle_valid(handle) && 5992 ((flags & EXT4_FREE_BLOCKS_METADATA) || 5993 !ext4_should_writeback_data(inode))) { 5994 struct ext4_free_data *new_entry; 5995 /* 5996 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 5997 * to fail. 5998 */ 5999 new_entry = kmem_cache_alloc(ext4_free_data_cachep, 6000 GFP_NOFS|__GFP_NOFAIL); 6001 new_entry->efd_start_cluster = bit; 6002 new_entry->efd_group = block_group; 6003 new_entry->efd_count = count_clusters; 6004 new_entry->efd_tid = handle->h_transaction->t_tid; 6005 6006 ext4_lock_group(sb, block_group); 6007 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6008 ext4_mb_free_metadata(handle, &e4b, new_entry); 6009 } else { 6010 /* need to update group_info->bb_free and bitmap 6011 * with group lock held. 
generate_buddy look at 6012 * them with group lock_held 6013 */ 6014 if (test_opt(sb, DISCARD)) { 6015 err = ext4_issue_discard(sb, block_group, bit, count, 6016 NULL); 6017 if (err && err != -EOPNOTSUPP) 6018 ext4_msg(sb, KERN_WARNING, "discard request in" 6019 " group:%u block:%d count:%lu failed" 6020 " with %d", block_group, bit, count, 6021 err); 6022 } else 6023 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 6024 6025 ext4_lock_group(sb, block_group); 6026 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6027 mb_free_blocks(inode, &e4b, bit, count_clusters); 6028 } 6029 6030 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 6031 ext4_free_group_clusters_set(sb, gdp, ret); 6032 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); 6033 ext4_group_desc_csum_set(sb, block_group, gdp); 6034 ext4_unlock_group(sb, block_group); 6035 6036 if (sbi->s_log_groups_per_flex) { 6037 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6038 atomic64_add(count_clusters, 6039 &sbi_array_rcu_deref(sbi, s_flex_groups, 6040 flex_group)->free_clusters); 6041 } 6042 6043 /* 6044 * on a bigalloc file system, defer the s_freeclusters_counter 6045 * update to the caller (ext4_remove_space and friends) so they 6046 * can determine if a cluster freed here should be rereserved 6047 */ 6048 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { 6049 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 6050 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 6051 percpu_counter_add(&sbi->s_freeclusters_counter, 6052 count_clusters); 6053 } 6054 6055 ext4_mb_unload_buddy(&e4b); 6056 6057 /* We dirtied the bitmap block */ 6058 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6059 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6060 6061 /* And the group descriptor block */ 6062 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6063 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6064 if (!err) 6065 err = ret; 6066 6067 if (overflow && !err) { 6068 block += count; 6069 count = overflow; 6070 put_bh(bitmap_bh); 6071 /* The range changed so it's no longer validated */ 6072 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6073 goto do_more; 6074 } 6075 error_return: 6076 brelse(bitmap_bh); 6077 ext4_std_error(sb, err); 6078 return; 6079 } 6080 6081 /** 6082 * ext4_free_blocks() -- Free given blocks and update quota 6083 * @handle: handle for this transaction 6084 * @inode: inode 6085 * @bh: optional buffer of the block to be freed 6086 * @block: starting physical block to be freed 6087 * @count: number of blocks to be freed 6088 * @flags: flags used by ext4_free_blocks 6089 */ 6090 void ext4_free_blocks(handle_t *handle, struct inode *inode, 6091 struct buffer_head *bh, ext4_fsblk_t block, 6092 unsigned long count, int flags) 6093 { 6094 struct super_block *sb = inode->i_sb; 6095 unsigned int overflow; 6096 struct ext4_sb_info *sbi; 6097 6098 sbi = EXT4_SB(sb); 6099 6100 if (sbi->s_mount_state & EXT4_FC_REPLAY) { 6101 ext4_free_blocks_simple(inode, block, count); 6102 return; 6103 } 6104 6105 might_sleep(); 6106 if (bh) { 6107 if (block) 6108 BUG_ON(block != bh->b_blocknr); 6109 else 6110 block = bh->b_blocknr; 6111 } 6112 6113 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6114 !ext4_inode_block_valid(inode, block, count)) { 6115 ext4_error(sb, "Freeing blocks not in datazone - " 6116 "block = %llu, count = %lu", block, count); 6117 return; 6118 } 6119 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6120 6121 ext4_debug("freeing block %llu\n", block); 6122 trace_ext4_free_blocks(inode, block, count, 
flags); 6123 6124 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6125 BUG_ON(count > 1); 6126 6127 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 6128 inode, bh, block); 6129 } 6130 6131 /* 6132 * If the extent to be freed does not begin on a cluster 6133 * boundary, we need to deal with partial clusters at the 6134 * beginning and end of the extent. Normally we will free 6135 * blocks at the beginning or the end unless we are explicitly 6136 * requested to avoid doing so. 6137 */ 6138 overflow = EXT4_PBLK_COFF(sbi, block); 6139 if (overflow) { 6140 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { 6141 overflow = sbi->s_cluster_ratio - overflow; 6142 block += overflow; 6143 if (count > overflow) 6144 count -= overflow; 6145 else 6146 return; 6147 } else { 6148 block -= overflow; 6149 count += overflow; 6150 } 6151 /* The range changed so it's no longer validated */ 6152 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6153 } 6154 overflow = EXT4_LBLK_COFF(sbi, count); 6155 if (overflow) { 6156 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { 6157 if (count > overflow) 6158 count -= overflow; 6159 else 6160 return; 6161 } else 6162 count += sbi->s_cluster_ratio - overflow; 6163 /* The range changed so it's no longer validated */ 6164 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6165 } 6166 6167 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 6168 int i; 6169 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; 6170 6171 for (i = 0; i < count; i++) { 6172 cond_resched(); 6173 if (is_metadata) 6174 bh = sb_find_get_block(inode->i_sb, block + i); 6175 ext4_forget(handle, is_metadata, inode, bh, block + i); 6176 } 6177 } 6178 6179 ext4_mb_clear_bb(handle, inode, block, count, flags); 6180 return; 6181 } 6182 6183 /** 6184 * ext4_group_add_blocks() -- Add given blocks to an existing group 6185 * @handle: handle to this transaction 6186 * @sb: super block 6187 * @block: start physical block to add to the block group 6188 * @count: number of blocks to free 6189 * 6190 * This marks the blocks as free in the bitmap and buddy. 6191 */ 6192 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 6193 ext4_fsblk_t block, unsigned long count) 6194 { 6195 struct buffer_head *bitmap_bh = NULL; 6196 struct buffer_head *gd_bh; 6197 ext4_group_t block_group; 6198 ext4_grpblk_t bit; 6199 unsigned int i; 6200 struct ext4_group_desc *desc; 6201 struct ext4_sb_info *sbi = EXT4_SB(sb); 6202 struct ext4_buddy e4b; 6203 int err = 0, ret, free_clusters_count; 6204 ext4_grpblk_t clusters_freed; 6205 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); 6206 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); 6207 unsigned long cluster_count = last_cluster - first_cluster + 1; 6208 6209 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 6210 6211 if (count == 0) 6212 return 0; 6213 6214 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 6215 /* 6216 * Check to see if we are freeing blocks across a group 6217 * boundary. 
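 * Unlike ext4_mb_clear_bb(), this helper does not split such a request;
 * ranges that cross a group boundary are rejected with -EINVAL below.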
6218 */ 6219 if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { 6220 ext4_warning(sb, "too many blocks added to group %u", 6221 block_group); 6222 err = -EINVAL; 6223 goto error_return; 6224 } 6225 6226 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6227 if (IS_ERR(bitmap_bh)) { 6228 err = PTR_ERR(bitmap_bh); 6229 bitmap_bh = NULL; 6230 goto error_return; 6231 } 6232 6233 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 6234 if (!desc) { 6235 err = -EIO; 6236 goto error_return; 6237 } 6238 6239 if (!ext4_sb_block_valid(sb, NULL, block, count)) { 6240 ext4_error(sb, "Adding blocks in system zones - " 6241 "Block = %llu, count = %lu", 6242 block, count); 6243 err = -EINVAL; 6244 goto error_return; 6245 } 6246 6247 BUFFER_TRACE(bitmap_bh, "getting write access"); 6248 err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6249 EXT4_JTR_NONE); 6250 if (err) 6251 goto error_return; 6252 6253 /* 6254 * We are about to modify some metadata. Call the journal APIs 6255 * to unshare ->b_data if a currently-committing transaction is 6256 * using it 6257 */ 6258 BUFFER_TRACE(gd_bh, "get_write_access"); 6259 err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6260 if (err) 6261 goto error_return; 6262 6263 for (i = 0, clusters_freed = 0; i < cluster_count; i++) { 6264 BUFFER_TRACE(bitmap_bh, "clear bit"); 6265 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 6266 ext4_error(sb, "bit already cleared for block %llu", 6267 (ext4_fsblk_t)(block + i)); 6268 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 6269 } else { 6270 clusters_freed++; 6271 } 6272 } 6273 6274 err = ext4_mb_load_buddy(sb, block_group, &e4b); 6275 if (err) 6276 goto error_return; 6277 6278 /* 6279 * need to update group_info->bb_free and bitmap 6280 * with group lock held. generate_buddy look at 6281 * them with group lock_held 6282 */ 6283 ext4_lock_group(sb, block_group); 6284 mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); 6285 mb_free_blocks(NULL, &e4b, bit, cluster_count); 6286 free_clusters_count = clusters_freed + 6287 ext4_free_group_clusters(sb, desc); 6288 ext4_free_group_clusters_set(sb, desc, free_clusters_count); 6289 ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); 6290 ext4_group_desc_csum_set(sb, block_group, desc); 6291 ext4_unlock_group(sb, block_group); 6292 percpu_counter_add(&sbi->s_freeclusters_counter, 6293 clusters_freed); 6294 6295 if (sbi->s_log_groups_per_flex) { 6296 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6297 atomic64_add(clusters_freed, 6298 &sbi_array_rcu_deref(sbi, s_flex_groups, 6299 flex_group)->free_clusters); 6300 } 6301 6302 ext4_mb_unload_buddy(&e4b); 6303 6304 /* We dirtied the bitmap block */ 6305 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6306 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6307 6308 /* And the group descriptor block */ 6309 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6310 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6311 if (!err) 6312 err = ret; 6313 6314 error_return: 6315 brelse(bitmap_bh); 6316 ext4_std_error(sb, err); 6317 return err; 6318 } 6319 6320 /** 6321 * ext4_trim_extent -- function to TRIM one single free extent in the group 6322 * @sb: super block for the file system 6323 * @start: starting block of the free extent in the alloc. group 6324 * @count: number of blocks to TRIM 6325 * @e4b: ext4 buddy for the group 6326 * 6327 * Trim "count" blocks starting at "start" in the "group". 
 * To ensure that no one will allocate those blocks, mark them as used in the
 * buddy bitmap. This must be called under the group lock.
 */
static int ext4_trim_extent(struct super_block *sb,
		int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
	struct ext4_free_extent ex;
	ext4_group_t group = e4b->bd_group;
	int ret = 0;

	trace_ext4_trim_extent(sb, group, start, count);

	assert_spin_locked(ext4_group_lock_ptr(sb, group));

	ex.fe_start = start;
	ex.fe_group = group;
	ex.fe_len = count;

	/*
	 * Mark blocks used, so no one can reuse them while
	 * being trimmed.
	 */
	mb_mark_used(e4b, &ex);
	ext4_unlock_group(sb, group);
	ret = ext4_issue_discard(sb, group, start, count, NULL);
	ext4_lock_group(sb, group);
	mb_free_blocks(NULL, e4b, start, ex.fe_len);
	return ret;
}

static int ext4_try_to_trim_range(struct super_block *sb,
		struct ext4_buddy *e4b, ext4_grpblk_t start,
		ext4_grpblk_t max, ext4_grpblk_t minblocks)
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
	ext4_grpblk_t next, count, free_count;
	void *bitmap;

	bitmap = e4b->bd_bitmap;
	start = (e4b->bd_info->bb_first_free > start) ?
		e4b->bd_info->bb_first_free : start;
	count = 0;
	free_count = 0;

	while (start <= max) {
		start = mb_find_next_zero_bit(bitmap, max + 1, start);
		if (start > max)
			break;
		next = mb_find_next_bit(bitmap, max + 1, start);

		if ((next - start) >= minblocks) {
			int ret = ext4_trim_extent(sb, start, next - start, e4b);

			if (ret && ret != -EOPNOTSUPP)
				break;
			count += next - start;
		}
		free_count += next - start;
		start = next + 1;

		if (fatal_signal_pending(current)) {
			count = -ERESTARTSYS;
			break;
		}

		if (need_resched()) {
			ext4_unlock_group(sb, e4b->bd_group);
			cond_resched();
			ext4_lock_group(sb, e4b->bd_group);
		}

		if ((e4b->bd_info->bb_free - free_count) < minblocks)
			break;
	}

	return count;
}

/**
 * ext4_trim_all_free -- function to trim all free space in alloc. group
 * @sb: super block for file system
 * @group: group to be trimmed
 * @start: first group block to examine
 * @max: last group block to examine
 * @minblocks: minimum extent block count
 * @set_trimmed: set the trimmed flag if at least one block is trimmed
 *
 * ext4_trim_all_free walks through the group's block bitmap searching for
 * free extents. When a free extent is found, it is marked as used in the
 * group buddy bitmap, a TRIM command is issued on the extent, and the extent
 * is then freed again in the group buddy bitmap.
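 *
 * Return: number of clusters trimmed in this group, or a negative error code
 * on failure.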
6422 */ 6423 static ext4_grpblk_t 6424 ext4_trim_all_free(struct super_block *sb, ext4_group_t group, 6425 ext4_grpblk_t start, ext4_grpblk_t max, 6426 ext4_grpblk_t minblocks, bool set_trimmed) 6427 { 6428 struct ext4_buddy e4b; 6429 int ret; 6430 6431 trace_ext4_trim_all_free(sb, group, start, max); 6432 6433 ret = ext4_mb_load_buddy(sb, group, &e4b); 6434 if (ret) { 6435 ext4_warning(sb, "Error %d loading buddy information for %u", 6436 ret, group); 6437 return ret; 6438 } 6439 6440 ext4_lock_group(sb, group); 6441 6442 if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || 6443 minblocks < EXT4_SB(sb)->s_last_trim_minblks) { 6444 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); 6445 if (ret >= 0 && set_trimmed) 6446 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 6447 } else { 6448 ret = 0; 6449 } 6450 6451 ext4_unlock_group(sb, group); 6452 ext4_mb_unload_buddy(&e4b); 6453 6454 ext4_debug("trimmed %d blocks in the group %d\n", 6455 ret, group); 6456 6457 return ret; 6458 } 6459 6460 /** 6461 * ext4_trim_fs() -- trim ioctl handle function 6462 * @sb: superblock for filesystem 6463 * @range: fstrim_range structure 6464 * 6465 * start: First Byte to trim 6466 * len: number of Bytes to trim from start 6467 * minlen: minimum extent length in Bytes 6468 * ext4_trim_fs goes through all allocation groups containing Bytes from 6469 * start to start+len. For each such a group ext4_trim_all_free function 6470 * is invoked to trim all free space. 6471 */ 6472 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 6473 { 6474 unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); 6475 struct ext4_group_info *grp; 6476 ext4_group_t group, first_group, last_group; 6477 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 6478 uint64_t start, end, minlen, trimmed = 0; 6479 ext4_fsblk_t first_data_blk = 6480 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 6481 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); 6482 bool whole_group, eof = false; 6483 int ret = 0; 6484 6485 start = range->start >> sb->s_blocksize_bits; 6486 end = start + (range->len >> sb->s_blocksize_bits) - 1; 6487 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6488 range->minlen >> sb->s_blocksize_bits); 6489 6490 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || 6491 start >= max_blks || 6492 range->len < sb->s_blocksize) 6493 return -EINVAL; 6494 /* No point to try to trim less than discard granularity */ 6495 if (range->minlen < discard_granularity) { 6496 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 6497 discard_granularity >> sb->s_blocksize_bits); 6498 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) 6499 goto out; 6500 } 6501 if (end >= max_blks - 1) { 6502 end = max_blks - 1; 6503 eof = true; 6504 } 6505 if (end <= first_data_blk) 6506 goto out; 6507 if (start < first_data_blk) 6508 start = first_data_blk; 6509 6510 /* Determine first and last group to examine based on start and end */ 6511 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 6512 &first_group, &first_cluster); 6513 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, 6514 &last_group, &last_cluster); 6515 6516 /* end now represents the last cluster to discard in this group */ 6517 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6518 whole_group = true; 6519 6520 for (group = first_group; group <= last_group; group++) { 6521 grp = ext4_get_group_info(sb, group); 6522 /* We only do this if the grp has never been initialized */ 6523 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 6524 ret = ext4_mb_init_group(sb, group, GFP_NOFS); 6525 if (ret) 6526 break; 6527 } 
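		/*
		 * Groups with fewer free clusters than the requested minimum
		 * extent length cannot contain anything to trim, so the
		 * bb_free check below skips them entirely.
		 */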
6528 6529 /* 6530 * For all the groups except the last one, last cluster will 6531 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to 6532 * change it for the last group, note that last_cluster is 6533 * already computed earlier by ext4_get_group_no_and_offset() 6534 */ 6535 if (group == last_group) { 6536 end = last_cluster; 6537 whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6538 } 6539 if (grp->bb_free >= minlen) { 6540 cnt = ext4_trim_all_free(sb, group, first_cluster, 6541 end, minlen, whole_group); 6542 if (cnt < 0) { 6543 ret = cnt; 6544 break; 6545 } 6546 trimmed += cnt; 6547 } 6548 6549 /* 6550 * For every group except the first one, we are sure 6551 * that the first cluster to discard will be cluster #0. 6552 */ 6553 first_cluster = 0; 6554 } 6555 6556 if (!ret) 6557 EXT4_SB(sb)->s_last_trim_minblks = minlen; 6558 6559 out: 6560 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 6561 return ret; 6562 } 6563 6564 /* Iterate all the free extents in the group. */ 6565 int 6566 ext4_mballoc_query_range( 6567 struct super_block *sb, 6568 ext4_group_t group, 6569 ext4_grpblk_t start, 6570 ext4_grpblk_t end, 6571 ext4_mballoc_query_range_fn formatter, 6572 void *priv) 6573 { 6574 void *bitmap; 6575 ext4_grpblk_t next; 6576 struct ext4_buddy e4b; 6577 int error; 6578 6579 error = ext4_mb_load_buddy(sb, group, &e4b); 6580 if (error) 6581 return error; 6582 bitmap = e4b.bd_bitmap; 6583 6584 ext4_lock_group(sb, group); 6585 6586 start = (e4b.bd_info->bb_first_free > start) ? 6587 e4b.bd_info->bb_first_free : start; 6588 if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) 6589 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 6590 6591 while (start <= end) { 6592 start = mb_find_next_zero_bit(bitmap, end + 1, start); 6593 if (start > end) 6594 break; 6595 next = mb_find_next_bit(bitmap, end + 1, start); 6596 6597 ext4_unlock_group(sb, group); 6598 error = formatter(sb, group, start, next - start, priv); 6599 if (error) 6600 goto out_unload; 6601 ext4_lock_group(sb, group); 6602 6603 start = next + 1; 6604 } 6605 6606 ext4_unlock_group(sb, group); 6607 out_unload: 6608 ext4_mb_unload_buddy(&e4b); 6609 6610 return error; 6611 } 6612
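
/*
 * Usage sketch (not part of mballoc itself): ext4_mballoc_query_range()
 * invokes the formatter once per free extent found in the group, with the
 * group lock temporarily dropped around the callback. The callback and
 * variable names below are illustrative only:
 *
 *	static int count_free_cb(struct super_block *sb, ext4_group_t group,
 *				 ext4_grpblk_t start, ext4_grpblk_t len,
 *				 void *priv)
 *	{
 *		*(ext4_grpblk_t *)priv += len;
 *		return 0;
 *	}
 *
 *	ext4_grpblk_t free_clusters = 0;
 *	int err = ext4_mballoc_query_range(sb, group, 0,
 *					   EXT4_CLUSTERS_PER_GROUP(sb) - 1,
 *					   count_free_cb, &free_clusters);
 */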