/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


/*
 * mballoc.c contains the multiblock allocation routines
 */

#include "mballoc.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>

/*
 * MUSTDO:
 *  - test ext4_ext_search_left() and ext4_ext_search_right()
 *  - search for metadata in a few groups
 *
 * TODO v4:
 *  - normalization should take into account whether file is still open
 *  - discard preallocations if no free space left (policy?)
 *  - don't normalize tails
 *  - quota
 *  - reservation for superuser
 *
 * TODO v3:
 *  - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *  - track min/max extents in each group for better group selection
 *  - mb_mark_used() may allocate chunk right after splitting buddy
 *  - tree of groups sorted by number of free blocks
 *  - error handling
 */

/*
 * An allocation request asks for multiple blocks near the specified
 * goal block.
 *
 * During the initialization phase of the allocator we decide to use
 * group preallocation or inode preallocation depending on the size of
 * the file.  The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever is
 * larger.  If the size is less than sbi->s_mb_stream_request we select
 * group preallocation.  The default value of s_mb_stream_request is 16
 * blocks.  This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req.  The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation
 * is to keep small files close together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of
 * prealloc spaces for this particular inode.  An inode prealloc space
 * is represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   -> free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is selected by looking at the
 * _logical_ start block: only if the logical file block falls within
 * the range of a prealloc space do we consume that prealloc space.
 * This makes sure that we have contiguous physical blocks representing
 * the file blocks.
 *
 * The important thing to note in the case of inode prealloc space is
 * that we don't modify any of the values associated with an inode
 * prealloc space except pa_free.
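 *
 * Worked example (a sketch; the numbers are illustrative, not taken
 * from real data): suppose a PA has pa_lstart = 100, pa_pstart = 5000,
 * pa_len = 16 and pa_free = 16.  A request for logical block 105 falls
 * inside [100, 116), so this PA is used: block 105 maps to physical
 * block 5000 + (105 - 100) = 5005, and pa_free drops by the number of
 * clusters consumed.  A request for logical block 200 would not touch
 * this PA at all.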
 *
 * If we are not able to find blocks in the inode prealloc space and if
 * we have the group allocation flag set then we look at the locality
 * group prealloc space.  This is a per-CPU prealloc list, represented
 * as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-cpu locality group is to reduce the
 * contention between CPUs.  It is possible to get scheduled at this
 * point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc and/or locality group
 * prealloc then we look at the buddy cache.  The buddy cache is
 * represented by ext4_sb_info.s_buddy_cache (struct inode) whose file
 * offset gets mapped to the buddy and bitmap information regarding
 * different groups.  The buddy information is attached to the buddy
 * cache inode so that we can access it through the page cache.  The
 * information regarding each group is loaded via ext4_mb_load_buddy.
 * The information consists of the block bitmap and the buddy
 * information, stored in the inode as:
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group
 * we take up 2 blocks.  A page can contain blocks_per_page
 * (PAGE_CACHE_SIZE / blocksize) blocks, so it can hold information
 * regarding groups_per_page, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk.  The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache.  If we were
 * able to locate that many free blocks we return with additional
 * information regarding the rest of the contiguous physical blocks
 * available.
 *
 * Before allocating blocks via the buddy cache we normalize the
 * request blocks.  This ensures we ask for more blocks than we need.
 * The extra blocks that we get after allocation are added to the
 * respective prealloc list.  In the case of inode preallocation we
 * follow a list of heuristics based on file size.  This can be found
 * in ext4_mb_normalize_request.  If we are doing a group prealloc we
 * try to normalize the request to sbi->s_mb_group_prealloc.  The
 * default value of s_mb_group_prealloc depends on the cluster size;
 * for non-bigalloc file systems, it is 512 blocks.  This can be tuned
 * via /sys/fs/ext4/<partition>/mb_group_prealloc.  The value is
 * represented in terms of number of blocks.  If we have mounted the
 * file system with the -o stripe=<value> option the group prealloc
 * request is normalized to the smallest multiple of the stripe value
 * (sbi->s_stripe) which is greater than the default
 * mb_group_prealloc.
 *
 * The regular allocator (using the buddy cache) supports a few
 * tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses a buddy scan only if the request len is a
 * power of 2 blocks and the order of allocation is >=
 * sbi->s_mb_order2_reqs.  The value of s_mb_order2_reqs can be tuned
 * via /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is
 * equal to the stripe size (sbi->s_stripe), we try to search for
 * contiguous blocks in stripe size.  This should result in better
 * allocation on RAID setups.
 * If not, we search in the specific group using the bitmap for best
 * extents.  The tunables min_to_scan and max_to_scan control the
 * behaviour here.  min_to_scan indicates how long mballoc __must__
 * look for a best extent and max_to_scan indicates how long mballoc
 * __can__ look for a best extent among the found extents.  Searching
 * for the blocks starts with the group specified as the goal value in
 * the allocation context via ac_g_ex.  Each group is first checked
 * based on the criteria of whether it can be used for allocation.
 * ext4_mb_good_group explains how the groups are checked.
 *
 * Both prealloc spaces are populated as described above.  So for the
 * first request we will hit the buddy cache, which will result in this
 * prealloc space getting filled.  The prealloc space is then later
 * used for subsequent requests.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to specific inode and can be used for this inode only.
 *    it describes part of inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks number of blocks left
 *    unused. so, before taking some block from descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to specific locality group which does not translate to
 *    permanent set of inodes: inode can join and leave group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count number of
 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation because we can't know the actual
 *        used bits from the PA, only from the on-disk bitmap
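 *
 * worked example of the accounting above (a sketch with made-up
 * numbers): a group starts with 40 blocks marked used on disk and one
 * inode PA of 8 blocks. init buddy then shows 48 blocks used (40
 * on-disk + 8 PA). using 3 blocks from the PA moves them between the
 * structures (on-disk: 43, PA: 5) while the buddy total stays 48.
 * discarding the PA afterwards returns the 5 never-written blocks to
 * the buddy: buddy = 43 used, PA = 0.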
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores,
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if buddy is referenced, it's already initialized
 *  2) while block is used in buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 *     on-disk bitmap if buddy has same bit set and/or PA covers the
 *     corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA vs.
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few consequences:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA is referenced until the corresponding blocks are marked in the
 *    on-disk bitmap
 *  - a PA changes only after the on-disk bitmap does
 *  - discard must not compete with init. either init is done before
 *    any discard, or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness: no need to modify
 * the buddy in this case, but we should take care of concurrent init
 *
 */

/*
 * Logic in a few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group	(group)
 *  - object (inode/locality)	(object)
 *  - per-pa lock		(pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *        pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *        pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *        pa
 *    group
 *        object
 *
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
						ext4_group_t group);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
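
/*
 * Note on the mb_* bit helpers below (an illustration, not from the
 * original source): mb_correct_addr_and_bit() re-aligns an arbitrary
 * byte address down to an unsigned long boundary and compensates by
 * shifting the bit index up, because ext4_test_bit() and friends on
 * some architectures (e.g. powerpc) require long-aligned addresses.
 * For example, on a 64-bit machine, (addr = base + 6, bit = 3) becomes
 * (addr = base, bit = 6 * 8 + 3 = 51): the same physical bit,
 * addressed from the aligned base.
 */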
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
	*bit += ((unsigned long) addr & 7UL) << 3;
	addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
	*bit += ((unsigned long) addr & 3UL) << 3;
	addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
	return addr;
}

static inline int mb_test_bit(int bit, void *addr)
{
	/*
	 * ext4_test_bit on architectures like powerpc
	 * needs an unsigned long aligned address
	 */
	addr = mb_correct_addr_and_bit(&bit, addr);
	return ext4_test_bit(bit, addr);
}

static inline void mb_set_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_set_bit(bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_clear_bit(bit, addr);
}

static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static inline int mb_find_next_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
	char *bb;

	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
	BUG_ON(max == NULL);

	if (order > e4b->bd_blkbits + 1) {
		*max = 0;
		return NULL;
	}

	/* at order 0 we see each particular block */
	if (order == 0) {
		*max = 1 << (e4b->bd_blkbits + 3);
		return EXT4_MB_BITMAP(e4b);
	}

	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];

	return bb;
}
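
/*
 * Layout note (illustrative): at order 0, mb_find_buddy() returns the
 * block bitmap itself, with 1 << (bd_blkbits + 3) bits -- 32768 for
 * 4KiB blocks, one bit per block.  The higher orders live in the buddy
 * block at the precomputed s_mb_offsets[order], and each order needs
 * half as many bits as the order below it (one bit per pair of lower
 * chunks), all the way up to order bd_blkbits + 1.
 */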

#ifdef DOUBLE_CHECK
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
				   int first, int count)
{
	int i;
	struct super_block *sb = e4b->bd_sb;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
			ext4_fsblk_t blocknr;

			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
			ext4_grp_locked_error(sb, e4b->bd_group,
					      inode ? inode->i_ino : 0,
					      blocknr,
					      "freeing block already freed "
					      "(bit %u)",
					      first + i);
		}
		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
	int i;

	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
		return;
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	for (i = 0; i < count; i++) {
		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
	}
}

static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
		unsigned char *b1, *b2;
		int i;
		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
		b2 = (unsigned char *) bitmap;
		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
			if (b1[i] != b2[i]) {
				ext4_msg(e4b->bd_sb, KERN_ERR,
					 "corruption in group %u "
					 "at byte %u(%u): %x in copy != %x "
					 "on disk/prealloc",
					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
				BUG();
			}
		}
	}
}

#else
static inline void mb_free_blocks_double(struct inode *inode,
				struct ext4_buddy *e4b, int first, int count)
{
	return;
}
static inline void mb_mark_used_double(struct ext4_buddy *e4b,
						int first, int count)
{
	return;
}
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
	return;
}
#endif

#ifdef AGGRESSIVE_CHECK

#define MB_CHECK_ASSERT(assert)						\
do {									\
	if (!(assert)) {						\
		printk(KERN_EMERG					\
			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
			function, file, line, # assert);		\
		BUG();							\
	}								\
} while (0)

static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
				const char *function, int line)
{
	struct super_block *sb = e4b->bd_sb;
	int order = e4b->bd_blkbits + 1;
	int max;
	int max2;
	int i;
	int j;
	int k;
	int count;
	struct ext4_group_info *grp;
	int fragments = 0;
	int fstart;
	struct list_head *cur;
	void *buddy;
	void *buddy2;

	{
		static int mb_check_counter;
		if (mb_check_counter++ % 100 != 0)
			return 0;
	}

	while (order > 1) {
		buddy = mb_find_buddy(e4b, order, &max);
		MB_CHECK_ASSERT(buddy);
		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
		MB_CHECK_ASSERT(buddy2);
		MB_CHECK_ASSERT(buddy != buddy2);
		MB_CHECK_ASSERT(max * 2 == max2);

		count = 0;
		for (i = 0; i < max; i++) {

			if (mb_test_bit(i, buddy)) {
				/* only single bit in buddy2 may be 1 */
				if (!mb_test_bit(i << 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit((i<<1)+1, buddy2));
				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit(i << 1, buddy2));
				}
				continue;
			}

			/* both bits in buddy2 must be 1 */
			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

			for (j = 0; j < (1 << order); j++) {
				k = (i * (1 << order)) + j;
				MB_CHECK_ASSERT(
					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
			}
			count++;
		}
		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
		order--;
	}

	fstart = -1;
	buddy = mb_find_buddy(e4b, 0, &max);
	for (i = 0; i < max; i++) {
		if (!mb_test_bit(i, buddy)) {
			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
			if (fstart == -1) {
				fragments++;
				fstart = i;
			}
			continue;
		}
		fstart = -1;
		/* check used bits only */
		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
			buddy2 = mb_find_buddy(e4b, j, &max2);
			k = i >> j;
			MB_CHECK_ASSERT(k < max2);
			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
		}
	}
	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

	grp = ext4_get_group_info(sb, e4b->bd_group);
	list_for_each(cur, &grp->bb_prealloc_list) {
		ext4_group_t groupnr;
		struct ext4_prealloc_space *pa;
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
		for (i = 0; i < pa->pa_len; i++)
			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
	}
	return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
					__FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif

/*
 * Divide the blocks starting at @first, of length @len, into smaller
 * chunks, each a power-of-2 number of blocks in size.
 * Clear the bits in the buddy bitmap which the blocks of the chunk(s)
 * cover, then increase bb_counters[] for the corresponding chunk size.
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
					struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_grpblk_t min;
	ext4_grpblk_t max;
	ext4_grpblk_t chunk;
	unsigned short border;

	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));

	border = 2 << sb->s_blocksize_bits;

	while (len > 0) {
		/* find how many blocks can be covered since this position */
		max = ffs(first | border) - 1;

		/* find how many blocks of power 2 we need to mark */
		min = fls(len) - 1;

		if (max < min)
			min = max;
		chunk = 1 << min;

		/* mark multiblock chunks only */
		grp->bb_counters[min]++;
		if (min > 0)
			mb_clear_bit(first >> min,
				     buddy + sbi->s_mb_offsets[min]);

		len -= chunk;
		first += chunk;
	}
}
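
/*
 * Worked example for ext4_mb_mark_free_simple() (illustrative): with
 * 4KiB blocks, border = 2 << 12 = 8192.  Marking the free extent
 * first = 5, len = 7 proceeds as:
 *  - first 5: max = ffs(5 | 8192) - 1 = 0, so alignment only allows an
 *    order-0 chunk; bb_counters[0]++, advance to first = 6, len = 6.
 *  - first 6: max = 1, min = fls(6) - 1 = 2, clamped to 1; clear bit
 *    6 >> 1 = 3 in the order-1 buddy, advance to first = 8, len = 4.
 *  - first 8: max = 3, min = fls(4) - 1 = 2; clear bit 8 >> 2 = 2 in
 *    the order-2 buddy, len = 0, done.
 * So the extent [5, 12) is recorded as chunks of 1 + 2 + 4 blocks.
 */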

/*
 * Cache the order of the largest free extent we have available in this block
 * group.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
	int i;
	int bits;

	grp->bb_largest_free_order = -1; /* uninit */

	bits = sb->s_blocksize_bits + 1;
	for (i = bits; i >= 0; i--) {
		if (grp->bb_counters[i] > 0) {
			grp->bb_largest_free_order = i;
			break;
		}
	}
}

static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
				void *buddy, void *bitmap, ext4_group_t group)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
	ext4_grpblk_t i = 0;
	ext4_grpblk_t first;
	ext4_grpblk_t len;
	unsigned free = 0;
	unsigned fragments = 0;
	unsigned long long period = get_cycles();

	/* initialize buddy from bitmap which is aggregation
	 * of on-disk bitmap and preallocations */
	i = mb_find_next_zero_bit(bitmap, max, 0);
	grp->bb_first_free = i;
	while (i < max) {
		fragments++;
		first = i;
		i = mb_find_next_bit(bitmap, max, i);
		len = i - first;
		free += len;
		if (len > 1)
			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
		else
			grp->bb_counters[0]++;
		if (i < max)
			i = mb_find_next_zero_bit(bitmap, max, i);
	}
	grp->bb_fragments = fragments;

	if (free != grp->bb_free) {
		ext4_grp_locked_error(sb, group, 0, 0,
				      "%u clusters in bitmap, %u in gd",
				      free, grp->bb_free);
		/*
		 * If we intend to continue, we consider the group descriptor
		 * corrupt and update bb_free using the bitmap value
		 */
		grp->bb_free = free;
	}
	mb_set_largest_free_order(sb, grp);

	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

	period = get_cycles() - period;
	spin_lock(&EXT4_SB(sb)->s_bal_lock);
	EXT4_SB(sb)->s_mb_buddies_generated++;
	EXT4_SB(sb)->s_mb_generation_time += period;
	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}

/* The buddy information is attached to the buddy cache inode
 * for convenience. The information regarding each group
 * is loaded via ext4_mb_load_buddy. The information consists
 * of the block bitmap and the buddy information, stored in
 * the inode as:
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
 * So it can have information regarding groups_per_page, which
 * is blocks_per_page/2.
 *
 * Locking note: This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */
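
/*
 * Layout example (illustrative, assuming 4KiB pages and 1KiB blocks):
 * blocks_per_page = 4096 / 1024 = 4, so groups_per_page = 2.  Page 0
 * of the buddy cache then holds [g0 bitmap][g0 buddy][g1 bitmap]
 * [g1 buddy], page 1 holds groups 2 and 3, and in general group g's
 * bitmap lives in cache block 2*g with its buddy in block 2*g + 1.
 */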

static int ext4_mb_init_cache(struct page *page, char *incore)
{
	ext4_group_t ngroups;
	int blocksize;
	int blocks_per_page;
	int groups_per_page;
	int err = 0;
	int i;
	ext4_group_t first_group;
	int first_block;
	struct super_block *sb;
	struct buffer_head *bhs;
	struct buffer_head **bh;
	struct inode *inode;
	char *data;
	char *bitmap;
	struct ext4_group_info *grinfo;

	mb_debug(1, "init page %lu\n", page->index);

	inode = page->mapping->host;
	sb = inode->i_sb;
	ngroups = ext4_get_groups_count(sb);
	blocksize = 1 << inode->i_blkbits;
	blocks_per_page = PAGE_CACHE_SIZE / blocksize;

	groups_per_page = blocks_per_page >> 1;
	if (groups_per_page == 0)
		groups_per_page = 1;

	/* allocate buffer_heads to read bitmaps */
	if (groups_per_page > 1) {
		err = -ENOMEM;
		i = sizeof(struct buffer_head *) * groups_per_page;
		bh = kzalloc(i, GFP_NOFS);
		if (bh == NULL)
			goto out;
	} else
		bh = &bhs;

	first_group = page->index * blocks_per_page / 2;

	/* read all groups the page covers into the cache */
	for (i = 0; i < groups_per_page; i++) {
		struct ext4_group_desc *desc;

		if (first_group + i >= ngroups)
			break;

		grinfo = ext4_get_group_info(sb, first_group + i);
		/*
		 * If page is uptodate then we came here after online resize
		 * which added some new uninitialized group info structs, so
		 * we must skip all initialized uptodate buddies on the page,
		 * which may be currently in use by an allocating task.
		 */
		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
			bh[i] = NULL;
			continue;
		}

		err = -EIO;
		desc = ext4_get_group_desc(sb, first_group + i, NULL);
		if (desc == NULL)
			goto out;

		err = -ENOMEM;
		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
		if (bh[i] == NULL)
			goto out;

		if (bitmap_uptodate(bh[i]))
			continue;

		lock_buffer(bh[i]);
		if (bitmap_uptodate(bh[i])) {
			unlock_buffer(bh[i]);
			continue;
		}
		ext4_lock_group(sb, first_group + i);
		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
			ext4_init_block_bitmap(sb, bh[i],
						first_group + i, desc);
			set_bitmap_uptodate(bh[i]);
			set_buffer_uptodate(bh[i]);
			ext4_unlock_group(sb, first_group + i);
			unlock_buffer(bh[i]);
			continue;
		}
		ext4_unlock_group(sb, first_group + i);
		if (buffer_uptodate(bh[i])) {
			/*
			 * if not uninit, and the bh is uptodate,
			 * the bitmap is also uptodate
			 */
			set_bitmap_uptodate(bh[i]);
			unlock_buffer(bh[i]);
			continue;
		}
		get_bh(bh[i]);
		/*
		 * submit the buffer_head for read. We can
		 * safely mark the bitmap as uptodate now.
		 * We do it here so the bitmap uptodate bit
		 * gets set with the buffer lock held.
		 */
		set_bitmap_uptodate(bh[i]);
		bh[i]->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh[i]);
		mb_debug(1, "read bitmap for group %u\n", first_group + i);
	}

	/* wait for I/O completion */
	for (i = 0; i < groups_per_page; i++)
		if (bh[i])
			wait_on_buffer(bh[i]);

	err = -EIO;
	for (i = 0; i < groups_per_page; i++)
		if (bh[i] && !buffer_uptodate(bh[i]))
			goto out;

	err = 0;
	first_block = page->index * blocks_per_page;
	for (i = 0; i < blocks_per_page; i++) {
		int group;

		group = (first_block + i) >> 1;
		if (group >= ngroups)
			break;

		if (!bh[group - first_group])
			/* skip initialized uptodate buddy */
			continue;

		/*
		 * data carries information regarding this
		 * particular group in the format specified
		 * above
		 *
		 */
		data = page_address(page) + (i * blocksize);
		bitmap = bh[group - first_group]->b_data;

		/*
		 * We place the buddy block and bitmap block
		 * close together
		 */
		if ((first_block + i) & 1) {
			/* this is block of buddy */
			BUG_ON(incore == NULL);
			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_buddy_bitmap_load(sb, group);
			grinfo = ext4_get_group_info(sb, group);
			grinfo->bb_fragments = 0;
			memset(grinfo->bb_counters, 0,
			       sizeof(*grinfo->bb_counters) *
				(sb->s_blocksize_bits+2));
			/*
			 * incore got set to the group block bitmap below
			 */
			ext4_lock_group(sb, group);
			/* init the buddy */
			memset(data, 0xff, blocksize);
			ext4_mb_generate_buddy(sb, data, incore, group);
			ext4_unlock_group(sb, group);
			incore = NULL;
		} else {
			/* this is block of bitmap */
			BUG_ON(incore != NULL);
			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
				group, page->index, i * blocksize);
			trace_ext4_mb_bitmap_load(sb, group);

			/* see comments in ext4_mb_put_pa() */
			ext4_lock_group(sb, group);
			memcpy(data, bitmap, blocksize);

			/* mark all preallocated blks used in in-core bitmap */
			ext4_mb_generate_from_pa(sb, data, group);
			ext4_mb_generate_from_freelist(sb, data, group);
			ext4_unlock_group(sb, group);

			/* set incore so that the buddy information can be
			 * generated using this
			 */
			incore = data;
		}
	}
	SetPageUptodate(page);

out:
	if (bh) {
		for (i = 0; i < groups_per_page; i++)
			brelse(bh[i]);
		if (bh != &bhs)
			kfree(bh);
	}
	return err;
}

/*
 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
 * on the same buddy page doesn't happen while we hold the buddy page lock.
 * Return the locked buddy and bitmap pages on the e4b struct. If buddy and
 * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return
 * value is 0.
 */
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
		ext4_group_t group, struct ext4_buddy *e4b)
{
	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
	int block, pnum, poff;
	int blocks_per_page;
	struct page *page;

	e4b->bd_buddy_page = NULL;
	e4b->bd_bitmap_page = NULL;

	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
	if (!page)
		return -EIO;
	BUG_ON(page->mapping != inode->i_mapping);
	e4b->bd_bitmap_page = page;
	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);

	if (blocks_per_page >= 2) {
		/* buddy and bitmap are on the same page */
		return 0;
	}

	block++;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;
	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
	if (!page)
		return -EIO;
	BUG_ON(page->mapping != inode->i_mapping);
	e4b->bd_buddy_page = page;
	return 0;
}

static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
	if (e4b->bd_bitmap_page) {
		unlock_page(e4b->bd_bitmap_page);
		page_cache_release(e4b->bd_bitmap_page);
	}
	if (e4b->bd_buddy_page) {
		unlock_page(e4b->bd_buddy_page);
		page_cache_release(e4b->bd_buddy_page);
	}
}

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{

	struct ext4_group_info *this_grp;
	struct ext4_buddy e4b;
	struct page *page;
	int ret = 0;

	mb_debug(1, "init group %u\n", group);
	this_grp = ext4_get_group_info(sb, group);
	/*
	 * This ensures that we don't reinit the buddy cache
	 * page which maps to the group from which we are already
	 * allocating. If we are looking at the buddy cache we would
	 * have taken a reference using ext4_mb_load_buddy and that
	 * would have pinned the buddy page in the page cache.
	 */
	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
		/*
		 * somebody initialized the group;
		 * return without doing anything
		 */
		goto err;
	}

	page = e4b.bd_bitmap_page;
	ret = ext4_mb_init_cache(page, NULL);
	if (ret)
		goto err;
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	mark_page_accessed(page);

	if (e4b.bd_buddy_page == NULL) {
		/*
		 * If both the bitmap and buddy are in
		 * the same page we don't need to force
		 * init the buddy
		 */
		ret = 0;
		goto err;
	}
	/* init buddy cache */
	page = e4b.bd_buddy_page;
	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
	if (ret)
		goto err;
	if (!PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	mark_page_accessed(page);
err:
	ext4_mb_put_buddy_page_lock(&e4b);
	return ret;
}

/*
 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
					struct ext4_buddy *e4b)
{
	int blocks_per_page;
	int block;
	int pnum;
	int poff;
	struct page *page;
	int ret;
	struct ext4_group_info *grp;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct inode *inode = sbi->s_buddy_cache;

	mb_debug(1, "load group %u\n", group);

	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	grp = ext4_get_group_info(sb, group);

	e4b->bd_blkbits = sb->s_blocksize_bits;
	e4b->bd_info = grp;
	e4b->bd_sb = sb;
	e4b->bd_group = group;
	e4b->bd_buddy_page = NULL;
	e4b->bd_bitmap_page = NULL;

	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		/*
		 * we need full data about the group
		 * to make a good selection
		 */
		ret = ext4_mb_init_group(sb, group);
		if (ret)
			return ret;
	}

	/*
	 * the buddy cache inode stores the block bitmap
	 * and buddy information in consecutive blocks.
	 * So for each group we need two blocks.
	 */
	block = group * 2;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	/* we could use find_or_create_page(), but it locks the page,
	 * which we'd like to avoid on the fast path ... */
	page = find_get_page(inode->i_mapping, pnum);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			/*
			 * drop the page reference and try
			 * to get the page with lock. If we
			 * are not uptodate that implies
			 * somebody just created the page but
			 * is yet to initialize it. So
			 * wait for it to initialize.
			 */
			page_cache_release(page);
		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
		if (page) {
			BUG_ON(page->mapping != inode->i_mapping);
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, NULL);
				if (ret) {
					unlock_page(page);
					goto err;
				}
				mb_cmp_bitmaps(e4b, page_address(page) +
					       (poff * sb->s_blocksize));
			}
			unlock_page(page);
		}
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	e4b->bd_bitmap_page = page;
	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
	mark_page_accessed(page);

	block++;
	pnum = block / blocks_per_page;
	poff = block % blocks_per_page;

	page = find_get_page(inode->i_mapping, pnum);
	if (page == NULL || !PageUptodate(page)) {
		if (page)
			page_cache_release(page);
		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
		if (page) {
			BUG_ON(page->mapping != inode->i_mapping);
			if (!PageUptodate(page)) {
				ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
				if (ret) {
					unlock_page(page);
					goto err;
				}
			}
			unlock_page(page);
		}
	}
	if (page == NULL || !PageUptodate(page)) {
		ret = -EIO;
		goto err;
	}
	e4b->bd_buddy_page = page;
	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
	mark_page_accessed(page);

	BUG_ON(e4b->bd_bitmap_page == NULL);
	BUG_ON(e4b->bd_buddy_page == NULL);

	return 0;

err:
	if (page)
		page_cache_release(page);
	if (e4b->bd_bitmap_page)
		page_cache_release(e4b->bd_bitmap_page);
	if (e4b->bd_buddy_page)
		page_cache_release(e4b->bd_buddy_page);
	e4b->bd_buddy = NULL;
	e4b->bd_bitmap = NULL;
	return ret;
}
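
/*
 * Usage pattern (a sketch, mirroring callers such as
 * ext4_mb_try_best_found() below): load the buddy, take the group
 * lock, operate on e4b->bd_bitmap / e4b->bd_buddy, then unlock and
 * unload:
 *
 *	err = ext4_mb_load_buddy(sb, group, &e4b);
 *	if (err)
 *		return err;
 *	ext4_lock_group(sb, group);
 *	... examine or modify the in-core bitmap and buddy ...
 *	ext4_unlock_group(sb, group);
 *	ext4_mb_unload_buddy(&e4b);
 */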

static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
	if (e4b->bd_bitmap_page)
		page_cache_release(e4b->bd_bitmap_page);
	if (e4b->bd_buddy_page)
		page_cache_release(e4b->bd_buddy_page);
}


static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
	int order = 1;
	void *bb;

	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

	bb = EXT4_MB_BUDDY(e4b);
	while (order <= e4b->bd_blkbits + 1) {
		block = block >> 1;
		if (!mb_test_bit(block, bb)) {
			/* this block is part of buddy of order 'order' */
			return order;
		}
		bb += 1 << (e4b->bd_blkbits - order);
		order++;
	}
	return 0;
}

static void mb_clear_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: clear whole word at once */
			addr = bm + (cur >> 3);
			*addr = 0;
			cur += 32;
			continue;
		}
		mb_clear_bit(cur, bm);
		cur++;
	}
}

void ext4_set_bits(void *bm, int cur, int len)
{
	__u32 *addr;

	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: set whole word at once */
			addr = bm + (cur >> 3);
			*addr = 0xffffffff;
			cur += 32;
			continue;
		}
		mb_set_bit(cur, bm);
		cur++;
	}
}
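
/*
 * Note on the two helpers above (illustrative): a call such as
 * mb_clear_bits(bm, 0, 100) clears bits 0-31, 32-63 and 64-95 as whole
 * 32-bit word stores and only bits 96-99 individually.  The word
 * stores are plain (non-atomic) writes, so callers are expected to
 * hold the group lock while modifying the bitmap, as the
 * assert_spin_locked() checks in the callers below suggest.
 */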

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
			   int first, int count)
{
	int block = 0;
	int max = 0;
	int order;
	void *buddy;
	void *buddy2;
	struct super_block *sb = e4b->bd_sb;

	BUG_ON(first + count > (sb->s_blocksize << 3));
	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
	mb_check_buddy(e4b);
	mb_free_blocks_double(inode, e4b, first, count);

	e4b->bd_info->bb_free += count;
	if (first < e4b->bd_info->bb_first_free)
		e4b->bd_info->bb_first_free = first;

	/* let's maintain fragments counter */
	if (first != 0)
		block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
		max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
	if (block && max)
		e4b->bd_info->bb_fragments--;
	else if (!block && !max)
		e4b->bd_info->bb_fragments++;

	/* let's maintain buddy itself */
	while (count-- > 0) {
		block = first++;
		order = 0;

		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
			ext4_fsblk_t blocknr;

			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
			blocknr += EXT4_C2B(EXT4_SB(sb), block);
			ext4_grp_locked_error(sb, e4b->bd_group,
					      inode ? inode->i_ino : 0,
					      blocknr,
					      "freeing already freed block "
					      "(bit %u)", block);
		}
		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
		e4b->bd_info->bb_counters[order]++;

		/* start of the buddy */
		buddy = mb_find_buddy(e4b, order, &max);

		do {
			block &= ~1UL;
			if (mb_test_bit(block, buddy) ||
					mb_test_bit(block + 1, buddy))
				break;

			/* both the buddies are free, try to coalesce them */
			buddy2 = mb_find_buddy(e4b, order + 1, &max);

			if (!buddy2)
				break;

			if (order > 0) {
				/* for special purposes, we don't set
				 * free bits in bitmap */
				mb_set_bit(block, buddy);
				mb_set_bit(block + 1, buddy);
			}
			e4b->bd_info->bb_counters[order]--;
			e4b->bd_info->bb_counters[order]--;

			block = block >> 1;
			order++;
			e4b->bd_info->bb_counters[order]++;

			mb_clear_bit(block, buddy2);
			buddy = buddy2;
		} while (1);
	}
	mb_set_largest_free_order(sb, e4b->bd_info);
	mb_check_buddy(e4b);
}

static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
				int needed, struct ext4_free_extent *ex)
{
	int next = block;
	int max;
	void *buddy;

	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	BUG_ON(ex == NULL);

	buddy = mb_find_buddy(e4b, order, &max);
	BUG_ON(buddy == NULL);
	BUG_ON(block >= max);
	if (mb_test_bit(block, buddy)) {
		ex->fe_len = 0;
		ex->fe_start = 0;
		ex->fe_group = 0;
		return 0;
	}

	/* FIXME: drop order completely? */
	if (likely(order == 0)) {
		/* find actual order */
		order = mb_find_order_for_block(e4b, block);
		block = block >> order;
	}

	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;

	while (needed > ex->fe_len &&
	       (buddy = mb_find_buddy(e4b, order, &max))) {

		if (block + 1 >= max)
			break;

		next = (block + 1) * (1 << order);
		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
			break;

		order = mb_find_order_for_block(e4b, next);

		block = next >> order;
		ex->fe_len += 1 << order;
	}

	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
	return ex->fe_len;
}
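
/*
 * Worked example for mb_mark_used() below (illustrative): marking
 * start = 8, len = 5 inside a free order-3 chunk [8..15] first splits
 * the order-3 buddy into two order-2 chunks, takes [8..11] whole, then
 * splits [12..15] into order-1 and order-0 pieces until block 12 can
 * be taken on its own.  The result is blocks 8-12 used, block 13 left
 * free at order 0 and blocks 14-15 left free at order 1.
 */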

static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{
	int ord;
	int mlen = 0;
	int max = 0;
	int cur;
	int start = ex->fe_start;
	int len = ex->fe_len;
	unsigned ret = 0;
	int len0 = len;
	void *buddy;

	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
	BUG_ON(e4b->bd_group != ex->fe_group);
	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	mb_check_buddy(e4b);
	mb_mark_used_double(e4b, start, len);

	e4b->bd_info->bb_free -= len;
	if (e4b->bd_info->bb_first_free == start)
		e4b->bd_info->bb_first_free += len;

	/* let's maintain fragments counter */
	if (start != 0)
		mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
		max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
	if (mlen && max)
		e4b->bd_info->bb_fragments++;
	else if (!mlen && !max)
		e4b->bd_info->bb_fragments--;

	/* let's maintain buddy itself */
	while (len) {
		ord = mb_find_order_for_block(e4b, start);

		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
			/* the whole chunk may be allocated at once! */
			mlen = 1 << ord;
			buddy = mb_find_buddy(e4b, ord, &max);
			BUG_ON((start >> ord) >= max);
			mb_set_bit(start >> ord, buddy);
			e4b->bd_info->bb_counters[ord]--;
			start += mlen;
			len -= mlen;
			BUG_ON(len < 0);
			continue;
		}

		/* store for history */
		if (ret == 0)
			ret = len | (ord << 16);

		/* we have to split large buddy */
		BUG_ON(ord <= 0);
		buddy = mb_find_buddy(e4b, ord, &max);
		mb_set_bit(start >> ord, buddy);
		e4b->bd_info->bb_counters[ord]--;

		ord--;
		cur = (start >> ord) & ~1U;
		buddy = mb_find_buddy(e4b, ord, &max);
		mb_clear_bit(cur, buddy);
		mb_clear_bit(cur + 1, buddy);
		e4b->bd_info->bb_counters[ord]++;
		e4b->bd_info->bb_counters[ord]++;
	}
	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

	ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
	mb_check_buddy(e4b);

	return ret;
}

/*
 * Must be called under group lock!
 */
static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int ret;

	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
	ret = mb_mark_used(e4b, &ac->ac_b_ex);

	/* preallocation can change ac_b_ex, thus we store actually
	 * allocated blocks for history */
	ac->ac_f_ex = ac->ac_b_ex;

	ac->ac_status = AC_STATUS_FOUND;
	ac->ac_tail = ret & 0xffff;
	ac->ac_buddy = ret >> 16;

	/*
	 * take the page reference. We want the page to be pinned
	 * so that we don't get an ext4_mb_init_cache() call for this
	 * group until we update the bitmap. That would mean we
	 * double allocate blocks.
	 * The reference is dropped
	 * in ext4_mb_release_context
	 */
	ac->ac_bitmap_page = e4b->bd_bitmap_page;
	get_page(ac->ac_bitmap_page);
	ac->ac_buddy_page = e4b->bd_buddy_page;
	get_page(ac->ac_buddy_page);
	/* store last allocated for subsequent stream allocation */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		spin_lock(&sbi->s_md_lock);
		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
		spin_unlock(&sbi->s_md_lock);
	}
}

/*
 * regular allocator, for general purposes allocation
 */

static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b,
					int finish_group)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;
	struct ext4_free_extent ex;
	int max;

	if (ac->ac_status == AC_STATUS_FOUND)
		return;
	/*
	 * We don't want to scan for a whole year
	 */
	if (ac->ac_found > sbi->s_mb_max_to_scan &&
			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		ac->ac_status = AC_STATUS_BREAK;
		return;
	}

	/*
	 * Haven't found a good chunk so far, let's continue
	 */
	if (bex->fe_len < gex->fe_len)
		return;

	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
			&& bex->fe_group == e4b->bd_group) {
		/* recheck chunk's availability - we don't know
		 * when it was found (within this lock-unlock
		 * period or not) */
		max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
		if (max >= gex->fe_len) {
			ext4_mb_use_best_found(ac, e4b);
			return;
		}
	}
}

/*
 * The routine checks whether the found extent is good enough. If it is,
 * then the extent gets marked used and the flag is set in the context
 * to stop scanning. Otherwise, the extent is compared with the
 * previously found extent and if the new one is better, then it's stored
 * in the context. Later, the best found extent will be used, if
 * mballoc can't find a good enough extent.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
					struct ext4_free_extent *ex,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent *bex = &ac->ac_b_ex;
	struct ext4_free_extent *gex = &ac->ac_g_ex;

	BUG_ON(ex->fe_len <= 0);
	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

	ac->ac_found++;

	/*
	 * The special case - take what you catch first
	 */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * Let's check whether the chunk is good enough
	 */
	if (ex->fe_len == gex->fe_len) {
		*bex = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/*
	 * If this is the first found extent, just store it in the context
	 */
	if (bex->fe_len == 0) {
		*bex = *ex;
		return;
	}

	/*
	 * If the newly found extent is better, store it in the context
	 */
	if (bex->fe_len < gex->fe_len) {
		/* if the request isn't satisfied, any found extent
		 * larger than the previous best one is better */
		if (ex->fe_len > bex->fe_len)
			*bex = *ex;
	} else if (ex->fe_len > gex->fe_len) {
		/* if the request is satisfied, then we try to find
		 * an extent that still satisfies the request, but is
		 * smaller than the previous one */
		if (ex->fe_len < bex->fe_len)
			*bex = *ex;
	}

	ext4_mb_check_limits(ac, e4b, 0);
}

static noinline_for_stack
int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent ex = ac->ac_b_ex;
	ext4_group_t group = ex.fe_group;
	int max;
	int err;

	BUG_ON(ex.fe_len <= 0);
	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return err;

	ext4_lock_group(ac->ac_sb, group);
	max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);

	if (max > 0) {
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}

	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);

	return 0;
}

static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
				struct ext4_buddy *e4b)
{
	ext4_group_t group = ac->ac_g_ex.fe_group;
	int max;
	int err;
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_free_extent ex;

	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
		return 0;

	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return err;

	ext4_lock_group(ac->ac_sb, group);
	max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
			     ac->ac_g_ex.fe_len, &ex);

	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
		ext4_fsblk_t start;

		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
			ex.fe_start;
		/* use do_div to get remainder (would be 64-bit modulo) */
		if (do_div(start, sbi->s_stripe) == 0) {
			ac->ac_found++;
			ac->ac_b_ex = ex;
			ext4_mb_use_best_found(ac, e4b);
		}
	} else if (max >= ac->ac_g_ex.fe_len) {
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
		/* Sometimes, the caller may want to merge even a small
		 * number of blocks to an existing extent */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}
	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);

	return 0;
}

/*
 * The routine scans buddy structures (not bitmap!) from the given order
 * to the max order and tries to find a big enough chunk to satisfy the
 * request
 */
static noinline_for_stack
void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_group_info *grp = e4b->bd_info;
	void *buddy;
	int i;
	int k;
	int max;

	BUG_ON(ac->ac_2order <= 0);
	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
		if (grp->bb_counters[i] == 0)
			continue;

		buddy = mb_find_buddy(e4b, i, &max);
		BUG_ON(buddy == NULL);

		k = mb_find_next_zero_bit(buddy, max, 0);
		BUG_ON(k >= max);

		ac->ac_found++;

		ac->ac_b_ex.fe_len = 1 << i;
		ac->ac_b_ex.fe_start = k << i;
		ac->ac_b_ex.fe_group = e4b->bd_group;

		ext4_mb_use_best_found(ac, e4b);

		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);

		if (EXT4_SB(sb)->s_mb_stats)
			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);

		break;
	}
}

/*
 * The routine scans the group and measures all found extents.
 * In order to optimize scanning, the caller must pass the number of
 * free blocks in the group, so the routine can know the upper limit.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	void *bitmap = EXT4_MB_BITMAP(e4b);
	struct ext4_free_extent ex;
	int i;
	int free;

	free = e4b->bd_info->bb_free;
	BUG_ON(free <= 0);

	i = e4b->bd_info->bb_first_free;

	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
		i = mb_find_next_zero_bit(bitmap,
						EXT4_CLUSTERS_PER_GROUP(sb), i);
		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
			/*
			 * If we have a corrupt bitmap, we won't find any
			 * free blocks even though group info says we
			 * have free blocks
			 */
			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
					"%d free clusters as per "
					"group info. But bitmap says 0",
					free);
			break;
		}

		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
		BUG_ON(ex.fe_len <= 0);
		if (free < ex.fe_len) {
			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
					"%d free clusters as per "
					"group info. But got %d blocks",
					free, ex.fe_len);
			/*
			 * The number of free blocks differs. This mostly
			 * indicates that the bitmap is corrupt. So exit
			 * without claiming the space.
			 */
			break;
		}

		ext4_mb_measure_extent(ac, &ex, e4b);

		i += ex.fe_len;
		free -= ex.fe_len;
	}

	ext4_mb_check_limits(ac, e4b, 1);
}

/*
 * This is a special case for storage like raid5:
 * we try to find stripe-aligned chunks for stripe-size-multiple requests
 */
static noinline_for_stack
void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
				 struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	void *bitmap = EXT4_MB_BITMAP(e4b);
	struct ext4_free_extent ex;
	ext4_fsblk_t first_group_block;
	ext4_fsblk_t a;
	ext4_grpblk_t i;
	int max;

	BUG_ON(sbi->s_stripe == 0);

	/* find first stripe-aligned block in group */
	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);

	a = first_group_block + sbi->s_stripe - 1;
	do_div(a, sbi->s_stripe);
	i = (a * sbi->s_stripe) - first_group_block;

	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
		if (!mb_test_bit(i, bitmap)) {
			max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
			if (max >= sbi->s_stripe) {
				ac->ac_found++;
				ac->ac_b_ex = ex;
				ext4_mb_use_best_found(ac, e4b);
				break;
			}
		}
		i += sbi->s_stripe;
	}
}

/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
				ext4_group_t group, int cr)
{
	unsigned free, fragments;
	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

	BUG_ON(cr < 0 || cr >= 4);

	/* We only do this if the grp has never been initialized */
	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
		int ret = ext4_mb_init_group(ac->ac_sb, group);
		if (ret)
			return 0;
	}

	free = grp->bb_free;
	fragments = grp->bb_fragments;
	if (free == 0)
		return 0;
	if (fragments == 0)
		return 0;

	switch (cr) {
	case 0:
		BUG_ON(ac->ac_2order == 0);

		if (grp->bb_largest_free_order < ac->ac_2order)
			return 0;

		/* Avoid using the first bg of a flexgroup for data files */
		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
		    ((group % flex_size) == 0))
			return 0;

		return 1;
	case 1:
		if ((free / fragments) >= ac->ac_g_ex.fe_len)
			return 1;
		break;
	case 2:
		if (free >= ac->ac_g_ex.fe_len)
			return 1;
		break;
	case 3:
		return 1;
	default:
		BUG();
	}

	return 0;
}
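
/*
 * Note on the cr == 0 case above (illustrative): ac_2order is derived
 * in ext4_mb_regular_allocator() below and is non-zero only for
 * power-of-2 request lengths.  For example, assuming the request order
 * clears s_mb_order2_reqs, fe_len = 16 gives i = fls(16) = 5 and
 * 16 & ~(1 << 4) == 0, so ac_2order = 4; fe_len = 24 fails that test,
 * leaves ac_2order at 0, and the cr == 0 buddy-scan pass is skipped.
 */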
1985 /* 1986 * ac->ac_2order is set only if the fe_len is a power of 2; 1987 * if ac_2order is set we also set the criteria to 0 so that we 1988 * try exact allocation using the buddy. 1989 */ 1990 i = fls(ac->ac_g_ex.fe_len); 1991 ac->ac_2order = 0; 1992 /* 1993 * We search using buddy data only if the order of the request 1994 * is greater than or equal to sbi->s_mb_order2_reqs. 1995 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 1996 */ 1997 if (i >= sbi->s_mb_order2_reqs) { 1998 /* 1999 * This should tell if fe_len is exactly a power of 2 2000 */ 2001 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2002 ac->ac_2order = i - 1; 2003 } 2004 2005 /* if stream allocation is enabled, use the global goal */ 2006 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2007 /* TBD: may be a hot point */ 2008 spin_lock(&sbi->s_md_lock); 2009 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2010 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2011 spin_unlock(&sbi->s_md_lock); 2012 } 2013 2014 /* Let's just scan groups to find more or less suitable blocks */ 2015 cr = ac->ac_2order ? 0 : 1; 2016 /* 2017 * cr == 0 try to get exact allocation, 2018 * cr == 3 try to get anything 2019 */ 2020 repeat: 2021 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2022 ac->ac_criteria = cr; 2023 /* 2024 * searching for the right group starts 2025 * from the goal value specified 2026 */ 2027 group = ac->ac_g_ex.fe_group; 2028 2029 for (i = 0; i < ngroups; group++, i++) { 2030 if (group == ngroups) 2031 group = 0; 2032 2033 /* This now checks without needing the buddy page */ 2034 if (!ext4_mb_good_group(ac, group, cr)) 2035 continue; 2036 2037 err = ext4_mb_load_buddy(sb, group, &e4b); 2038 if (err) 2039 goto out; 2040 2041 ext4_lock_group(sb, group); 2042 2043 /* 2044 * We need to check again after locking the 2045 * block group 2046 */ 2047 if (!ext4_mb_good_group(ac, group, cr)) { 2048 ext4_unlock_group(sb, group); 2049 ext4_mb_unload_buddy(&e4b); 2050 continue; 2051 } 2052 2053 ac->ac_groups_scanned++; 2054 if (cr == 0) 2055 ext4_mb_simple_scan_group(ac, &e4b); 2056 else if (cr == 1 && sbi->s_stripe && 2057 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2058 ext4_mb_scan_aligned(ac, &e4b); 2059 else 2060 ext4_mb_complex_scan_group(ac, &e4b); 2061 2062 ext4_unlock_group(sb, group); 2063 ext4_mb_unload_buddy(&e4b); 2064 2065 if (ac->ac_status != AC_STATUS_CONTINUE) 2066 break; 2067 } 2068 } 2069 2070 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2071 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2072 /* 2073 * We've been searching too long. Let's try to allocate 2074 * the best chunk we've found so far 2075 */ 2076 2077 ext4_mb_try_best_found(ac, &e4b); 2078 if (ac->ac_status != AC_STATUS_FOUND) { 2079 /* 2080 * Someone more lucky has already allocated it.
2081 * The only thing we can do is just take first 2082 * found block(s) 2083 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 2084 */ 2085 ac->ac_b_ex.fe_group = 0; 2086 ac->ac_b_ex.fe_start = 0; 2087 ac->ac_b_ex.fe_len = 0; 2088 ac->ac_status = AC_STATUS_CONTINUE; 2089 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2090 cr = 3; 2091 atomic_inc(&sbi->s_mb_lost_chunks); 2092 goto repeat; 2093 } 2094 } 2095 out: 2096 return err; 2097 } 2098 2099 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2100 { 2101 struct super_block *sb = seq->private; 2102 ext4_group_t group; 2103 2104 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2105 return NULL; 2106 group = *pos + 1; 2107 return (void *) ((unsigned long) group); 2108 } 2109 2110 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2111 { 2112 struct super_block *sb = seq->private; 2113 ext4_group_t group; 2114 2115 ++*pos; 2116 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2117 return NULL; 2118 group = *pos + 1; 2119 return (void *) ((unsigned long) group); 2120 } 2121 2122 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2123 { 2124 struct super_block *sb = seq->private; 2125 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2126 int i; 2127 int err; 2128 struct ext4_buddy e4b; 2129 struct sg { 2130 struct ext4_group_info info; 2131 ext4_grpblk_t counters[16]; 2132 } sg; 2133 2134 group--; 2135 if (group == 0) 2136 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2137 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2138 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2139 "group", "free", "frags", "first", 2140 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2141 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2142 2143 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2144 sizeof(struct ext4_group_info); 2145 err = ext4_mb_load_buddy(sb, group, &e4b); 2146 if (err) { 2147 seq_printf(seq, "#%-5u: I/O error\n", group); 2148 return 0; 2149 } 2150 ext4_lock_group(sb, group); 2151 memcpy(&sg, ext4_get_group_info(sb, group), i); 2152 ext4_unlock_group(sb, group); 2153 ext4_mb_unload_buddy(&e4b); 2154 2155 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2156 sg.info.bb_fragments, sg.info.bb_first_free); 2157 for (i = 0; i <= 13; i++) 2158 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
2159 sg.info.bb_counters[i] : 0); 2160 seq_printf(seq, " ]\n"); 2161 2162 return 0; 2163 } 2164 2165 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2166 { 2167 } 2168 2169 static const struct seq_operations ext4_mb_seq_groups_ops = { 2170 .start = ext4_mb_seq_groups_start, 2171 .next = ext4_mb_seq_groups_next, 2172 .stop = ext4_mb_seq_groups_stop, 2173 .show = ext4_mb_seq_groups_show, 2174 }; 2175 2176 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2177 { 2178 struct super_block *sb = PDE(inode)->data; 2179 int rc; 2180 2181 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2182 if (rc == 0) { 2183 struct seq_file *m = file->private_data; 2184 m->private = sb; 2185 } 2186 return rc; 2187 2188 } 2189 2190 static const struct file_operations ext4_mb_seq_groups_fops = { 2191 .owner = THIS_MODULE, 2192 .open = ext4_mb_seq_groups_open, 2193 .read = seq_read, 2194 .llseek = seq_lseek, 2195 .release = seq_release, 2196 }; 2197 2198 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 2199 { 2200 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2201 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 2202 2203 BUG_ON(!cachep); 2204 return cachep; 2205 } 2206 2207 /* Create and initialize ext4_group_info data for the given group. */ 2208 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2209 struct ext4_group_desc *desc) 2210 { 2211 int i; 2212 int metalen = 0; 2213 struct ext4_sb_info *sbi = EXT4_SB(sb); 2214 struct ext4_group_info **meta_group_info; 2215 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2216 2217 /* 2218 * First check if this group is the first of a reserved block. 2219 * If it's true, we have to allocate a new table of pointers 2220 * to ext4_group_info structures 2221 */ 2222 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2223 metalen = sizeof(*meta_group_info) << 2224 EXT4_DESC_PER_BLOCK_BITS(sb); 2225 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2226 if (meta_group_info == NULL) { 2227 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " 2228 "for a buddy group"); 2229 goto exit_meta_group_info; 2230 } 2231 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2232 meta_group_info; 2233 } 2234 2235 meta_group_info = 2236 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2237 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2238 2239 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2240 if (meta_group_info[i] == NULL) { 2241 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); 2242 goto exit_group_info; 2243 } 2244 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2245 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2246 &(meta_group_info[i]->bb_state)); 2247 2248 /* 2249 * initialize bb_free to be able to skip 2250 * empty groups without initialization 2251 */ 2252 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2253 meta_group_info[i]->bb_free = 2254 ext4_free_clusters_after_init(sb, group, desc); 2255 } else { 2256 meta_group_info[i]->bb_free = 2257 ext4_free_group_clusters(sb, desc); 2258 } 2259 2260 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2261 init_rwsem(&meta_group_info[i]->alloc_sem); 2262 meta_group_info[i]->bb_free_root = RB_ROOT; 2263 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 2264 2265 #ifdef DOUBLE_CHECK 2266 { 2267 struct buffer_head *bh; 2268 meta_group_info[i]->bb_bitmap = 2269 kmalloc(sb->s_blocksize, GFP_KERNEL); 2270 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2271 bh = 
ext4_read_block_bitmap(sb, group); 2272 BUG_ON(bh == NULL); 2273 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2274 sb->s_blocksize); 2275 put_bh(bh); 2276 } 2277 #endif 2278 2279 return 0; 2280 2281 exit_group_info: 2282 /* If a meta_group_info table has been allocated, release it now */ 2283 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2284 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2285 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; 2286 } 2287 exit_meta_group_info: 2288 return -ENOMEM; 2289 } /* ext4_mb_add_groupinfo */ 2290 2291 static int ext4_mb_init_backend(struct super_block *sb) 2292 { 2293 ext4_group_t ngroups = ext4_get_groups_count(sb); 2294 ext4_group_t i; 2295 struct ext4_sb_info *sbi = EXT4_SB(sb); 2296 struct ext4_super_block *es = sbi->s_es; 2297 int num_meta_group_infos; 2298 int num_meta_group_infos_max; 2299 int array_size; 2300 struct ext4_group_desc *desc; 2301 struct kmem_cache *cachep; 2302 2303 /* This is the number of blocks used by GDT */ 2304 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2305 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2306 2307 /* 2308 * This is the total number of blocks used by GDT including 2309 * the number of reserved blocks for GDT. 2310 * The s_group_info array is allocated with this value 2311 * to allow a clean online resize without a complex 2312 * manipulation of pointers. 2313 * The drawback is the unused memory when no resize 2314 * occurs but it's very low in terms of pages 2315 * (see comments below) 2316 * Need to handle this properly when META_BG resizing is allowed 2317 */ 2318 num_meta_group_infos_max = num_meta_group_infos + 2319 le16_to_cpu(es->s_reserved_gdt_blocks); 2320 2321 /* 2322 * array_size is the size of s_group_info array. We round it 2323 * to the next power of two because this approximation is done 2324 * internally by kmalloc so we can have some more memory 2325 * for free here (e.g. may be used for META_BG resize). 2326 */ 2327 array_size = 1; 2328 while (array_size < sizeof(*sbi->s_group_info) * 2329 num_meta_group_infos_max) 2330 array_size = array_size << 1; 2331 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2332 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2333 * So a two level scheme suffices for now. */ 2334 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); 2335 if (sbi->s_group_info == NULL) { 2336 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 2337 return -ENOMEM; 2338 } 2339 sbi->s_buddy_cache = new_inode(sb); 2340 if (sbi->s_buddy_cache == NULL) { 2341 ext4_msg(sb, KERN_ERR, "can't get new inode"); 2342 goto err_freesgi; 2343 } 2344 /* To avoid potentially colliding with a valid on-disk inode number, 2345 * use EXT4_BAD_INO for the buddy cache inode number. This inode is 2346 * not in the inode hash, so it should never be found by iget(), but 2347 * this will avoid confusion if it ever shows up during debugging.
*/ 2348 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; 2349 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2350 for (i = 0; i < ngroups; i++) { 2351 desc = ext4_get_group_desc(sb, i, NULL); 2352 if (desc == NULL) { 2353 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); 2354 goto err_freebuddy; 2355 } 2356 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2357 goto err_freebuddy; 2358 } 2359 2360 return 0; 2361 2362 err_freebuddy: 2363 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2364 while (i-- > 0) 2365 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2366 i = num_meta_group_infos; 2367 while (i-- > 0) 2368 kfree(sbi->s_group_info[i]); 2369 iput(sbi->s_buddy_cache); 2370 err_freesgi: 2371 ext4_kvfree(sbi->s_group_info); 2372 return -ENOMEM; 2373 } 2374 2375 static void ext4_groupinfo_destroy_slabs(void) 2376 { 2377 int i; 2378 2379 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 2380 if (ext4_groupinfo_caches[i]) 2381 kmem_cache_destroy(ext4_groupinfo_caches[i]); 2382 ext4_groupinfo_caches[i] = NULL; 2383 } 2384 } 2385 2386 static int ext4_groupinfo_create_slab(size_t size) 2387 { 2388 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 2389 int slab_size; 2390 int blocksize_bits = order_base_2(size); 2391 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2392 struct kmem_cache *cachep; 2393 2394 if (cache_index >= NR_GRPINFO_CACHES) 2395 return -EINVAL; 2396 2397 if (unlikely(cache_index < 0)) 2398 cache_index = 0; 2399 2400 mutex_lock(&ext4_grpinfo_slab_create_mutex); 2401 if (ext4_groupinfo_caches[cache_index]) { 2402 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2403 return 0; /* Already created */ 2404 } 2405 2406 slab_size = offsetof(struct ext4_group_info, 2407 bb_counters[blocksize_bits + 2]); 2408 2409 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 2410 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2411 NULL); 2412 2413 ext4_groupinfo_caches[cache_index] = cachep; 2414 2415 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2416 if (!cachep) { 2417 printk(KERN_EMERG 2418 "EXT4-fs: no memory for groupinfo slab cache\n"); 2419 return -ENOMEM; 2420 } 2421 2422 return 0; 2423 } 2424 2425 int ext4_mb_init(struct super_block *sb, int needs_recovery) 2426 { 2427 struct ext4_sb_info *sbi = EXT4_SB(sb); 2428 unsigned i, j; 2429 unsigned offset; 2430 unsigned max; 2431 int ret; 2432 2433 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2434 2435 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2436 if (sbi->s_mb_offsets == NULL) { 2437 ret = -ENOMEM; 2438 goto out; 2439 } 2440 2441 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2442 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2443 if (sbi->s_mb_maxs == NULL) { 2444 ret = -ENOMEM; 2445 goto out; 2446 } 2447 2448 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 2449 if (ret < 0) 2450 goto out; 2451 2452 /* order 0 is regular bitmap */ 2453 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2454 sbi->s_mb_offsets[0] = 0; 2455 2456 i = 1; 2457 offset = 0; 2458 max = sb->s_blocksize << 2; 2459 do { 2460 sbi->s_mb_offsets[i] = offset; 2461 sbi->s_mb_maxs[i] = max; 2462 offset += 1 << (sb->s_blocksize_bits - i); 2463 max = max >> 1; 2464 i++; 2465 } while (i <= sb->s_blocksize_bits + 1); 2466 2467 spin_lock_init(&sbi->s_md_lock); 2468 spin_lock_init(&sbi->s_bal_lock); 2469 2470 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2471 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2472 sbi->s_mb_stats = MB_DEFAULT_STATS; 2473 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2474 sbi->s_mb_order2_reqs = 
MB_DEFAULT_ORDER2_REQS; 2475 /* 2476 * The default group preallocation is 512, which for 4k block 2477 * sizes translates to 2 megabytes. However for bigalloc file 2478 * systems, this is probably too big (i.e., if the cluster size 2479 * is 1 megabyte, then group preallocation size becomes half a 2480 * gigabyte!). As a default, we will keep a two megabyte 2481 * group prealloc size for cluster sizes up to 64k, and after 2482 * that, we will force a minimum group preallocation size of 2483 * 32 clusters. This translates to 8 megs when the cluster 2484 * size is 256k, and 32 megs when the cluster size is 1 meg, 2485 * which seems reasonable as a default. 2486 */ 2487 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> 2488 sbi->s_cluster_bits, 32); 2489 /* 2490 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 2491 * to the lowest multiple of s_stripe which is bigger than 2492 * the s_mb_group_prealloc as determined above. We want 2493 * the preallocation size to be an exact multiple of the 2494 * RAID stripe size so that preallocations don't fragment 2495 * the stripes. 2496 */ 2497 if (sbi->s_stripe > 1) { 2498 sbi->s_mb_group_prealloc = roundup( 2499 sbi->s_mb_group_prealloc, sbi->s_stripe); 2500 } 2501 2502 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2503 if (sbi->s_locality_groups == NULL) { 2504 ret = -ENOMEM; 2505 goto out_free_groupinfo_slab; 2506 } 2507 for_each_possible_cpu(i) { 2508 struct ext4_locality_group *lg; 2509 lg = per_cpu_ptr(sbi->s_locality_groups, i); 2510 mutex_init(&lg->lg_mutex); 2511 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2512 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2513 spin_lock_init(&lg->lg_prealloc_lock); 2514 } 2515 2516 /* init file for buddy data */ 2517 ret = ext4_mb_init_backend(sb); 2518 if (ret != 0) 2519 goto out_free_locality_groups; 2520 2521 if (sbi->s_proc) 2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2523 &ext4_mb_seq_groups_fops, sb); 2524 2525 if (sbi->s_journal) 2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2527 2528 return 0; 2529 2530 out_free_locality_groups: 2531 free_percpu(sbi->s_locality_groups); 2532 sbi->s_locality_groups = NULL; 2533 out_free_groupinfo_slab: 2534 ext4_groupinfo_destroy_slabs(); 2535 out: 2536 kfree(sbi->s_mb_offsets); 2537 sbi->s_mb_offsets = NULL; 2538 kfree(sbi->s_mb_maxs); 2539 sbi->s_mb_maxs = NULL; 2540 return ret; 2541 } 2542 2543 /* needs to be called with the ext4 group lock held */ 2544 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2545 { 2546 struct ext4_prealloc_space *pa; 2547 struct list_head *cur, *tmp; 2548 int count = 0; 2549 2550 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2551 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2552 list_del(&pa->pa_group_list); 2553 count++; 2554 kmem_cache_free(ext4_pspace_cachep, pa); 2555 } 2556 if (count) 2557 mb_debug(1, "mballoc: %u PAs left\n", count); 2558 2559 } 2560 2561 int ext4_mb_release(struct super_block *sb) 2562 { 2563 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t i; 2565 int num_meta_group_infos; 2566 struct ext4_group_info *grinfo; 2567 struct ext4_sb_info *sbi = EXT4_SB(sb); 2568 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2569 2570 if (sbi->s_group_info) { 2571 for (i = 0; i < ngroups; i++) { 2572 grinfo = ext4_get_group_info(sb, i); 2573 #ifdef DOUBLE_CHECK 2574 kfree(grinfo->bb_bitmap); 2575 #endif 2576 ext4_lock_group(sb, i); 2577 ext4_mb_cleanup_pa(grinfo); 2578
ext4_unlock_group(sb, i); 2579 kmem_cache_free(cachep, grinfo); 2580 } 2581 num_meta_group_infos = (ngroups + 2582 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2583 EXT4_DESC_PER_BLOCK_BITS(sb); 2584 for (i = 0; i < num_meta_group_infos; i++) 2585 kfree(sbi->s_group_info[i]); 2586 ext4_kvfree(sbi->s_group_info); 2587 } 2588 kfree(sbi->s_mb_offsets); 2589 kfree(sbi->s_mb_maxs); 2590 if (sbi->s_buddy_cache) 2591 iput(sbi->s_buddy_cache); 2592 if (sbi->s_mb_stats) { 2593 ext4_msg(sb, KERN_INFO, 2594 "mballoc: %u blocks %u reqs (%u success)", 2595 atomic_read(&sbi->s_bal_allocated), 2596 atomic_read(&sbi->s_bal_reqs), 2597 atomic_read(&sbi->s_bal_success)); 2598 ext4_msg(sb, KERN_INFO, 2599 "mballoc: %u extents scanned, %u goal hits, " 2600 "%u 2^N hits, %u breaks, %u lost", 2601 atomic_read(&sbi->s_bal_ex_scanned), 2602 atomic_read(&sbi->s_bal_goals), 2603 atomic_read(&sbi->s_bal_2orders), 2604 atomic_read(&sbi->s_bal_breaks), 2605 atomic_read(&sbi->s_mb_lost_chunks)); 2606 ext4_msg(sb, KERN_INFO, 2607 "mballoc: %lu generated and it took %Lu", 2608 sbi->s_mb_buddies_generated, 2609 sbi->s_mb_generation_time); 2610 ext4_msg(sb, KERN_INFO, 2611 "mballoc: %u preallocated, %u discarded", 2612 atomic_read(&sbi->s_mb_preallocated), 2613 atomic_read(&sbi->s_mb_discarded)); 2614 } 2615 2616 free_percpu(sbi->s_locality_groups); 2617 if (sbi->s_proc) 2618 remove_proc_entry("mb_groups", sbi->s_proc); 2619 2620 return 0; 2621 } 2622 2623 static inline int ext4_issue_discard(struct super_block *sb, 2624 ext4_group_t block_group, ext4_grpblk_t cluster, int count) 2625 { 2626 ext4_fsblk_t discard_block; 2627 2628 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + 2629 ext4_group_first_block_no(sb, block_group)); 2630 count = EXT4_C2B(EXT4_SB(sb), count); 2631 trace_ext4_discard_blocks(sb, 2632 (unsigned long long) discard_block, count); 2633 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2634 } 2635 2636 /* 2637 * This function is called by the jbd2 layer once the commit has finished, 2638 * so we know we can free the blocks that were released with that commit. 2639 */ 2640 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2641 { 2642 struct super_block *sb = journal->j_private; 2643 struct ext4_buddy e4b; 2644 struct ext4_group_info *db; 2645 int err, count = 0, count2 = 0; 2646 struct ext4_free_data *entry; 2647 struct list_head *l, *ltmp; 2648 2649 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2650 entry = list_entry(l, struct ext4_free_data, list); 2651 2652 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2653 entry->count, entry->group, entry); 2654 2655 if (test_opt(sb, DISCARD)) 2656 ext4_issue_discard(sb, entry->group, 2657 entry->start_cluster, entry->count); 2658 2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2660 /* we expect to find existing buddy because it's pinned */ 2661 BUG_ON(err != 0); 2662 2663 db = e4b.bd_info; 2664 /* there are blocks to put in buddy to make them really free */ 2665 count += entry->count; 2666 count2++; 2667 ext4_lock_group(sb, entry->group); 2668 /* Take it out of per group rb tree */ 2669 rb_erase(&entry->node, &(db->bb_free_root)); 2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); 2671 2672 /* 2673 * Clear the trimmed flag for the group so that the next 2674 * ext4_trim_fs can trim it. 2675 * If the volume is mounted with -o discard, online discard 2676 * is supported and the free blocks will be trimmed online. 
2677 */ 2678 if (!test_opt(sb, DISCARD)) 2679 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2680 2681 if (!db->bb_free_root.rb_node) { 2682 /* No more items in the per group rb tree 2683 * balance refcounts from ext4_mb_free_metadata() 2684 */ 2685 page_cache_release(e4b.bd_buddy_page); 2686 page_cache_release(e4b.bd_bitmap_page); 2687 } 2688 ext4_unlock_group(sb, entry->group); 2689 kmem_cache_free(ext4_free_ext_cachep, entry); 2690 ext4_mb_unload_buddy(&e4b); 2691 } 2692 2693 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2694 } 2695 2696 #ifdef CONFIG_EXT4_DEBUG 2697 u8 mb_enable_debug __read_mostly; 2698 2699 static struct dentry *debugfs_dir; 2700 static struct dentry *debugfs_debug; 2701 2702 static void __init ext4_create_debugfs_entry(void) 2703 { 2704 debugfs_dir = debugfs_create_dir("ext4", NULL); 2705 if (debugfs_dir) 2706 debugfs_debug = debugfs_create_u8("mballoc-debug", 2707 S_IRUGO | S_IWUSR, 2708 debugfs_dir, 2709 &mb_enable_debug); 2710 } 2711 2712 static void ext4_remove_debugfs_entry(void) 2713 { 2714 debugfs_remove(debugfs_debug); 2715 debugfs_remove(debugfs_dir); 2716 } 2717 2718 #else 2719 2720 static void __init ext4_create_debugfs_entry(void) 2721 { 2722 } 2723 2724 static void ext4_remove_debugfs_entry(void) 2725 { 2726 } 2727 2728 #endif 2729 2730 int __init ext4_init_mballoc(void) 2731 { 2732 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 2733 SLAB_RECLAIM_ACCOUNT); 2734 if (ext4_pspace_cachep == NULL) 2735 return -ENOMEM; 2736 2737 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 2738 SLAB_RECLAIM_ACCOUNT); 2739 if (ext4_ac_cachep == NULL) { 2740 kmem_cache_destroy(ext4_pspace_cachep); 2741 return -ENOMEM; 2742 } 2743 2744 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2745 SLAB_RECLAIM_ACCOUNT); 2746 if (ext4_free_ext_cachep == NULL) { 2747 kmem_cache_destroy(ext4_pspace_cachep); 2748 kmem_cache_destroy(ext4_ac_cachep); 2749 return -ENOMEM; 2750 } 2751 ext4_create_debugfs_entry(); 2752 return 0; 2753 } 2754 2755 void ext4_exit_mballoc(void) 2756 { 2757 /* 2758 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2759 * before destroying the slab cache. 
2760 */ 2761 rcu_barrier(); 2762 kmem_cache_destroy(ext4_pspace_cachep); 2763 kmem_cache_destroy(ext4_ac_cachep); 2764 kmem_cache_destroy(ext4_free_ext_cachep); 2765 ext4_groupinfo_destroy_slabs(); 2766 ext4_remove_debugfs_entry(); 2767 } 2768 2769 2770 /* 2771 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 2772 * Returns 0 if success or error code 2773 */ 2774 static noinline_for_stack int 2775 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2776 handle_t *handle, unsigned int reserv_clstrs) 2777 { 2778 struct buffer_head *bitmap_bh = NULL; 2779 struct ext4_group_desc *gdp; 2780 struct buffer_head *gdp_bh; 2781 struct ext4_sb_info *sbi; 2782 struct super_block *sb; 2783 ext4_fsblk_t block; 2784 int err, len; 2785 2786 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 2787 BUG_ON(ac->ac_b_ex.fe_len <= 0); 2788 2789 sb = ac->ac_sb; 2790 sbi = EXT4_SB(sb); 2791 2792 err = -EIO; 2793 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2794 if (!bitmap_bh) 2795 goto out_err; 2796 2797 err = ext4_journal_get_write_access(handle, bitmap_bh); 2798 if (err) 2799 goto out_err; 2800 2801 err = -EIO; 2802 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 2803 if (!gdp) 2804 goto out_err; 2805 2806 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2807 ext4_free_group_clusters(sb, gdp)); 2808 2809 err = ext4_journal_get_write_access(handle, gdp_bh); 2810 if (err) 2811 goto out_err; 2812 2813 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2814 2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2816 if (!ext4_data_block_valid(sbi, block, len)) { 2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2818 "fs metadata\n", block, block+len); 2819 /* File system mounted not to panic on error 2820 * Fix the bitmap and repeat the block allocation 2821 * We leak some of the blocks here. 2822 */ 2823 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2824 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2825 ac->ac_b_ex.fe_len); 2826 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2827 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2828 if (!err) 2829 err = -EAGAIN; 2830 goto out_err; 2831 } 2832 2833 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2834 #ifdef AGGRESSIVE_CHECK 2835 { 2836 int i; 2837 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 2838 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 2839 bitmap_bh->b_data)); 2840 } 2841 } 2842 #endif 2843 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2844 ac->ac_b_ex.fe_len); 2845 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2846 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2847 ext4_free_group_clusters_set(sb, gdp, 2848 ext4_free_clusters_after_init(sb, 2849 ac->ac_b_ex.fe_group, gdp)); 2850 } 2851 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 2852 ext4_free_group_clusters_set(sb, gdp, len); 2853 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2854 2855 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2856 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 2857 /* 2858 * Now reduce the dirty block count also. 
Should not go negative. 2859 */ 2860 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2861 /* release all the reserved blocks if non delalloc */ 2862 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 2863 reserv_clstrs); 2864 2865 if (sbi->s_log_groups_per_flex) { 2866 ext4_group_t flex_group = ext4_flex_group(sbi, 2867 ac->ac_b_ex.fe_group); 2868 atomic_sub(ac->ac_b_ex.fe_len, 2869 &sbi->s_flex_groups[flex_group].free_clusters); 2870 } 2871 2872 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2873 if (err) 2874 goto out_err; 2875 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2876 2877 out_err: 2878 ext4_mark_super_dirty(sb); 2879 brelse(bitmap_bh); 2880 return err; 2881 } 2882 2883 /* 2884 * here we normalize the request for the locality group 2885 * Group requests are normalized to s_mb_group_prealloc, which is rounded 2886 * up to a multiple of s_stripe if a stripe size is set via the mount option. 2887 * s_mb_group_prealloc can be configured via 2888 * /sys/fs/ext4/<partition>/mb_group_prealloc 2889 * 2890 * XXX: should we try to preallocate more than the group has now? 2891 */ 2892 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 2893 { 2894 struct super_block *sb = ac->ac_sb; 2895 struct ext4_locality_group *lg = ac->ac_lg; 2896 2897 BUG_ON(lg == NULL); 2898 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 2899 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2900 current->pid, ac->ac_g_ex.fe_len); 2901 } 2902 2903 /* 2904 * Normalization means making the request better in terms of 2905 * size and alignment 2906 */ 2907 static noinline_for_stack void 2908 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2909 struct ext4_allocation_request *ar) 2910 { 2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2912 int bsbits, max; 2913 ext4_lblk_t end; 2914 loff_t size, orig_size, start_off; 2915 ext4_lblk_t start; 2916 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2917 struct ext4_prealloc_space *pa; 2918 2919 /* only normalize data requests; metadata requests 2920 do not need preallocation */ 2921 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 2922 return; 2923 2924 /* sometimes the caller may want exact blocks */ 2925 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2926 return; 2927 2928 /* the caller may indicate that preallocation isn't 2929 * required (it's a tail, for example) */ 2930 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 2931 return; 2932 2933 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 2934 ext4_mb_normalize_group_request(ac); 2935 return ; 2936 } 2937 2938 bsbits = ac->ac_sb->s_blocksize_bits; 2939 2940 /* first, let's learn the actual file size 2941 * given the current request is allocated */ 2942 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 2943 size = size << bsbits; 2944 if (size < i_size_read(ac->ac_inode)) 2945 size = i_size_read(ac->ac_inode); 2946 orig_size = size; 2947 2948 /* max size of free chunks */ 2949 max = 2 << bsbits; 2950 2951 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 2952 (req <= (size) || max <= (chunk_size)) 2953 2954 /* first, try to predict filesize */ 2955 /* XXX: should this table be tunable?
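 * (Worked example with assumed numbers: with 4k blocks, i.e. bsbits == 12, a predicted size of 100k lands in the 128k bucket below, so the goal is simply padded to 128k; a 3MB prediction lands in the 2MB bucket, where size becomes 2MB and start_off aligns the goal to a 2MB boundary of the logical offset.)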
*/ 2956 start_off = 0; 2957 if (size <= 16 * 1024) { 2958 size = 16 * 1024; 2959 } else if (size <= 32 * 1024) { 2960 size = 32 * 1024; 2961 } else if (size <= 64 * 1024) { 2962 size = 64 * 1024; 2963 } else if (size <= 128 * 1024) { 2964 size = 128 * 1024; 2965 } else if (size <= 256 * 1024) { 2966 size = 256 * 1024; 2967 } else if (size <= 512 * 1024) { 2968 size = 512 * 1024; 2969 } else if (size <= 1024 * 1024) { 2970 size = 1024 * 1024; 2971 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 2972 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2973 (21 - bsbits)) << 21; 2974 size = 2 * 1024 * 1024; 2975 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 2976 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2977 (22 - bsbits)) << 22; 2978 size = 4 * 1024 * 1024; 2979 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 2980 (8<<20)>>bsbits, max, 8 * 1024)) { 2981 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2982 (23 - bsbits)) << 23; 2983 size = 8 * 1024 * 1024; 2984 } else { 2985 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 2986 size = ac->ac_o_ex.fe_len << bsbits; 2987 } 2988 size = size >> bsbits; 2989 start = start_off >> bsbits; 2990 2991 /* don't cover already allocated blocks in selected range */ 2992 if (ar->pleft && start <= ar->lleft) { 2993 size -= ar->lleft + 1 - start; 2994 start = ar->lleft + 1; 2995 } 2996 if (ar->pright && start + size - 1 >= ar->lright) 2997 size -= start + size - ar->lright; 2998 2999 end = start + size; 3000 3001 /* check we don't cross already preallocated blocks */ 3002 rcu_read_lock(); 3003 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3004 ext4_lblk_t pa_end; 3005 3006 if (pa->pa_deleted) 3007 continue; 3008 spin_lock(&pa->pa_lock); 3009 if (pa->pa_deleted) { 3010 spin_unlock(&pa->pa_lock); 3011 continue; 3012 } 3013 3014 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 3015 pa->pa_len); 3016 3017 /* PA must not overlap original request */ 3018 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3019 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3020 3021 /* skip PAs this normalized request doesn't overlap with */ 3022 if (pa->pa_lstart >= end || pa_end <= start) { 3023 spin_unlock(&pa->pa_lock); 3024 continue; 3025 } 3026 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3027 3028 /* adjust start or end to be adjacent to this pa */ 3029 if (pa_end <= ac->ac_o_ex.fe_logical) { 3030 BUG_ON(pa_end < start); 3031 start = pa_end; 3032 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3033 BUG_ON(pa->pa_lstart > end); 3034 end = pa->pa_lstart; 3035 } 3036 spin_unlock(&pa->pa_lock); 3037 } 3038 rcu_read_unlock(); 3039 size = end - start; 3040 3041 /* XXX: extra loop to check we really don't overlap preallocations */ 3042 rcu_read_lock(); 3043 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3044 ext4_lblk_t pa_end; 3045 3046 spin_lock(&pa->pa_lock); 3047 if (pa->pa_deleted == 0) { 3048 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 3049 pa->pa_len); 3050 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3051 } 3052 spin_unlock(&pa->pa_lock); 3053 } 3054 rcu_read_unlock(); 3055 3056 if (start + size <= ac->ac_o_ex.fe_logical && 3057 start > ac->ac_o_ex.fe_logical) { 3058 ext4_msg(ac->ac_sb, KERN_ERR, 3059 "start %lu, size %lu, fe_logical %lu", 3060 (unsigned long) start, (unsigned long) size, 3061 (unsigned long) ac->ac_o_ex.fe_logical); 3062 } 3063 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3064 start > ac->ac_o_ex.fe_logical); 3065 BUG_ON(size <= 0 || size > 
EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 3066 3067 /* now prepare goal request */ 3068 3069 /* XXX: is it better to align blocks WRT to logical 3070 * placement or satisfy big request as is */ 3071 ac->ac_g_ex.fe_logical = start; 3072 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 3073 3074 /* define goal start in order to merge */ 3075 if (ar->pright && (ar->lright == (start + size))) { 3076 /* merge to the right */ 3077 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3078 &ac->ac_f_ex.fe_group, 3079 &ac->ac_f_ex.fe_start); 3080 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3081 } 3082 if (ar->pleft && (ar->lleft + 1 == start)) { 3083 /* merge to the left */ 3084 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3085 &ac->ac_f_ex.fe_group, 3086 &ac->ac_f_ex.fe_start); 3087 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3088 } 3089 3090 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, 3091 (unsigned) orig_size, (unsigned) start); 3092 } 3093 3094 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3095 { 3096 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3097 3098 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3099 atomic_inc(&sbi->s_bal_reqs); 3100 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3101 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 3102 atomic_inc(&sbi->s_bal_success); 3103 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3104 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3105 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3106 atomic_inc(&sbi->s_bal_goals); 3107 if (ac->ac_found > sbi->s_mb_max_to_scan) 3108 atomic_inc(&sbi->s_bal_breaks); 3109 } 3110 3111 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 3112 trace_ext4_mballoc_alloc(ac); 3113 else 3114 trace_ext4_mballoc_prealloc(ac); 3115 } 3116 3117 /* 3118 * Called on failure; free up any blocks from the inode PA for this 3119 * context. We don't need this for MB_GROUP_PA because we only change 3120 * pa_free in ext4_mb_release_context(), but on failure, we've already 3121 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
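 * (e.g. if ext4_mb_use_inode_pa() took 8 clusters off pa_free and the
 * allocation then fails, the increment below gives them back, keeping
 * pa_free consistent with the bitmap that was never marked.)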
3122 */ 3123 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3124 { 3125 struct ext4_prealloc_space *pa = ac->ac_pa; 3126 int len; 3127 3128 if (pa && pa->pa_type == MB_INODE_PA) { 3129 len = ac->ac_b_ex.fe_len; 3130 pa->pa_free += len; 3131 } 3132 3133 } 3134 3135 /* 3136 * use blocks preallocated to inode 3137 */ 3138 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3139 struct ext4_prealloc_space *pa) 3140 { 3141 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3142 ext4_fsblk_t start; 3143 ext4_fsblk_t end; 3144 int len; 3145 3146 /* found preallocated blocks, use them */ 3147 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3148 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), 3149 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); 3150 len = EXT4_NUM_B2C(sbi, end - start); 3151 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3152 &ac->ac_b_ex.fe_start); 3153 ac->ac_b_ex.fe_len = len; 3154 ac->ac_status = AC_STATUS_FOUND; 3155 ac->ac_pa = pa; 3156 3157 BUG_ON(start < pa->pa_pstart); 3158 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 3159 BUG_ON(pa->pa_free < len); 3160 pa->pa_free -= len; 3161 3162 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); 3163 } 3164 3165 /* 3166 * use blocks preallocated to locality group 3167 */ 3168 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3169 struct ext4_prealloc_space *pa) 3170 { 3171 unsigned int len = ac->ac_o_ex.fe_len; 3172 3173 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3174 &ac->ac_b_ex.fe_group, 3175 &ac->ac_b_ex.fe_start); 3176 ac->ac_b_ex.fe_len = len; 3177 ac->ac_status = AC_STATUS_FOUND; 3178 ac->ac_pa = pa; 3179 3180 /* we don't correct pa_pstart or pa_plen here to avoid 3181 * a possible race when the group is being loaded concurrently; 3182 * instead we correct pa later, after blocks are marked 3183 * in the on-disk bitmap -- see ext4_mb_release_context(). 3184 * Other CPUs are prevented from allocating from this pa by lg_mutex 3185 */ 3186 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3187 } 3188 3189 /* 3190 * Return the prealloc space that has the minimal distance 3191 * from the goal block. @cpa is the prealloc 3192 * space with the currently known minimal distance 3193 * from the goal block.
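 * (e.g. for a goal block of 1000, a @cpa at pa_pstart 900 has distance
 * 100 while a candidate @pa at 1200 has distance 200, so the function
 * below returns @cpa unchanged; only a strictly closer candidate takes
 * over the reference.)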
3194 */ 3195 static struct ext4_prealloc_space * 3196 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 3197 struct ext4_prealloc_space *pa, 3198 struct ext4_prealloc_space *cpa) 3199 { 3200 ext4_fsblk_t cur_distance, new_distance; 3201 3202 if (cpa == NULL) { 3203 atomic_inc(&pa->pa_count); 3204 return pa; 3205 } 3206 cur_distance = abs(goal_block - cpa->pa_pstart); 3207 new_distance = abs(goal_block - pa->pa_pstart); 3208 3209 if (cur_distance <= new_distance) 3210 return cpa; 3211 3212 /* drop the previous reference */ 3213 atomic_dec(&cpa->pa_count); 3214 atomic_inc(&pa->pa_count); 3215 return pa; 3216 } 3217 3218 /* 3219 * search goal blocks in preallocated space 3220 */ 3221 static noinline_for_stack int 3222 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3223 { 3224 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3225 int order, i; 3226 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3227 struct ext4_locality_group *lg; 3228 struct ext4_prealloc_space *pa, *cpa = NULL; 3229 ext4_fsblk_t goal_block; 3230 3231 /* only data can be preallocated */ 3232 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3233 return 0; 3234 3235 /* first, try per-file preallocation */ 3236 rcu_read_lock(); 3237 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3238 3239 /* all fields in this condition don't change, 3240 * so we can skip locking for them */ 3241 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3242 ac->ac_o_ex.fe_logical >= (pa->pa_lstart + 3243 EXT4_C2B(sbi, pa->pa_len))) 3244 continue; 3245 3246 /* non-extent files can't have physical blocks past 2^32 */ 3247 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3248 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > 3249 EXT4_MAX_BLOCK_FILE_PHYS)) 3250 continue; 3251 3252 /* found preallocated blocks, use them */ 3253 spin_lock(&pa->pa_lock); 3254 if (pa->pa_deleted == 0 && pa->pa_free) { 3255 atomic_inc(&pa->pa_count); 3256 ext4_mb_use_inode_pa(ac, pa); 3257 spin_unlock(&pa->pa_lock); 3258 ac->ac_criteria = 10; 3259 rcu_read_unlock(); 3260 return 1; 3261 } 3262 spin_unlock(&pa->pa_lock); 3263 } 3264 rcu_read_unlock(); 3265 3266 /* can we use group allocation? */ 3267 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3268 return 0; 3269 3270 /* inode may have no locality group for some reason */ 3271 lg = ac->ac_lg; 3272 if (lg == NULL) 3273 return 0; 3274 order = fls(ac->ac_o_ex.fe_len) - 1; 3275 if (order > PREALLOC_TB_SIZE - 1) 3276 /* The max size of hash table is PREALLOC_TB_SIZE */ 3277 order = PREALLOC_TB_SIZE - 1; 3278 3279 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 3280 /* 3281 * search for the prealloc space with the 3282 * minimal distance from the goal block. 3283 */ 3284 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3285 rcu_read_lock(); 3286 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3287 pa_inode_list) { 3288 spin_lock(&pa->pa_lock); 3289 if (pa->pa_deleted == 0 && 3290 pa->pa_free >= ac->ac_o_ex.fe_len) { 3291 3292 cpa = ext4_mb_check_group_pa(goal_block, 3293 pa, cpa); 3294 } 3295 spin_unlock(&pa->pa_lock); 3296 } 3297 rcu_read_unlock(); 3298 } 3299 if (cpa) { 3300 ext4_mb_use_group_pa(ac, cpa); 3301 ac->ac_criteria = 20; 3302 return 1; 3303 } 3304 return 0; 3305 } 3306 3307 /* 3308 * the function goes through all blocks freed in the group 3309 * but not yet committed and marks them used in the in-core bitmap.
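 * (e.g. a pending free entry {start_cluster = 100, count = 8} sets bits
 * 100..107 in the in-core bitmap, even though the on-disk bitmap still
 * shows them free until the transaction commits.)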
3310 * The buddy must be generated from this bitmap. 3311 * Needs to be called with the ext4 group lock held 3312 */ 3313 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3314 ext4_group_t group) 3315 { 3316 struct rb_node *n; 3317 struct ext4_group_info *grp; 3318 struct ext4_free_data *entry; 3319 3320 grp = ext4_get_group_info(sb, group); 3321 n = rb_first(&(grp->bb_free_root)); 3322 3323 while (n) { 3324 entry = rb_entry(n, struct ext4_free_data, node); 3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count); 3326 n = rb_next(n); 3327 } 3328 return; 3329 } 3330 3331 /* 3332 * the function goes through all preallocations in this group and marks them 3333 * used in the in-core bitmap. The buddy must be generated from this bitmap. 3334 * Needs to be called with the ext4 group lock held 3335 */ 3336 static noinline_for_stack 3337 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3338 ext4_group_t group) 3339 { 3340 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3341 struct ext4_prealloc_space *pa; 3342 struct list_head *cur; 3343 ext4_group_t groupnr; 3344 ext4_grpblk_t start; 3345 int preallocated = 0; 3346 int len; 3347 3348 /* all forms of preallocation discard first load the group, 3349 * so the only competing code is preallocation use. 3350 * We don't need any locking here. 3351 * Note that we do NOT ignore preallocations with pa_deleted set; 3352 * otherwise we could leave used blocks available for 3353 * allocation in the buddy when a concurrent ext4_mb_put_pa() 3354 * is dropping the preallocation 3355 */ 3356 list_for_each(cur, &grp->bb_prealloc_list) { 3357 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3358 spin_lock(&pa->pa_lock); 3359 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3360 &groupnr, &start); 3361 len = pa->pa_len; 3362 spin_unlock(&pa->pa_lock); 3363 if (unlikely(len == 0)) 3364 continue; 3365 BUG_ON(groupnr != group); 3366 ext4_set_bits(bitmap, start, len); 3367 preallocated += len; 3368 } 3369 mb_debug(1, "preallocated %u for group %u\n", preallocated, group); 3370 } 3371 3372 static void ext4_mb_pa_callback(struct rcu_head *head) 3373 { 3374 struct ext4_prealloc_space *pa; 3375 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3376 kmem_cache_free(ext4_pspace_cachep, pa); 3377 } 3378 3379 /* 3380 * drops a reference to the preallocated space descriptor 3381 * if this was the last reference and the space is consumed 3382 */ 3383 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3384 struct super_block *sb, struct ext4_prealloc_space *pa) 3385 { 3386 ext4_group_t grp; 3387 ext4_fsblk_t grp_blk; 3388 3389 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3390 return; 3391 3392 /* in this short window concurrent discard can set pa_deleted */ 3393 spin_lock(&pa->pa_lock); 3394 if (pa->pa_deleted == 1) { 3395 spin_unlock(&pa->pa_lock); 3396 return; 3397 } 3398 3399 pa->pa_deleted = 1; 3400 spin_unlock(&pa->pa_lock); 3401 3402 grp_blk = pa->pa_pstart; 3403 /* 3404 * If doing group-based preallocation, pa_pstart may be in the 3405 * next group when pa is used up 3406 */ 3407 if (pa->pa_type == MB_GROUP_PA) 3408 grp_blk--; 3409 3410 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3411 3412 /* 3413 * possible race: 3414 * 3415 * P1 (buddy init) P2 (regular allocation) 3416 * find block B in PA 3417 * copy on-disk bitmap to buddy 3418 * mark B in on-disk bitmap 3419 * drop PA from group 3420 * mark all PAs in buddy 3421 * 3422 * thus, P1 initializes buddy with B available.
To prevent this, 3423 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3424 * against that pair 3425 */ 3426 ext4_lock_group(sb, grp); 3427 list_del(&pa->pa_group_list); 3428 ext4_unlock_group(sb, grp); 3429 3430 spin_lock(pa->pa_obj_lock); 3431 list_del_rcu(&pa->pa_inode_list); 3432 spin_unlock(pa->pa_obj_lock); 3433 3434 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3435 } 3436 3437 /* 3438 * creates new preallocated space for given inode 3439 */ 3440 static noinline_for_stack int 3441 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3442 { 3443 struct super_block *sb = ac->ac_sb; 3444 struct ext4_sb_info *sbi = EXT4_SB(sb); 3445 struct ext4_prealloc_space *pa; 3446 struct ext4_group_info *grp; 3447 struct ext4_inode_info *ei; 3448 3449 /* preallocate only when the found space is larger than requested */ 3450 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3451 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3452 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3453 3454 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3455 if (pa == NULL) 3456 return -ENOMEM; 3457 3458 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3459 int winl; 3460 int wins; 3461 int win; 3462 int offs; 3463 3464 /* We can't allocate as much as the normalizer wants, 3465 * so the found space must get a proper lstart 3466 * to cover the original request */ 3467 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3468 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3469 3470 /* We're limited by the original request in that 3471 * the logical block must be covered anyway; 3472 * winl is the window we can move our chunk within */ 3473 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3474 3475 /* also, we should cover the whole original request */ 3476 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); 3477 3478 /* the smallest one defines the real window */ 3479 win = min(winl, wins); 3480 3481 offs = ac->ac_o_ex.fe_logical % 3482 EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 3483 if (offs && offs < win) 3484 win = offs; 3485 3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - 3487 EXT4_B2C(sbi, win); 3488 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3489 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3490 } 3491 3492 /* preallocation can change ac_b_ex, thus we store the actually 3493 * allocated blocks for history */ 3494 ac->ac_f_ex = ac->ac_b_ex; 3495 3496 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3497 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3498 pa->pa_len = ac->ac_b_ex.fe_len; 3499 pa->pa_free = pa->pa_len; 3500 atomic_set(&pa->pa_count, 1); 3501 spin_lock_init(&pa->pa_lock); 3502 INIT_LIST_HEAD(&pa->pa_inode_list); 3503 INIT_LIST_HEAD(&pa->pa_group_list); 3504 pa->pa_deleted = 0; 3505 pa->pa_type = MB_INODE_PA; 3506 3507 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, 3508 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3509 trace_ext4_mb_new_inode_pa(ac, pa); 3510 3511 ext4_mb_use_inode_pa(ac, pa); 3512 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 3513 3514 ei = EXT4_I(ac->ac_inode); 3515 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3516 3517 pa->pa_obj_lock = &ei->i_prealloc_lock; 3518 pa->pa_inode = ac->ac_inode; 3519 3520 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3521 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3522 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3523 3524 spin_lock(pa->pa_obj_lock); 3525 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3526 spin_unlock(pa->pa_obj_lock); 3527 3528 return 0; 3529 } 3530
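/* Worked example for the lstart window logic in ext4_mb_new_inode_pa() above, with assumed numbers and a cluster ratio of 1: original request logical 10, len 4; normalized goal logical 0, len 16; best found len 8. Then winl = 10, wins = 8 - 4 = 4, win = 4; offs = 10 % 8 = 2, which is less than win, so win = 2 and fe_logical = 10 - 2 = 8: the PA covers logical blocks 8..15, which still contains the original 10..13. */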
3531 /* 3532 * creates new preallocated space for the locality group the inode belongs to 3533 */ 3534 static noinline_for_stack int 3535 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3536 { 3537 struct super_block *sb = ac->ac_sb; 3538 struct ext4_locality_group *lg; 3539 struct ext4_prealloc_space *pa; 3540 struct ext4_group_info *grp; 3541 3542 /* preallocate only when the found space is larger than requested */ 3543 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3544 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3545 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3546 3547 BUG_ON(ext4_pspace_cachep == NULL); 3548 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3549 if (pa == NULL) 3550 return -ENOMEM; 3551 3552 /* preallocation can change ac_b_ex, thus we store the actually 3553 * allocated blocks for history */ 3554 ac->ac_f_ex = ac->ac_b_ex; 3555 3556 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3557 pa->pa_lstart = pa->pa_pstart; 3558 pa->pa_len = ac->ac_b_ex.fe_len; 3559 pa->pa_free = pa->pa_len; 3560 atomic_set(&pa->pa_count, 1); 3561 spin_lock_init(&pa->pa_lock); 3562 INIT_LIST_HEAD(&pa->pa_inode_list); 3563 INIT_LIST_HEAD(&pa->pa_group_list); 3564 pa->pa_deleted = 0; 3565 pa->pa_type = MB_GROUP_PA; 3566 3567 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, 3568 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3569 trace_ext4_mb_new_group_pa(ac, pa); 3570 3571 ext4_mb_use_group_pa(ac, pa); 3572 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3573 3574 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3575 lg = ac->ac_lg; 3576 BUG_ON(lg == NULL); 3577 3578 pa->pa_obj_lock = &lg->lg_prealloc_lock; 3579 pa->pa_inode = NULL; 3580 3581 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3582 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3583 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3584 3585 /* 3586 * We will later add the new pa to the right bucket 3587 * after updating the pa_free in ext4_mb_release_context 3588 */ 3589 return 0; 3590 } 3591 3592 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3593 { 3594 int err; 3595 3596 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3597 err = ext4_mb_new_group_pa(ac); 3598 else 3599 err = ext4_mb_new_inode_pa(ac); 3600 return err; 3601 } 3602 3603 /* 3604 * finds all unused blocks in the on-disk bitmap, frees them in 3605 * the in-core bitmap and buddy. 3606 * @pa must be unlinked from inode and group lists, so that 3607 * nobody else can find/use it. 3608 * the caller MUST hold group/inode locks.
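 * (e.g. if @pa covers bits 10..20 of the group and bits 12..13 are still
 * set in the on-disk bitmap, the scan loop below frees the runs 10..11
 * and 14..20, so free accumulates 9 and is checked against pa_free.)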
3609 * TODO: optimize the case when there are no in-core structures yet 3610 */ 3611 static noinline_for_stack int 3612 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3613 struct ext4_prealloc_space *pa) 3614 { 3615 struct super_block *sb = e4b->bd_sb; 3616 struct ext4_sb_info *sbi = EXT4_SB(sb); 3617 unsigned int end; 3618 unsigned int next; 3619 ext4_group_t group; 3620 ext4_grpblk_t bit; 3621 unsigned long long grp_blk_start; 3622 int err = 0; 3623 int free = 0; 3624 3625 BUG_ON(pa->pa_deleted == 0); 3626 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3627 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 3628 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3629 end = bit + pa->pa_len; 3630 3631 while (bit < end) { 3632 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3633 if (bit >= end) 3634 break; 3635 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3636 mb_debug(1, " free preallocated %u/%u in group %u\n", 3637 (unsigned) ext4_group_first_block_no(sb, group) + bit, 3638 (unsigned) next - bit, (unsigned) group); 3639 free += next - bit; 3640 3641 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3642 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + 3643 EXT4_C2B(sbi, bit)), 3644 next - bit); 3645 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3646 bit = next + 1; 3647 } 3648 if (free != pa->pa_free) { 3649 ext4_msg(e4b->bd_sb, KERN_CRIT, 3650 "pa %p: logic %lu, phys. %lu, len %lu", 3651 pa, (unsigned long) pa->pa_lstart, 3652 (unsigned long) pa->pa_pstart, 3653 (unsigned long) pa->pa_len); 3654 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3655 free, pa->pa_free); 3656 /* 3657 * pa is already deleted so we use the value obtained 3658 * from the bitmap and continue. 
3659 */ 3660 } 3661 atomic_add(free, &sbi->s_mb_discarded); 3662 3663 return err; 3664 } 3665 3666 static noinline_for_stack int 3667 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3668 struct ext4_prealloc_space *pa) 3669 { 3670 struct super_block *sb = e4b->bd_sb; 3671 ext4_group_t group; 3672 ext4_grpblk_t bit; 3673 3674 trace_ext4_mb_release_group_pa(sb, pa); 3675 BUG_ON(pa->pa_deleted == 0); 3676 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3677 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3678 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3679 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3680 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 3681 3682 return 0; 3683 } 3684 3685 /* 3686 * releases all preallocations in given group 3687 * 3688 * first, we need to decide discard policy: 3689 * - when do we discard 3690 * 1) ENOSPC 3691 * - how many do we discard 3692 * 1) how many requested 3693 */ 3694 static noinline_for_stack int 3695 ext4_mb_discard_group_preallocations(struct super_block *sb, 3696 ext4_group_t group, int needed) 3697 { 3698 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3699 struct buffer_head *bitmap_bh = NULL; 3700 struct ext4_prealloc_space *pa, *tmp; 3701 struct list_head list; 3702 struct ext4_buddy e4b; 3703 int err; 3704 int busy = 0; 3705 int free = 0; 3706 3707 mb_debug(1, "discard preallocation for group %u\n", group); 3708 3709 if (list_empty(&grp->bb_prealloc_list)) 3710 return 0; 3711 3712 bitmap_bh = ext4_read_block_bitmap(sb, group); 3713 if (bitmap_bh == NULL) { 3714 ext4_error(sb, "Error reading block bitmap for %u", group); 3715 return 0; 3716 } 3717 3718 err = ext4_mb_load_buddy(sb, group, &e4b); 3719 if (err) { 3720 ext4_error(sb, "Error loading buddy information for %u", group); 3721 put_bh(bitmap_bh); 3722 return 0; 3723 } 3724 3725 if (needed == 0) 3726 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 3727 3728 INIT_LIST_HEAD(&list); 3729 repeat: 3730 ext4_lock_group(sb, group); 3731 list_for_each_entry_safe(pa, tmp, 3732 &grp->bb_prealloc_list, pa_group_list) { 3733 spin_lock(&pa->pa_lock); 3734 if (atomic_read(&pa->pa_count)) { 3735 spin_unlock(&pa->pa_lock); 3736 busy = 1; 3737 continue; 3738 } 3739 if (pa->pa_deleted) { 3740 spin_unlock(&pa->pa_lock); 3741 continue; 3742 } 3743 3744 /* seems this one can be freed ... */ 3745 pa->pa_deleted = 1; 3746 3747 /* we can trust pa_free ... */ 3748 free += pa->pa_free; 3749 3750 spin_unlock(&pa->pa_lock); 3751 3752 list_del(&pa->pa_group_list); 3753 list_add(&pa->u.pa_tmp_list, &list); 3754 } 3755 3756 /* if we still need more blocks and some PAs were used, try again */ 3757 if (free < needed && busy) { 3758 busy = 0; 3759 ext4_unlock_group(sb, group); 3760 /* 3761 * Yield the CPU here so that we don't get soft lockup 3762 * in non preempt case. 3763 */ 3764 yield(); 3765 goto repeat; 3766 } 3767 3768 /* found anything to free? 
*/ 3769 if (list_empty(&list)) { 3770 BUG_ON(free != 0); 3771 goto out; 3772 } 3773 3774 /* now free all selected PAs */ 3775 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3776 3777 /* remove from object (inode or locality group) */ 3778 spin_lock(pa->pa_obj_lock); 3779 list_del_rcu(&pa->pa_inode_list); 3780 spin_unlock(pa->pa_obj_lock); 3781 3782 if (pa->pa_type == MB_GROUP_PA) 3783 ext4_mb_release_group_pa(&e4b, pa); 3784 else 3785 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 3786 3787 list_del(&pa->u.pa_tmp_list); 3788 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3789 } 3790 3791 out: 3792 ext4_unlock_group(sb, group); 3793 ext4_mb_unload_buddy(&e4b); 3794 put_bh(bitmap_bh); 3795 return free; 3796 } 3797 3798 /* 3799 * releases all unused preallocated blocks for the given inode 3800 * 3801 * It's important to discard preallocations under i_data_sem. 3802 * We don't want another block to be served from the prealloc 3803 * space when we are discarding the inode prealloc space. 3804 * 3805 * FIXME!! Make sure it is valid at all the call sites 3806 */ 3807 void ext4_discard_preallocations(struct inode *inode) 3808 { 3809 struct ext4_inode_info *ei = EXT4_I(inode); 3810 struct super_block *sb = inode->i_sb; 3811 struct buffer_head *bitmap_bh = NULL; 3812 struct ext4_prealloc_space *pa, *tmp; 3813 ext4_group_t group = 0; 3814 struct list_head list; 3815 struct ext4_buddy e4b; 3816 int err; 3817 3818 if (!S_ISREG(inode->i_mode)) { 3819 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3820 return; 3821 } 3822 3823 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); 3824 trace_ext4_discard_preallocations(inode); 3825 3826 INIT_LIST_HEAD(&list); 3827 3828 repeat: 3829 /* first, collect all pa's in the inode */ 3830 spin_lock(&ei->i_prealloc_lock); 3831 while (!list_empty(&ei->i_prealloc_list)) { 3832 pa = list_entry(ei->i_prealloc_list.next, 3833 struct ext4_prealloc_space, pa_inode_list); 3834 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 3835 spin_lock(&pa->pa_lock); 3836 if (atomic_read(&pa->pa_count)) { 3837 /* this shouldn't happen often - nobody should 3838 * use preallocation while we're discarding it */ 3839 spin_unlock(&pa->pa_lock); 3840 spin_unlock(&ei->i_prealloc_lock); 3841 ext4_msg(sb, KERN_ERR, 3842 "uh-oh! used pa while discarding"); 3843 WARN_ON(1); 3844 schedule_timeout_uninterruptible(HZ); 3845 goto repeat; 3846 3847 } 3848 if (pa->pa_deleted == 0) { 3849 pa->pa_deleted = 1; 3850 spin_unlock(&pa->pa_lock); 3851 list_del_rcu(&pa->pa_inode_list); 3852 list_add(&pa->u.pa_tmp_list, &list); 3853 continue; 3854 } 3855 3856 /* someone is deleting pa right now */ 3857 spin_unlock(&pa->pa_lock); 3858 spin_unlock(&ei->i_prealloc_lock); 3859 3860 /* we have to wait here because pa_deleted 3861 * doesn't mean pa is already unlinked from 3862 * the list.
/*
 * releases all unused preallocated blocks for given inode
 *
 * It's important to discard preallocations under i_data_sem.
 * We don't want another block to be served from the prealloc
 * space when we are discarding the inode prealloc space.
 *
 * FIXME!! Make sure it is valid at all the call sites
 */
void ext4_discard_preallocations(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct super_block *sb = inode->i_sb;
	struct buffer_head *bitmap_bh = NULL;
	struct ext4_prealloc_space *pa, *tmp;
	ext4_group_t group = 0;
	struct list_head list;
	struct ext4_buddy e4b;
	int err;

	if (!S_ISREG(inode->i_mode)) {
		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
		return;
	}

	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
	trace_ext4_discard_preallocations(inode);

	INIT_LIST_HEAD(&list);

repeat:
	/* first, collect all pa's in the inode */
	spin_lock(&ei->i_prealloc_lock);
	while (!list_empty(&ei->i_prealloc_list)) {
		pa = list_entry(ei->i_prealloc_list.next,
				struct ext4_prealloc_space, pa_inode_list);
		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/* this shouldn't happen often - nobody should
			 * use preallocation while we're discarding it */
			spin_unlock(&pa->pa_lock);
			spin_unlock(&ei->i_prealloc_lock);
			ext4_msg(sb, KERN_ERR,
				 "uh-oh! used pa while discarding");
			WARN_ON(1);
			schedule_timeout_uninterruptible(HZ);
			goto repeat;

		}
		if (pa->pa_deleted == 0) {
			pa->pa_deleted = 1;
			spin_unlock(&pa->pa_lock);
			list_del_rcu(&pa->pa_inode_list);
			list_add(&pa->u.pa_tmp_list, &list);
			continue;
		}

		/* someone is deleting pa right now */
		spin_unlock(&pa->pa_lock);
		spin_unlock(&ei->i_prealloc_lock);

		/* we have to wait here because pa_deleted
		 * doesn't mean pa is already unlinked from
		 * the list. since we might be called from
		 * ->clear_inode(), the inode would get freed
		 * and a concurrent thread which is unlinking
		 * pa from the inode's list may access already
		 * freed memory, bad-bad-bad */

		/* XXX: if this happens too often, we can
		 * add a flag to force wait only in case
		 * of ->clear_inode(), but not in case of
		 * regular truncate */
		schedule_timeout_uninterruptible(HZ);
		goto repeat;
	}
	spin_unlock(&ei->i_prealloc_lock);

	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
		BUG_ON(pa->pa_type != MB_INODE_PA);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);

		err = ext4_mb_load_buddy(sb, group, &e4b);
		if (err) {
			ext4_error(sb, "Error loading buddy information for %u",
					group);
			continue;
		}

		bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (bitmap_bh == NULL) {
			ext4_error(sb, "Error reading block bitmap for %u",
					group);
			ext4_mb_unload_buddy(&e4b);
			continue;
		}

		ext4_lock_group(sb, group);
		list_del(&pa->pa_group_list);
		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
		ext4_unlock_group(sb, group);

		ext4_mb_unload_buddy(&e4b);
		put_bh(bitmap_bh);

		list_del(&pa->u.pa_tmp_list);
		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
	}
}
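/*
 * Call-site sketch (simplified; the surrounding code of the real caller
 * is omitted): the truncate path, for instance, drops the inode's
 * preallocations while holding i_data_sem, per the comment above:
 *
 *	down_write(&EXT4_I(inode)->i_data_sem);
 *	ext4_discard_preallocations(inode);
 *	...
 *	up_write(&EXT4_I(inode)->i_data_sem);
 */
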
#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	ext4_group_t ngroups, i;

	if (!mb_enable_debug ||
	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
		return;

	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
			" Allocation context details:");
	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
			ac->ac_status, ac->ac_flags);
	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
			"goal %lu/%lu/%lu@%lu, "
			"best %lu/%lu/%lu@%lu cr %d",
			(unsigned long)ac->ac_o_ex.fe_group,
			(unsigned long)ac->ac_o_ex.fe_start,
			(unsigned long)ac->ac_o_ex.fe_len,
			(unsigned long)ac->ac_o_ex.fe_logical,
			(unsigned long)ac->ac_g_ex.fe_group,
			(unsigned long)ac->ac_g_ex.fe_start,
			(unsigned long)ac->ac_g_ex.fe_len,
			(unsigned long)ac->ac_g_ex.fe_logical,
			(unsigned long)ac->ac_b_ex.fe_group,
			(unsigned long)ac->ac_b_ex.fe_start,
			(unsigned long)ac->ac_b_ex.fe_len,
			(unsigned long)ac->ac_b_ex.fe_logical,
			(int)ac->ac_criteria);
	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
			ac->ac_ex_scanned, ac->ac_found);
	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
	ngroups = ext4_get_groups_count(sb);
	for (i = 0; i < ngroups; i++) {
		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
		struct ext4_prealloc_space *pa;
		ext4_grpblk_t start;
		struct list_head *cur;
		ext4_lock_group(sb, i);
		list_for_each(cur, &grp->bb_prealloc_list) {
			pa = list_entry(cur, struct ext4_prealloc_space,
					pa_group_list);
			spin_lock(&pa->pa_lock);
			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
						     NULL, &start);
			spin_unlock(&pa->pa_lock);
			printk(KERN_ERR "PA:%u:%d:%u\n", i,
			       start, pa->pa_len);
		}
		ext4_unlock_group(sb, i);

		if (grp->bb_free == 0)
			continue;
		printk(KERN_ERR "%u: %d/%d\n",
		       i, grp->bb_free, grp->bb_fragments);
	}
	printk(KERN_ERR "\n");
}
#else
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
	return;
}
#endif

/*
 * We use locality group preallocation for small files. The size of the
 * file is determined by the current size or the resulting size after
 * allocation, whichever is larger.
 *
 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
 */
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int bsbits = ac->ac_sb->s_blocksize_bits;
	loff_t size, isize;

	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
		>> bsbits;

	if ((size == isize) &&
	    !ext4_fs_is_busy(sbi) &&
	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
		return;
	}

	if (sbi->s_mb_group_prealloc <= 0) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	/* don't use group allocation for large files */
	size = max(size, isize);
	if (size > sbi->s_mb_stream_request) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	BUG_ON(ac->ac_lg != NULL);
	/*
	 * locality group prealloc space is per cpu. The reason for having
	 * per cpu locality groups is to reduce the contention between block
	 * requests from multiple CPUs.
	 */
	ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);

	/* we're going to use group allocation */
	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

	/* serialize all allocations in the group */
	mutex_lock(&ac->ac_lg->lg_mutex);
}
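/*
 * Worked example (numbers assumed): with the default s_mb_stream_request
 * of 16 blocks, a write that would leave the file at 24 blocks takes the
 * stream (inode-PA) path because max(size, isize) = 24 > 16, while a
 * 10-block file stays on the per-CPU locality group. The cutoff can be
 * raised, e.g. to 64 blocks, by writing 64 to
 * /sys/fs/ext4/<partition>/mb_stream_req.
 */
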
static noinline_for_stack int
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	struct super_block *sb = ar->inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	ext4_group_t group;
	unsigned int len;
	ext4_fsblk_t goal;
	ext4_grpblk_t block;

	/* we can't allocate > group size */
	len = ar->len;

	/* just a dirty hack to filter too big requests */
	if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
		len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;

	/* start searching from the goal */
	goal = ar->goal;
	if (goal < le32_to_cpu(es->s_first_data_block) ||
			goal >= ext4_blocks_count(es))
		goal = le32_to_cpu(es->s_first_data_block);
	ext4_get_group_no_and_offset(sb, goal, &group, &block);

	/* set up allocation goals */
	memset(ac, 0, sizeof(struct ext4_allocation_context));
	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
	ac->ac_status = AC_STATUS_CONTINUE;
	ac->ac_sb = sb;
	ac->ac_inode = ar->inode;
	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
	ac->ac_o_ex.fe_group = group;
	ac->ac_o_ex.fe_start = block;
	ac->ac_o_ex.fe_len = len;
	ac->ac_g_ex = ac->ac_o_ex;
	ac->ac_flags = ar->flags;

	/* we have to define the context: whether we'll work with a file
	 * or a locality group. this is a policy, actually */
	ext4_mb_group_or_file(ac);

	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
			"left: %u/%u, right %u/%u to %swritable\n",
			(unsigned) ar->len, (unsigned) ar->logical,
			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
			(unsigned) ar->lleft, (unsigned) ar->pleft,
			(unsigned) ar->lright, (unsigned) ar->pright,
			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
	return 0;

}

static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
					struct ext4_locality_group *lg,
					int order, int total_entries)
{
	ext4_group_t group = 0;
	struct ext4_buddy e4b;
	struct list_head discard_list;
	struct ext4_prealloc_space *pa, *tmp;

	mb_debug(1, "discard locality group preallocation\n");

	INIT_LIST_HEAD(&discard_list);

	spin_lock(&lg->lg_prealloc_lock);
	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
						pa_inode_list) {
		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/*
			 * This is the pa that we just used
			 * for block allocation. So don't
			 * free that
			 */
			spin_unlock(&pa->pa_lock);
			continue;
		}
		if (pa->pa_deleted) {
			spin_unlock(&pa->pa_lock);
			continue;
		}
		/* only lg prealloc space */
		BUG_ON(pa->pa_type != MB_GROUP_PA);

		/* seems this one can be freed ... */
		pa->pa_deleted = 1;
		spin_unlock(&pa->pa_lock);

		list_del_rcu(&pa->pa_inode_list);
		list_add(&pa->u.pa_tmp_list, &discard_list);

		total_entries--;
		if (total_entries <= 5) {
			/*
			 * we want to keep only 5 entries
			 * allowing it to grow to 8. This
			 * makes sure we don't have to call
			 * discard again soon for this list.
			 */
			break;
		}
	}
	spin_unlock(&lg->lg_prealloc_lock);

	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {

		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
		if (ext4_mb_load_buddy(sb, group, &e4b)) {
			ext4_error(sb, "Error loading buddy information for %u",
					group);
			continue;
		}
		ext4_lock_group(sb, group);
		list_del(&pa->pa_group_list);
		ext4_mb_release_group_pa(&e4b, pa);
		ext4_unlock_group(sb, group);

		ext4_mb_unload_buddy(&e4b);
		list_del(&pa->u.pa_tmp_list);
		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
	}
}

/*
 * We have incremented pa_count. So it cannot be freed at this
 * point. Also we hold lg_mutex. So no parallel allocation is
 * possible from this lg. That means pa_free cannot be updated.
 *
 * A parallel ext4_mb_discard_group_preallocations is possible,
 * which can cause the lg_prealloc_list to be updated.
 */

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
	int order, added = 0, lg_prealloc_count = 1;
	struct super_block *sb = ac->ac_sb;
	struct ext4_locality_group *lg = ac->ac_lg;
	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;

	order = fls(pa->pa_free) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;
	/* Add the prealloc space to lg */
	rcu_read_lock();
	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
						pa_inode_list) {
		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted) {
			spin_unlock(&tmp_pa->pa_lock);
			continue;
		}
		if (!added && pa->pa_free < tmp_pa->pa_free) {
			/* Add to the tail of the previous entry */
			list_add_tail_rcu(&pa->pa_inode_list,
						&tmp_pa->pa_inode_list);
			added = 1;
			/*
			 * we want to count the total
			 * number of entries in the list
			 */
		}
		spin_unlock(&tmp_pa->pa_lock);
		lg_prealloc_count++;
	}
	if (!added)
		list_add_tail_rcu(&pa->pa_inode_list,
					&lg->lg_prealloc_list[order]);
	rcu_read_unlock();

	/* Now trim the list so it has no more than 8 elements */
	if (lg_prealloc_count > 8) {
		ext4_mb_discard_lg_preallocations(sb, lg,
						order, lg_prealloc_count);
		return;
	}
	return;
}
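/*
 * Worked example for the bucketing above (pa_free value assumed): a PA
 * with pa_free = 300 clusters has fls(300) = 9, so order = 8 and it is
 * kept on lg->lg_prealloc_list[8]; any PA with pa_free >=
 * 1 << PREALLOC_TB_SIZE is clamped into the last bucket. The insertion
 * stops at the first entry with more free clusters, which keeps each
 * bucket sorted by ascending pa_free.
 */
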
/*
 * release all resources used in the allocation
 */
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_prealloc_space *pa = ac->ac_pa;
	if (pa) {
		if (pa->pa_type == MB_GROUP_PA) {
			/* see comment in ext4_mb_use_group_pa() */
			spin_lock(&pa->pa_lock);
			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
			pa->pa_free -= ac->ac_b_ex.fe_len;
			pa->pa_len -= ac->ac_b_ex.fe_len;
			spin_unlock(&pa->pa_lock);
		}
	}
	if (pa) {
		/*
		 * We want to add the pa to the right bucket.
		 * Remove it from the list and while adding
		 * make sure the list to which we are adding
		 * doesn't grow big.
		 */
		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
			spin_lock(pa->pa_obj_lock);
			list_del_rcu(&pa->pa_inode_list);
			spin_unlock(pa->pa_obj_lock);
			ext4_mb_add_n_trim(ac);
		}
		ext4_mb_put_pa(ac, ac->ac_sb, pa);
	}
	if (ac->ac_bitmap_page)
		page_cache_release(ac->ac_bitmap_page);
	if (ac->ac_buddy_page)
		page_cache_release(ac->ac_buddy_page);
	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
		mutex_unlock(&ac->ac_lg->lg_mutex);
	ext4_mb_collect_stats(ac);
	return 0;
}

static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
	int ret;
	int freed = 0;

	trace_ext4_mb_discard_preallocations(sb, needed);
	for (i = 0; i < ngroups && needed > 0; i++) {
		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
		freed += ret;
		needed -= ret;
	}

	return freed;
}

/*
 * Main entry point into mballoc to allocate blocks:
 * it tries to use preallocation first, then falls back
 * to the usual allocation.
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	int freed;
	struct ext4_allocation_context *ac = NULL;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_clstrs = 0;

	sb = ar->inode->i_sb;
	sbi = EXT4_SB(sb);

	trace_ext4_request_blocks(ar);

	/* Allow using the superuser reservation for the quota file */
	if (IS_NOQUOTA(ar->inode))
		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;

	/*
	 * For delayed allocation, we could skip the ENOSPC and
	 * EDQUOT check, as blocks and quotas have already been
	 * reserved when the data was copied into the pagecache.
	 */
	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
	else {
		/* Without delayed allocation we need to verify
		 * there are enough free blocks to do the block allocation
		 * and verify the allocation doesn't exceed the quota limits.
		 */
		while (ar->len &&
			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

			/* let others free the space */
			yield();
			ar->len = ar->len >> 1;
		}
		if (!ar->len) {
			*errp = -ENOSPC;
			return 0;
		}
		reserv_clstrs = ar->len;
		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
			dquot_alloc_block_nofail(ar->inode,
						 EXT4_C2B(sbi, ar->len));
		} else {
			while (ar->len &&
				dquot_alloc_block(ar->inode,
						  EXT4_C2B(sbi, ar->len))) {

				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
				ar->len--;
			}
		}
		inquota = ar->len;
		if (ar->len == 0) {
			*errp = -EDQUOT;
			goto out;
		}
	}

	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
	if (!ac) {
		ar->len = 0;
		*errp = -ENOMEM;
		goto out;
	}

	*errp = ext4_mb_initialize_context(ac, ar);
	if (*errp) {
		ar->len = 0;
		goto out;
	}

	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
	if (!ext4_mb_use_preallocated(ac)) {
		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
		ext4_mb_normalize_request(ac, ar);
repeat:
		/* allocate space in core */
		*errp = ext4_mb_regular_allocator(ac);
		if (*errp)
			goto errout;

		/* as we've just preallocated more space than
		 * the user originally requested, we store the
		 * allocated space in a special descriptor */
		if (ac->ac_status == AC_STATUS_FOUND &&
				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
			ext4_mb_new_preallocation(ac);
	}
	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
		if (*errp == -EAGAIN) {
			/*
			 * drop the reference that we took
			 * in ext4_mb_use_best_found
			 */
			ext4_mb_release_context(ac);
			ac->ac_b_ex.fe_group = 0;
			ac->ac_b_ex.fe_start = 0;
			ac->ac_b_ex.fe_len = 0;
			ac->ac_status = AC_STATUS_CONTINUE;
			goto repeat;
		} else if (*errp)
		errout:
			ext4_discard_allocated_blocks(ac);
		else {
			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
		if (freed)
			goto repeat;
		*errp = -ENOSPC;
	}

	if (*errp) {
		ac->ac_b_ex.fe_len = 0;
		ar->len = 0;
		ext4_mb_show_ac(ac);
	}
	ext4_mb_release_context(ac);
out:
	if (ac)
		kmem_cache_free(ext4_ac_cachep, ac);
	if (inquota && ar->len < inquota)
		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
	if (!ar->len) {
		if (!ext4_test_inode_state(ar->inode,
					   EXT4_STATE_DELALLOC_RESERVED))
			/* release all the reserved blocks if non delalloc */
			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
						reserv_clstrs);
	}

	trace_ext4_allocate_blocks(ar, (unsigned long long)block);

	return block;
}
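/*
 * Caller-side sketch (field values assumed for illustration): this is
 * roughly how the extent-mapping path hands a request to mballoc. Only
 * the fields used above are shown; real callers also fill in the
 * left/right neighbour hints (lleft/lright, pleft/pright).
 *
 *	struct ext4_allocation_request ar = {
 *		.inode   = inode,
 *		.goal    = goal_block,	// preferred physical block
 *		.logical = map->m_lblk,	// logical block in the file
 *		.len     = map->m_len,	// number of blocks wanted
 *		.flags   = EXT4_MB_HINT_DATA,
 *	};
 *	int err;
 *	ext4_fsblk_t newblock = ext4_mb_new_blocks(handle, &ar, &err);
 *	// on success, ar.len holds how many blocks were actually allocated
 */
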
/*
 * We can merge two free data extents only if the physical blocks
 * are contiguous, AND the extents were freed by the same transaction,
 * AND the blocks are associated with the same group.
 */
static int can_merge(struct ext4_free_data *entry1,
			struct ext4_free_data *entry2)
{
	if ((entry1->t_tid == entry2->t_tid) &&
	    (entry1->group == entry2->group) &&
	    ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
		return 1;
	return 0;
}

static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
		      struct ext4_free_data *new_entry)
{
	ext4_group_t group = e4b->bd_group;
	ext4_grpblk_t cluster;
	struct ext4_free_data *entry;
	struct ext4_group_info *db = e4b->bd_info;
	struct super_block *sb = e4b->bd_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct rb_node **n = &db->bb_free_root.rb_node, *node;
	struct rb_node *parent = NULL, *new_node;

	BUG_ON(!ext4_handle_valid(handle));
	BUG_ON(e4b->bd_bitmap_page == NULL);
	BUG_ON(e4b->bd_buddy_page == NULL);

	new_node = &new_entry->node;
	cluster = new_entry->start_cluster;

	if (!*n) {
		/* first free block extent. We need to
		 * protect buddy cache from being freed,
		 * otherwise we'll refresh it from
		 * the on-disk bitmap and lose not-yet-available
		 * blocks */
		page_cache_get(e4b->bd_buddy_page);
		page_cache_get(e4b->bd_bitmap_page);
	}
	while (*n) {
		parent = *n;
		entry = rb_entry(parent, struct ext4_free_data, node);
		if (cluster < entry->start_cluster)
			n = &(*n)->rb_left;
		else if (cluster >= (entry->start_cluster + entry->count))
			n = &(*n)->rb_right;
		else {
			ext4_grp_locked_error(sb, group, 0,
				ext4_group_first_block_no(sb, group) +
				EXT4_C2B(sbi, cluster),
				"Block already on to-be-freed list");
			return 0;
		}
	}

	rb_link_node(new_node, parent, n);
	rb_insert_color(new_node, &db->bb_free_root);

	/* Now try to see if the extent can be merged to the left and right */
	node = rb_prev(new_node);
	if (node) {
		entry = rb_entry(node, struct ext4_free_data, node);
		if (can_merge(entry, new_entry)) {
			new_entry->start_cluster = entry->start_cluster;
			new_entry->count += entry->count;
			rb_erase(node, &(db->bb_free_root));
			spin_lock(&sbi->s_md_lock);
			list_del(&entry->list);
			spin_unlock(&sbi->s_md_lock);
			kmem_cache_free(ext4_free_ext_cachep, entry);
		}
	}

	node = rb_next(new_node);
	if (node) {
		entry = rb_entry(node, struct ext4_free_data, node);
		if (can_merge(new_entry, entry)) {
			new_entry->count += entry->count;
			rb_erase(node, &(db->bb_free_root));
			spin_lock(&sbi->s_md_lock);
			list_del(&entry->list);
			spin_unlock(&sbi->s_md_lock);
			kmem_cache_free(ext4_free_ext_cachep, entry);
		}
	}
	/* Add the extent to the transaction's private list */
	spin_lock(&sbi->s_md_lock);
	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
	spin_unlock(&sbi->s_md_lock);
	return 0;
}
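/*
 * Worked example for the merge logic above (numbers assumed): two
 * entries freed by the same transaction in the same group,
 *
 *	entry1 = { .start_cluster = 100, .count = 50, .t_tid = T, .group = G }
 *	entry2 = { .start_cluster = 150, .count = 30, .t_tid = T, .group = G }
 *
 * satisfy can_merge(entry1, entry2) because 100 + 50 == 150, so they
 * collapse into a single { 100, 80 } node in bb_free_root. A differing
 * t_tid keeps them separate, since extents are released back for reuse
 * only when their own transaction commits.
 */
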
/**
 * ext4_free_blocks() -- Free given blocks and update quota
 * @handle: handle for this transaction
 * @inode: inode
 * @bh: optional buffer head of the block being freed
 * @block: start physical block to free
 * @count: number of blocks to free
 * @flags: flags used by ext4_free_blocks
 */
void ext4_free_blocks(handle_t *handle, struct inode *inode,
		      struct buffer_head *bh, ext4_fsblk_t block,
		      unsigned long count, int flags)
{
	struct buffer_head *bitmap_bh = NULL;
	struct super_block *sb = inode->i_sb;
	struct ext4_group_desc *gdp;
	unsigned long freed = 0;
	unsigned int overflow;
	ext4_grpblk_t bit;
	struct buffer_head *gd_bh;
	ext4_group_t block_group;
	struct ext4_sb_info *sbi;
	struct ext4_buddy e4b;
	unsigned int count_clusters;
	int err = 0;
	int ret;

	if (bh) {
		if (block)
			BUG_ON(block != bh->b_blocknr);
		else
			block = bh->b_blocknr;
	}

	sbi = EXT4_SB(sb);
	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
	    !ext4_data_block_valid(sbi, block, count)) {
		ext4_error(sb, "Freeing blocks not in datazone - "
			   "block = %llu, count = %lu", block, count);
		goto error_return;
	}

	ext4_debug("freeing block %llu\n", block);
	trace_ext4_free_blocks(inode, block, count, flags);

	if (flags & EXT4_FREE_BLOCKS_FORGET) {
		struct buffer_head *tbh = bh;
		int i;

		BUG_ON(bh && (count > 1));

		for (i = 0; i < count; i++) {
			if (!bh)
				tbh = sb_find_get_block(inode->i_sb,
							block + i);
			if (unlikely(!tbh))
				continue;
			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
				    inode, tbh, block + i);
		}
	}

	/*
	 * We need to make sure we don't reuse the freed block until
	 * after the transaction is committed, which we can do by
	 * treating the block as metadata, below.  We make an
	 * exception if the inode is to be written in writeback mode
	 * since writeback mode has weak data consistency guarantees.
	 */
	if (!ext4_should_writeback_data(inode))
		flags |= EXT4_FREE_BLOCKS_METADATA;

	/*
	 * If the extent to be freed does not begin on a cluster
	 * boundary, we need to deal with partial clusters at the
	 * beginning and end of the extent.  Normally we will free
	 * blocks at the beginning or the end unless we are explicitly
	 * requested to avoid doing so.
	 */
	overflow = block & (sbi->s_cluster_ratio - 1);
	if (overflow) {
		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
			overflow = sbi->s_cluster_ratio - overflow;
			block += overflow;
			if (count > overflow)
				count -= overflow;
			else
				return;
		} else {
			block -= overflow;
			count += overflow;
		}
	}
	overflow = count & (sbi->s_cluster_ratio - 1);
	if (overflow) {
		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
			if (count > overflow)
				count -= overflow;
			else
				return;
		} else
			count += sbi->s_cluster_ratio - overflow;
	}
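	/*
	 * Worked example for the alignment above (bigalloc geometry
	 * assumed): with s_cluster_ratio = 16, freeing block = 100,
	 * count = 20 gives
	 *
	 *	overflow = 100 & 15 = 4  ->  block = 96, count = 24
	 *	overflow =  24 & 15 = 8  ->  count = 24 + (16 - 8) = 32
	 *
	 * so without the NOFREE_* flags the range is widened to whole
	 * clusters (blocks 96..127, i.e. clusters 6 and 7).
	 */
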
do_more:
	overflow = 0;
	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);

	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
		overflow = EXT4_C2B(sbi, bit) + count -
			EXT4_BLOCKS_PER_GROUP(sb);
		count -= overflow;
	}
	count_clusters = EXT4_B2C(sbi, count);
	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (!bitmap_bh) {
		err = -EIO;
		goto error_return;
	}
	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
	if (!gdp) {
		err = -EIO;
		goto error_return;
	}

	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
	    in_range(block, ext4_inode_table(sb, gdp),
		     EXT4_SB(sb)->s_itb_per_group) ||
	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
		     EXT4_SB(sb)->s_itb_per_group)) {

		ext4_error(sb, "Freeing blocks in system zone - "
			   "Block = %llu, count = %lu", block, count);
		/* err = 0. ext4_std_error should be a no op */
		goto error_return;
	}

	BUFFER_TRACE(bitmap_bh, "getting write access");
	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;
#ifdef AGGRESSIVE_CHECK
	{
		int i;
		for (i = 0; i < count_clusters; i++)
			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
	}
#endif
	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);

	err = ext4_mb_load_buddy(sb, block_group, &e4b);
	if (err)
		goto error_return;

	if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
		struct ext4_free_data *new_entry;
		/*
		 * blocks being freed are metadata. these blocks shouldn't
		 * be used until this transaction is committed
		 */
		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
		if (!new_entry) {
			err = -ENOMEM;
			goto error_return;
		}
		new_entry->start_cluster = bit;
		new_entry->group = block_group;
		new_entry->count = count_clusters;
		new_entry->t_tid = handle->h_transaction->t_tid;

		ext4_lock_group(sb, block_group);
		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
		ext4_mb_free_metadata(handle, &e4b, new_entry);
	} else {
		/* need to update group_info->bb_free and bitmap
		 * with the group lock held. generate_buddy looks at
		 * them with the group lock held
		 */
		ext4_lock_group(sb, block_group);
		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
		mb_free_blocks(inode, &e4b, bit, count_clusters);
	}

	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
	ext4_free_group_clusters_set(sb, gdp, ret);
	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
	ext4_unlock_group(sb, block_group);
	percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
		atomic_add(count_clusters,
			   &sbi->s_flex_groups[flex_group].free_clusters);
	}

	ext4_mb_unload_buddy(&e4b);

	freed += count;

	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
	if (!err)
		err = ret;

	if (overflow && !err) {
		block += count;
		count = overflow;
		put_bh(bitmap_bh);
		goto do_more;
	}
	ext4_mark_super_dirty(sb);
error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, err);
	return;
}
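/*
 * Context sketch (simplified, names illustrative): ext4_group_add_blocks()
 * below is the mballoc side of online resize; when the filesystem grows,
 * the resize code hands the freshly added physical range to mballoc so
 * the bitmap, buddy and free counters all learn about it, e.g.
 *
 *	err = ext4_group_add_blocks(handle, sb, first_new_block, added);
 */
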
/**
 * ext4_group_add_blocks() -- Add given blocks to an existing group
 * @handle: handle to this transaction
 * @sb: super block
 * @block: start physical block to add to the block group
 * @count: number of blocks to add
 *
 * This marks the blocks as free in the bitmap and buddy.
 */
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
			 ext4_fsblk_t block, unsigned long count)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gd_bh;
	ext4_group_t block_group;
	ext4_grpblk_t bit;
	unsigned int i;
	struct ext4_group_desc *desc;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_buddy e4b;
	int err = 0, ret, blk_free_count;
	ext4_grpblk_t blocks_freed;

	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

	if (count == 0)
		return 0;

	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
	/*
	 * Check to see if we are adding blocks across a group
	 * boundary.
	 */
	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
		ext4_warning(sb, "too many blocks added to group %u\n",
			     block_group);
		err = -EINVAL;
		goto error_return;
	}

	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (!bitmap_bh) {
		err = -EIO;
		goto error_return;
	}

	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
	if (!desc) {
		err = -EIO;
		goto error_return;
	}

	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
	    in_range(block + count - 1, ext4_inode_table(sb, desc),
		     sbi->s_itb_per_group)) {
		ext4_error(sb, "Adding blocks in system zones - "
			   "Block = %llu, count = %lu",
			   block, count);
		err = -EINVAL;
		goto error_return;
	}

	BUFFER_TRACE(bitmap_bh, "getting write access");
	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;

	for (i = 0, blocks_freed = 0; i < count; i++) {
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
			ext4_error(sb, "bit already cleared for block %llu",
				   (ext4_fsblk_t)(block + i));
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
			blocks_freed++;
		}
	}

	err = ext4_mb_load_buddy(sb, block_group, &e4b);
	if (err)
		goto error_return;

	/*
	 * need to update group_info->bb_free and bitmap
	 * with the group lock held. generate_buddy looks at
	 * them with the group lock held
	 */
	ext4_lock_group(sb, block_group);
	mb_clear_bits(bitmap_bh->b_data, bit, count);
	mb_free_blocks(NULL, &e4b, bit, count);
	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
	ext4_free_group_clusters_set(sb, desc, blk_free_count);
	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
	ext4_unlock_group(sb, block_group);
	percpu_counter_add(&sbi->s_freeclusters_counter,
			   EXT4_B2C(sbi, blocks_freed));

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
		atomic_add(EXT4_B2C(sbi, blocks_freed),
			   &sbi->s_flex_groups[flex_group].free_clusters);
	}

	ext4_mb_unload_buddy(&e4b);

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
	if (!err)
		err = ret;

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, err);
	return err;
}

/**
 * ext4_trim_extent -- function to TRIM a single free extent in the group
 * @sb: super block for the file system
 * @start: starting block of the free extent in the alloc. group
 * @count: number of blocks to TRIM
 * @group: alloc. group we are working with
 * @e4b: ext4 buddy for the group
 *
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark them as used in the buddy bitmap.
 * This must be called under the group lock.
 */
static void ext4_trim_extent(struct super_block *sb, int start, int count,
			     ext4_group_t group, struct ext4_buddy *e4b)
{
	struct ext4_free_extent ex;

	trace_ext4_trim_extent(sb, group, start, count);

	assert_spin_locked(ext4_group_lock_ptr(sb, group));

	ex.fe_start = start;
	ex.fe_group = group;
	ex.fe_len = count;

	/*
	 * Mark blocks used, so no one can reuse them while
	 * being trimmed.
	 */
	mb_mark_used(e4b, &ex);
	ext4_unlock_group(sb, group);
	ext4_issue_discard(sb, group, start, count);
	ext4_lock_group(sb, group);
	mb_free_blocks(NULL, e4b, start, ex.fe_len);
}

/**
 * ext4_trim_all_free -- function to trim all free space in alloc. group
 * @sb: super block for file system
 * @group: group to be trimmed
 * @start: first group block to examine
 * @max: last group block to examine
 * @minblocks: minimum extent block count
 *
 * ext4_trim_all_free walks through the group's buddy bitmap searching for
 * free extents. When a free extent of at least minblocks is found, it is
 * marked as used in the group buddy bitmap, a TRIM command is issued for
 * it via ext4_trim_extent, and it is then freed back to the group buddy
 * bitmap. This is done until the whole group is scanned.
 */
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
		   ext4_grpblk_t start, ext4_grpblk_t max,
		   ext4_grpblk_t minblocks)
{
	void *bitmap;
	ext4_grpblk_t next, count = 0, free_count = 0;
	struct ext4_buddy e4b;
	int ret;

	trace_ext4_trim_all_free(sb, group, start, max);

	ret = ext4_mb_load_buddy(sb, group, &e4b);
	if (ret) {
		ext4_error(sb, "Error in loading buddy "
				"information for %u", group);
		return ret;
	}
	bitmap = e4b.bd_bitmap;

	ext4_lock_group(sb, group);
	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
		goto out;

	start = (e4b.bd_info->bb_first_free > start) ?
		e4b.bd_info->bb_first_free : start;

	while (start < max) {
		start = mb_find_next_zero_bit(bitmap, max, start);
		if (start >= max)
			break;
		next = mb_find_next_bit(bitmap, max, start);

		if ((next - start) >= minblocks) {
			ext4_trim_extent(sb, start,
					 next - start, group, &e4b);
			count += next - start;
		}
		free_count += next - start;
		start = next + 1;

		if (fatal_signal_pending(current)) {
			count = -ERESTARTSYS;
			break;
		}

		if (need_resched()) {
			ext4_unlock_group(sb, group);
			cond_resched();
			ext4_lock_group(sb, group);
		}

		if ((e4b.bd_info->bb_free - free_count) < minblocks)
			break;
	}

	if (!ret)
		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
out:
	ext4_unlock_group(sb, group);
	ext4_mb_unload_buddy(&e4b);

	ext4_debug("trimmed %d blocks in the group %d\n",
		count, group);

	return count;
}
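/*
 * Userspace-side sketch (illustrative, error handling omitted): the
 * ext4_trim_fs() handler below services the FITRIM ioctl, which is how
 * tools such as fstrim drive it. All values are in bytes:
 *
 *	struct fstrim_range range = {
 *		.start  = 0,
 *		.len    = ULLONG_MAX,	// whole filesystem
 *		.minlen = 4096,		// skip extents smaller than this
 *	};
 *	ioctl(fd, FITRIM, &range);
 *	// on return, range.len holds the number of bytes trimmed
 */
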
/**
 * ext4_trim_fs() -- trim ioctl handler function
 * @sb: superblock for filesystem
 * @range: fstrim_range structure
 *
 * start: first byte to trim
 * len: number of bytes to trim from start
 * minlen: minimum extent length in bytes
 *
 * ext4_trim_fs goes through all allocation groups containing bytes from
 * start to start+len. For each such group the ext4_trim_all_free function
 * is invoked to trim all free space.
 */
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
	struct ext4_group_info *grp;
	ext4_group_t first_group, last_group;
	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
	uint64_t start, len, minlen, trimmed = 0;
	ext4_fsblk_t first_data_blk =
			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
	int ret = 0;

	start = range->start >> sb->s_blocksize_bits;
	len = range->len >> sb->s_blocksize_bits;
	minlen = range->minlen >> sb->s_blocksize_bits;

	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
		return -EINVAL;
	if (start + len <= first_data_blk)
		goto out;
	if (start < first_data_blk) {
		len -= first_data_blk - start;
		start = first_data_blk;
	}

	/* Determine first and last group to examine based on start and len */
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
				     &first_group, &first_cluster);
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
				     &last_group, &last_cluster);
	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
	last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);

	if (first_group > last_group)
		return -EINVAL;

	for (group = first_group; group <= last_group; group++) {
		grp = ext4_get_group_info(sb, group);
		/* We only do this if the grp has never been initialized */
		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
			ret = ext4_mb_init_group(sb, group);
			if (ret)
				break;
		}

		/*
		 * For all the groups except the last one, the last cluster
		 * will always be EXT4_CLUSTERS_PER_GROUP(sb), so we only
		 * need to change it for the last group, in which case
		 * first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb).
		 */
		if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
			last_cluster = first_cluster + len;
		len -= last_cluster - first_cluster;

		if (grp->bb_free >= minlen) {
			cnt = ext4_trim_all_free(sb, group, first_cluster,
						last_cluster, minlen);
			if (cnt < 0) {
				ret = cnt;
				break;
			}
		}
		trimmed += cnt;
		first_cluster = 0;
	}
	range->len = trimmed * sb->s_blocksize;

	if (!ret)
		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);

out:
	return ret;
}