/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>

#ifdef CONFIG_EXT4_DEBUG
ushort ext4_mballoc_debug __read_mostly;

module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
#endif

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * An allocation request asks for multiple blocks near the specified goal
 * block.
 *
 * During the initialization phase of the allocator we decide to use group
 * preallocation or inode preallocation depending on the size of the file.
 * The size of the file could be the resulting file size we would have
 * after allocation, or the current file size, whichever is larger.  If the
 * size is less than sbi->s_mb_stream_request we select group
 * preallocation.  The default value of s_mb_stream_request is 16 blocks.
 * This can also be tuned via /sys/fs/ext4/<partition>/mb_stream_req.  The
 * value is represented in terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is to
 * keep small files close together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode.  An inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   -> free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is used looking at the _logical_ start
 * block.  We consume a particular prealloc space only if the logical file
 * block falls within its range.  This makes sure that we have contiguous
 * physical blocks representing the file blocks.
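 *
 * As a rough illustration only (a minimal sketch with hypothetical names
 * and simplified types; the real lookup also has to take the PA lock and
 * check the deleted flag and pa_free), mapping a logical block through an
 * inode PA looks like:
 *
 *	struct pa { u64 pa_lstart, pa_pstart; unsigned pa_len, pa_free; };
 *
 *	static int pa_map(const struct pa *pa, u64 lblk, u64 *pblk)
 *	{
 *		if (lblk < pa->pa_lstart || lblk >= pa->pa_lstart + pa->pa_len)
 *			return 0;	// logical block not covered by this PA
 *		*pblk = pa->pa_pstart + (lblk - pa->pa_lstart);
 *		return 1;		// caller accounts the use in pa_free
 *	}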
 *
 * The important thing to note about inode prealloc space is that we don't
 * modify the values associated with it, except for pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and we
 * have the group allocation flag set, then we look at the locality group
 * prealloc space.  This is a per-CPU prealloc list, represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the
 * contention between CPUs.  It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc and/or locality group
 * prealloc then we look at the buddy cache.  The buddy cache is
 * represented by ext4_sb_info.s_buddy_cache (struct inode) whose file
 * offsets get mapped to the buddy and bitmap information of the different
 * groups.  The buddy information is attached to the buddy cache inode so
 * that we can access it through the page cache.  The information regarding
 * each group is loaded via ext4_mb_load_buddy.  The information involves
 * the block bitmap and the buddy information, stored in the inode as:
 *
 *  { page }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group we
 * take up 2 blocks.  A page can contain blocks_per_page (PAGE_CACHE_SIZE /
 * blocksize) blocks.  So it can hold information regarding
 * groups_per_page, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk.  The inode is thrown
 * away when the filesystem is unmounted.
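 *
 * A minimal sketch of that layout arithmetic (using the variable names of
 * the helpers later in this file; see ext4_mb_load_buddy):
 *
 *	// each group owns two consecutive blocks in the buddy cache inode:
 *	// block 2*g is the bitmap of group g, block 2*g + 1 is its buddy
 *	int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 *	int block = group * 2;			// bitmap block of 'group'
 *	int pnum  = block / blocks_per_page;	// page index in s_buddy_cache
 *	int poff  = block % blocks_per_page;	// block offset inside the page
 *	// e.g. 4k pages and 1k blocks: blocks_per_page == 4, so page 0
 *	// holds [g0 bitmap][g0 buddy][g1 bitmap][g1 buddy]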
 *
 * We look for count number of blocks in the buddy cache.  If we are able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request.
 * This ensures we ask for more blocks than we actually need.  The extra
 * blocks we get after allocation are added to the respective prealloc
 * list.  In case of inode preallocation we follow a list of heuristics
 * based on file size.  This can be found in ext4_mb_normalize_request.  If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks.  This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc.  The value is represented in
 * terms of number of blocks.  If we have mounted the file system with the
 * -o stripe=<value> option, the group prealloc request is normalized to
 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses the buddy scan only if the request length is
 * a power of 2 blocks and the order of the allocation is >=
 * sbi->s_mb_order2_reqs.  The value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request length is equal
 * to the stripe size (sbi->s_stripe), we search for contiguous blocks of
 * stripe size.  This should result in better allocation on RAID setups.
 * If not, we search in the specific group using the bitmap for the best
 * extents.  The tunables min_to_scan and max_to_scan control the behaviour
 * here.  min_to_scan indicates how long mballoc __must__ look for the best
 * extent and max_to_scan indicates how long mballoc __can__ look for the
 * best extent among the found extents.  Searching for blocks starts with
 * the group specified as the goal value in the allocation context via
 * ac_g_ex.  Each group is first checked against criteria deciding whether
 * it can be used for allocation.  ext4_mb_good_group explains how the
 * groups are checked.
 *
 * Both prealloc spaces are populated as described above.  So for the first
 * request we will hit the buddy cache, which will result in this prealloc
 * space getting filled.  The prealloc space is then later used for
 * subsequent requests.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to
 *    a permanent set of inodes: an inode can join and leave a group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *	in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in the mballoc world means that at any time a block is
 * either free or used in ALL structures. notice: "any time" should not be
 * read literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count numbers
 * of blocks: how many blocks are marked used/free in the on-disk bitmap,
 * the buddy and the PAs.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *
 * note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *	 is used in the real operation because we can't know the actual
 *	 used bits from the PA, only from the on-disk bitmap
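 *
 * a toy model of this bookkeeping (counts only, treating all PAs as one
 * aggregate counter; hypothetical names, just to make the invariant
 * explicit -- the real discard derives its amount from the on-disk bitmap
 * as noted above):
 *
 *	struct counts { int ondisk, pa, buddy; };	// blocks marked used
 *
 *	// invariant after every operation: c->buddy == c->ondisk + c->pa
 *	static void init_buddy(struct counts *c)   { c->buddy = c->ondisk + c->pa; }
 *	static void new_pa(struct counts *c, int n) { c->buddy += n; c->pa += n; }
 *	static void use_pa(struct counts *c, int n) { c->ondisk += n; c->pa -= n; }
 *	static void discard_pa(struct counts *c)   { c->buddy -= c->pa; c->pa = 0; }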
 *
 * if we follow this strict logic, then all operations above should be
 * atomic. given that some of them can block, we'd have to use something
 * like semaphores, killing performance on high-end SMP hardware. let's try
 * to relax this using the following knowledge:
 *  1) if the buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 *     bitmap has a bit set and a PA claims the same block, that's OK. IOW,
 *     one can set a bit in the on-disk bitmap if the buddy has the same
 *     bit set and/or a PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for the PA are allocated in the buddy, so the buddy must be
 *      referenced until the PA is linked to the allocation group to avoid
 *      concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either the on-disk bitmap or the PA has
 *      uptodate data. given (3) we care that the PA-=N operation doesn't
 *      interfere with init
 *    - discard inode PA
 *      the simplest way would be to have the buddy initialized by the
 *      discard
 *    - use locality group PA
 *      again, PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have the buddy initialized by the
 *      discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different
 *      blocks
 *    - discard locality group PA
 *      the discard process must wait until the PA isn't used by another
 *      process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA stays referenced until its blocks are marked in the on-disk
 *    bitmap
 *  - a PA changes only after the on-disk bitmap
 *  - discard must not compete with init: either init is done before
 *    any discard, or they're serialized somehow
 *  - buddy init as the sum of the on-disk bitmap and PAs is done
 *    atomically
 *
 * a special case is when we've used a PA to emptiness:
no need to modify buddy 276 * in this case, but we should care about concurrent init 277 * 278 */ 279 280 /* 281 * Logic in few words: 282 * 283 * - allocation: 284 * load group 285 * find blocks 286 * mark bits in on-disk bitmap 287 * release group 288 * 289 * - use preallocation: 290 * find proper PA (per-inode or group) 291 * load group 292 * mark bits in on-disk bitmap 293 * release group 294 * release PA 295 * 296 * - free: 297 * load group 298 * mark bits in on-disk bitmap 299 * release group 300 * 301 * - discard preallocations in group: 302 * mark PAs deleted 303 * move them onto local list 304 * load on-disk bitmap 305 * load group 306 * remove PA from object (inode or locality group) 307 * mark free blocks in-core 308 * 309 * - discard inode's preallocations: 310 */ 311 312 /* 313 * Locking rules 314 * 315 * Locks: 316 * - bitlock on a group (group) 317 * - object (inode/locality) (object) 318 * - per-pa lock (pa) 319 * 320 * Paths: 321 * - new pa 322 * object 323 * group 324 * 325 * - find and use pa: 326 * pa 327 * 328 * - release consumed pa: 329 * pa 330 * group 331 * object 332 * 333 * - generate in-core bitmap: 334 * group 335 * pa 336 * 337 * - discard all for given object (inode, locality group): 338 * object 339 * pa 340 * group 341 * 342 * - discard all for given group: 343 * group 344 * pa 345 * group 346 * object 347 * 348 */ 349 static struct kmem_cache *ext4_pspace_cachep; 350 static struct kmem_cache *ext4_ac_cachep; 351 static struct kmem_cache *ext4_free_data_cachep; 352 353 /* We create slab caches for groupinfo data structures based on the 354 * superblock block size. There will be one per mounted filesystem for 355 * each unique s_blocksize_bits */ 356 #define NR_GRPINFO_CACHES 8 357 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 358 359 static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { 360 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", 361 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", 362 "ext4_groupinfo_64k", "ext4_groupinfo_128k" 363 }; 364 365 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 366 ext4_group_t group); 367 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 368 ext4_group_t group); 369 static void ext4_free_data_callback(struct super_block *sb, 370 struct ext4_journal_cb_entry *jce, int rc); 371 372 static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 373 { 374 #if BITS_PER_LONG == 64 375 *bit += ((unsigned long) addr & 7UL) << 3; 376 addr = (void *) ((unsigned long) addr & ~7UL); 377 #elif BITS_PER_LONG == 32 378 *bit += ((unsigned long) addr & 3UL) << 3; 379 addr = (void *) ((unsigned long) addr & ~3UL); 380 #else 381 #error "how many bits you are?!" 
382 #endif 383 return addr; 384 } 385 386 static inline int mb_test_bit(int bit, void *addr) 387 { 388 /* 389 * ext4_test_bit on architecture like powerpc 390 * needs unsigned long aligned address 391 */ 392 addr = mb_correct_addr_and_bit(&bit, addr); 393 return ext4_test_bit(bit, addr); 394 } 395 396 static inline void mb_set_bit(int bit, void *addr) 397 { 398 addr = mb_correct_addr_and_bit(&bit, addr); 399 ext4_set_bit(bit, addr); 400 } 401 402 static inline void mb_clear_bit(int bit, void *addr) 403 { 404 addr = mb_correct_addr_and_bit(&bit, addr); 405 ext4_clear_bit(bit, addr); 406 } 407 408 static inline int mb_test_and_clear_bit(int bit, void *addr) 409 { 410 addr = mb_correct_addr_and_bit(&bit, addr); 411 return ext4_test_and_clear_bit(bit, addr); 412 } 413 414 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 415 { 416 int fix = 0, ret, tmpmax; 417 addr = mb_correct_addr_and_bit(&fix, addr); 418 tmpmax = max + fix; 419 start += fix; 420 421 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 422 if (ret > max) 423 return max; 424 return ret; 425 } 426 427 static inline int mb_find_next_bit(void *addr, int max, int start) 428 { 429 int fix = 0, ret, tmpmax; 430 addr = mb_correct_addr_and_bit(&fix, addr); 431 tmpmax = max + fix; 432 start += fix; 433 434 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 435 if (ret > max) 436 return max; 437 return ret; 438 } 439 440 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 441 { 442 char *bb; 443 444 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 445 BUG_ON(max == NULL); 446 447 if (order > e4b->bd_blkbits + 1) { 448 *max = 0; 449 return NULL; 450 } 451 452 /* at order 0 we see each particular block */ 453 if (order == 0) { 454 *max = 1 << (e4b->bd_blkbits + 3); 455 return e4b->bd_bitmap; 456 } 457 458 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 459 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 460 461 return bb; 462 } 463 464 #ifdef DOUBLE_CHECK 465 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 466 int first, int count) 467 { 468 int i; 469 struct super_block *sb = e4b->bd_sb; 470 471 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 472 return; 473 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 474 for (i = 0; i < count; i++) { 475 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 476 ext4_fsblk_t blocknr; 477 478 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 479 blocknr += EXT4_C2B(EXT4_SB(sb), first + i); 480 ext4_grp_locked_error(sb, e4b->bd_group, 481 inode ? 
inode->i_ino : 0, 482 blocknr, 483 "freeing block already freed " 484 "(bit %u)", 485 first + i); 486 } 487 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 488 } 489 } 490 491 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 492 { 493 int i; 494 495 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 496 return; 497 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 498 for (i = 0; i < count; i++) { 499 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 500 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 501 } 502 } 503 504 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 505 { 506 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 507 unsigned char *b1, *b2; 508 int i; 509 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 510 b2 = (unsigned char *) bitmap; 511 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 512 if (b1[i] != b2[i]) { 513 ext4_msg(e4b->bd_sb, KERN_ERR, 514 "corruption in group %u " 515 "at byte %u(%u): %x in copy != %x " 516 "on disk/prealloc", 517 e4b->bd_group, i, i * 8, b1[i], b2[i]); 518 BUG(); 519 } 520 } 521 } 522 } 523 524 #else 525 static inline void mb_free_blocks_double(struct inode *inode, 526 struct ext4_buddy *e4b, int first, int count) 527 { 528 return; 529 } 530 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 531 int first, int count) 532 { 533 return; 534 } 535 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 536 { 537 return; 538 } 539 #endif 540 541 #ifdef AGGRESSIVE_CHECK 542 543 #define MB_CHECK_ASSERT(assert) \ 544 do { \ 545 if (!(assert)) { \ 546 printk(KERN_EMERG \ 547 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 548 function, file, line, # assert); \ 549 BUG(); \ 550 } \ 551 } while (0) 552 553 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 554 const char *function, int line) 555 { 556 struct super_block *sb = e4b->bd_sb; 557 int order = e4b->bd_blkbits + 1; 558 int max; 559 int max2; 560 int i; 561 int j; 562 int k; 563 int count; 564 struct ext4_group_info *grp; 565 int fragments = 0; 566 int fstart; 567 struct list_head *cur; 568 void *buddy; 569 void *buddy2; 570 571 { 572 static int mb_check_counter; 573 if (mb_check_counter++ % 100 != 0) 574 return 0; 575 } 576 577 while (order > 1) { 578 buddy = mb_find_buddy(e4b, order, &max); 579 MB_CHECK_ASSERT(buddy); 580 buddy2 = mb_find_buddy(e4b, order - 1, &max2); 581 MB_CHECK_ASSERT(buddy2); 582 MB_CHECK_ASSERT(buddy != buddy2); 583 MB_CHECK_ASSERT(max * 2 == max2); 584 585 count = 0; 586 for (i = 0; i < max; i++) { 587 588 if (mb_test_bit(i, buddy)) { 589 /* only single bit in buddy2 may be 1 */ 590 if (!mb_test_bit(i << 1, buddy2)) { 591 MB_CHECK_ASSERT( 592 mb_test_bit((i<<1)+1, buddy2)); 593 } else if (!mb_test_bit((i << 1) + 1, buddy2)) { 594 MB_CHECK_ASSERT( 595 mb_test_bit(i << 1, buddy2)); 596 } 597 continue; 598 } 599 600 /* both bits in buddy2 must be 1 */ 601 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 602 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 603 604 for (j = 0; j < (1 << order); j++) { 605 k = (i * (1 << order)) + j; 606 MB_CHECK_ASSERT( 607 !mb_test_bit(k, e4b->bd_bitmap)); 608 } 609 count++; 610 } 611 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 612 order--; 613 } 614 615 fstart = -1; 616 buddy = mb_find_buddy(e4b, 0, &max); 617 for (i = 0; i < max; i++) { 618 if (!mb_test_bit(i, buddy)) { 619 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 620 if (fstart == -1) { 621 fragments++; 622 fstart = i; 623 } 624 
continue; 625 } 626 fstart = -1; 627 /* check used bits only */ 628 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 629 buddy2 = mb_find_buddy(e4b, j, &max2); 630 k = i >> j; 631 MB_CHECK_ASSERT(k < max2); 632 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 633 } 634 } 635 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 636 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 637 638 grp = ext4_get_group_info(sb, e4b->bd_group); 639 list_for_each(cur, &grp->bb_prealloc_list) { 640 ext4_group_t groupnr; 641 struct ext4_prealloc_space *pa; 642 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 643 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 644 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 645 for (i = 0; i < pa->pa_len; i++) 646 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 647 } 648 return 0; 649 } 650 #undef MB_CHECK_ASSERT 651 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 652 __FILE__, __func__, __LINE__) 653 #else 654 #define mb_check_buddy(e4b) 655 #endif 656 657 /* 658 * Divide blocks started from @first with length @len into 659 * smaller chunks with power of 2 blocks. 660 * Clear the bits in bitmap which the blocks of the chunk(s) covered, 661 * then increase bb_counters[] for corresponded chunk size. 662 */ 663 static void ext4_mb_mark_free_simple(struct super_block *sb, 664 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 665 struct ext4_group_info *grp) 666 { 667 struct ext4_sb_info *sbi = EXT4_SB(sb); 668 ext4_grpblk_t min; 669 ext4_grpblk_t max; 670 ext4_grpblk_t chunk; 671 unsigned short border; 672 673 BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 674 675 border = 2 << sb->s_blocksize_bits; 676 677 while (len > 0) { 678 /* find how many blocks can be covered since this position */ 679 max = ffs(first | border) - 1; 680 681 /* find how many blocks of power 2 we need to mark */ 682 min = fls(len) - 1; 683 684 if (max < min) 685 min = max; 686 chunk = 1 << min; 687 688 /* mark multiblock chunks only */ 689 grp->bb_counters[min]++; 690 if (min > 0) 691 mb_clear_bit(first >> min, 692 buddy + sbi->s_mb_offsets[min]); 693 694 len -= chunk; 695 first += chunk; 696 } 697 } 698 699 /* 700 * Cache the order of the largest free extent we have available in this block 701 * group. 
702 */ 703 static void 704 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 705 { 706 int i; 707 int bits; 708 709 grp->bb_largest_free_order = -1; /* uninit */ 710 711 bits = sb->s_blocksize_bits + 1; 712 for (i = bits; i >= 0; i--) { 713 if (grp->bb_counters[i] > 0) { 714 grp->bb_largest_free_order = i; 715 break; 716 } 717 } 718 } 719 720 static noinline_for_stack 721 void ext4_mb_generate_buddy(struct super_block *sb, 722 void *buddy, void *bitmap, ext4_group_t group) 723 { 724 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 725 ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 726 ext4_grpblk_t i = 0; 727 ext4_grpblk_t first; 728 ext4_grpblk_t len; 729 unsigned free = 0; 730 unsigned fragments = 0; 731 unsigned long long period = get_cycles(); 732 733 /* initialize buddy from bitmap which is aggregation 734 * of on-disk bitmap and preallocations */ 735 i = mb_find_next_zero_bit(bitmap, max, 0); 736 grp->bb_first_free = i; 737 while (i < max) { 738 fragments++; 739 first = i; 740 i = mb_find_next_bit(bitmap, max, i); 741 len = i - first; 742 free += len; 743 if (len > 1) 744 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 745 else 746 grp->bb_counters[0]++; 747 if (i < max) 748 i = mb_find_next_zero_bit(bitmap, max, i); 749 } 750 grp->bb_fragments = fragments; 751 752 if (free != grp->bb_free) { 753 ext4_grp_locked_error(sb, group, 0, 0, 754 "%u clusters in bitmap, %u in gd; " 755 "block bitmap corrupt.", 756 free, grp->bb_free); 757 /* 758 * If we intend to continue, we consider group descriptor 759 * corrupt and update bb_free using bitmap value 760 */ 761 grp->bb_free = free; 762 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); 763 } 764 mb_set_largest_free_order(sb, grp); 765 766 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 767 768 period = get_cycles() - period; 769 spin_lock(&EXT4_SB(sb)->s_bal_lock); 770 EXT4_SB(sb)->s_mb_buddies_generated++; 771 EXT4_SB(sb)->s_mb_generation_time += period; 772 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 773 } 774 775 static void mb_regenerate_buddy(struct ext4_buddy *e4b) 776 { 777 int count; 778 int order = 1; 779 void *buddy; 780 781 while ((buddy = mb_find_buddy(e4b, order++, &count))) { 782 ext4_set_bits(buddy, 0, count); 783 } 784 e4b->bd_info->bb_fragments = 0; 785 memset(e4b->bd_info->bb_counters, 0, 786 sizeof(*e4b->bd_info->bb_counters) * 787 (e4b->bd_sb->s_blocksize_bits + 2)); 788 789 ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, 790 e4b->bd_bitmap, e4b->bd_group); 791 } 792 793 /* The buddy information is attached the buddy cache inode 794 * for convenience. The information regarding each group 795 * is loaded via ext4_mb_load_buddy. The information involve 796 * block bitmap and buddy information. The information are 797 * stored in the inode as 798 * 799 * { page } 800 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 801 * 802 * 803 * one block each for bitmap and buddy information. 804 * So for each group we take up 2 blocks. A page can 805 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 806 * So it can have information regarding groups_per_page which 807 * is blocks_per_page/2 808 * 809 * Locking note: This routine takes the block group lock of all groups 810 * for this page; do not hold this lock when calling this routine! 
811 */ 812 813 static int ext4_mb_init_cache(struct page *page, char *incore) 814 { 815 ext4_group_t ngroups; 816 int blocksize; 817 int blocks_per_page; 818 int groups_per_page; 819 int err = 0; 820 int i; 821 ext4_group_t first_group, group; 822 int first_block; 823 struct super_block *sb; 824 struct buffer_head *bhs; 825 struct buffer_head **bh = NULL; 826 struct inode *inode; 827 char *data; 828 char *bitmap; 829 struct ext4_group_info *grinfo; 830 831 mb_debug(1, "init page %lu\n", page->index); 832 833 inode = page->mapping->host; 834 sb = inode->i_sb; 835 ngroups = ext4_get_groups_count(sb); 836 blocksize = 1 << inode->i_blkbits; 837 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 838 839 groups_per_page = blocks_per_page >> 1; 840 if (groups_per_page == 0) 841 groups_per_page = 1; 842 843 /* allocate buffer_heads to read bitmaps */ 844 if (groups_per_page > 1) { 845 i = sizeof(struct buffer_head *) * groups_per_page; 846 bh = kzalloc(i, GFP_NOFS); 847 if (bh == NULL) { 848 err = -ENOMEM; 849 goto out; 850 } 851 } else 852 bh = &bhs; 853 854 first_group = page->index * blocks_per_page / 2; 855 856 /* read all groups the page covers into the cache */ 857 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 858 if (group >= ngroups) 859 break; 860 861 grinfo = ext4_get_group_info(sb, group); 862 /* 863 * If page is uptodate then we came here after online resize 864 * which added some new uninitialized group info structs, so 865 * we must skip all initialized uptodate buddies on the page, 866 * which may be currently in use by an allocating task. 867 */ 868 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { 869 bh[i] = NULL; 870 continue; 871 } 872 if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { 873 err = -ENOMEM; 874 goto out; 875 } 876 mb_debug(1, "read bitmap for group %u\n", group); 877 } 878 879 /* wait for I/O completion */ 880 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 881 if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { 882 err = -EIO; 883 goto out; 884 } 885 } 886 887 first_block = page->index * blocks_per_page; 888 for (i = 0; i < blocks_per_page; i++) { 889 group = (first_block + i) >> 1; 890 if (group >= ngroups) 891 break; 892 893 if (!bh[group - first_group]) 894 /* skip initialized uptodate buddy */ 895 continue; 896 897 /* 898 * data carry information regarding this 899 * particular group in the format specified 900 * above 901 * 902 */ 903 data = page_address(page) + (i * blocksize); 904 bitmap = bh[group - first_group]->b_data; 905 906 /* 907 * We place the buddy block and bitmap block 908 * close together 909 */ 910 if ((first_block + i) & 1) { 911 /* this is block of buddy */ 912 BUG_ON(incore == NULL); 913 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 914 group, page->index, i * blocksize); 915 trace_ext4_mb_buddy_bitmap_load(sb, group); 916 grinfo = ext4_get_group_info(sb, group); 917 grinfo->bb_fragments = 0; 918 memset(grinfo->bb_counters, 0, 919 sizeof(*grinfo->bb_counters) * 920 (sb->s_blocksize_bits+2)); 921 /* 922 * incore got set to the group block bitmap below 923 */ 924 ext4_lock_group(sb, group); 925 /* init the buddy */ 926 memset(data, 0xff, blocksize); 927 ext4_mb_generate_buddy(sb, data, incore, group); 928 ext4_unlock_group(sb, group); 929 incore = NULL; 930 } else { 931 /* this is block of bitmap */ 932 BUG_ON(incore != NULL); 933 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 934 group, page->index, i * blocksize); 935 trace_ext4_mb_bitmap_load(sb, group); 936 
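
			/*
			 * What follows builds the in-core bitmap for this
			 * group: start from the on-disk bitmap, then overlay
			 * the preallocated blocks (ext4_mb_generate_from_pa)
			 * and the blocks still sitting on the group's
			 * freelist (ext4_mb_generate_from_freelist) as in
			 * use.  This is the "in-core buddy = on-disk bitmap
			 * + preallocation descriptors" relation described at
			 * the top of this file.
			 */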
937 /* see comments in ext4_mb_put_pa() */ 938 ext4_lock_group(sb, group); 939 memcpy(data, bitmap, blocksize); 940 941 /* mark all preallocated blks used in in-core bitmap */ 942 ext4_mb_generate_from_pa(sb, data, group); 943 ext4_mb_generate_from_freelist(sb, data, group); 944 ext4_unlock_group(sb, group); 945 946 /* set incore so that the buddy information can be 947 * generated using this 948 */ 949 incore = data; 950 } 951 } 952 SetPageUptodate(page); 953 954 out: 955 if (bh) { 956 for (i = 0; i < groups_per_page; i++) 957 brelse(bh[i]); 958 if (bh != &bhs) 959 kfree(bh); 960 } 961 return err; 962 } 963 964 /* 965 * Lock the buddy and bitmap pages. This make sure other parallel init_group 966 * on the same buddy page doesn't happen whild holding the buddy page lock. 967 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap 968 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 969 */ 970 static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 971 ext4_group_t group, struct ext4_buddy *e4b) 972 { 973 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 974 int block, pnum, poff; 975 int blocks_per_page; 976 struct page *page; 977 978 e4b->bd_buddy_page = NULL; 979 e4b->bd_bitmap_page = NULL; 980 981 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 982 /* 983 * the buddy cache inode stores the block bitmap 984 * and buddy information in consecutive blocks. 985 * So for each group we need two blocks. 986 */ 987 block = group * 2; 988 pnum = block / blocks_per_page; 989 poff = block % blocks_per_page; 990 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 991 if (!page) 992 return -EIO; 993 BUG_ON(page->mapping != inode->i_mapping); 994 e4b->bd_bitmap_page = page; 995 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 996 997 if (blocks_per_page >= 2) { 998 /* buddy and bitmap are on the same page */ 999 return 0; 1000 } 1001 1002 block++; 1003 pnum = block / blocks_per_page; 1004 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1005 if (!page) 1006 return -EIO; 1007 BUG_ON(page->mapping != inode->i_mapping); 1008 e4b->bd_buddy_page = page; 1009 return 0; 1010 } 1011 1012 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1013 { 1014 if (e4b->bd_bitmap_page) { 1015 unlock_page(e4b->bd_bitmap_page); 1016 page_cache_release(e4b->bd_bitmap_page); 1017 } 1018 if (e4b->bd_buddy_page) { 1019 unlock_page(e4b->bd_buddy_page); 1020 page_cache_release(e4b->bd_buddy_page); 1021 } 1022 } 1023 1024 /* 1025 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1026 * block group lock of all groups for this page; do not hold the BG lock when 1027 * calling this routine! 1028 */ 1029 static noinline_for_stack 1030 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1031 { 1032 1033 struct ext4_group_info *this_grp; 1034 struct ext4_buddy e4b; 1035 struct page *page; 1036 int ret = 0; 1037 1038 might_sleep(); 1039 mb_debug(1, "init group %u\n", group); 1040 this_grp = ext4_get_group_info(sb, group); 1041 /* 1042 * This ensures that we don't reinit the buddy cache 1043 * page which map to the group from which we are already 1044 * allocating. If we are looking at the buddy cache we would 1045 * have taken a reference using ext4_mb_load_buddy and that 1046 * would have pinned buddy page to page cache. 
1047 */ 1048 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); 1049 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1050 /* 1051 * somebody initialized the group 1052 * return without doing anything 1053 */ 1054 goto err; 1055 } 1056 1057 page = e4b.bd_bitmap_page; 1058 ret = ext4_mb_init_cache(page, NULL); 1059 if (ret) 1060 goto err; 1061 if (!PageUptodate(page)) { 1062 ret = -EIO; 1063 goto err; 1064 } 1065 mark_page_accessed(page); 1066 1067 if (e4b.bd_buddy_page == NULL) { 1068 /* 1069 * If both the bitmap and buddy are in 1070 * the same page we don't need to force 1071 * init the buddy 1072 */ 1073 ret = 0; 1074 goto err; 1075 } 1076 /* init buddy cache */ 1077 page = e4b.bd_buddy_page; 1078 ret = ext4_mb_init_cache(page, e4b.bd_bitmap); 1079 if (ret) 1080 goto err; 1081 if (!PageUptodate(page)) { 1082 ret = -EIO; 1083 goto err; 1084 } 1085 mark_page_accessed(page); 1086 err: 1087 ext4_mb_put_buddy_page_lock(&e4b); 1088 return ret; 1089 } 1090 1091 /* 1092 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1093 * block group lock of all groups for this page; do not hold the BG lock when 1094 * calling this routine! 1095 */ 1096 static noinline_for_stack int 1097 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1098 struct ext4_buddy *e4b) 1099 { 1100 int blocks_per_page; 1101 int block; 1102 int pnum; 1103 int poff; 1104 struct page *page; 1105 int ret; 1106 struct ext4_group_info *grp; 1107 struct ext4_sb_info *sbi = EXT4_SB(sb); 1108 struct inode *inode = sbi->s_buddy_cache; 1109 1110 might_sleep(); 1111 mb_debug(1, "load group %u\n", group); 1112 1113 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1114 grp = ext4_get_group_info(sb, group); 1115 1116 e4b->bd_blkbits = sb->s_blocksize_bits; 1117 e4b->bd_info = grp; 1118 e4b->bd_sb = sb; 1119 e4b->bd_group = group; 1120 e4b->bd_buddy_page = NULL; 1121 e4b->bd_bitmap_page = NULL; 1122 1123 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1124 /* 1125 * we need full data about the group 1126 * to make a good selection 1127 */ 1128 ret = ext4_mb_init_group(sb, group); 1129 if (ret) 1130 return ret; 1131 } 1132 1133 /* 1134 * the buddy cache inode stores the block bitmap 1135 * and buddy information in consecutive blocks. 1136 * So for each group we need two blocks. 1137 */ 1138 block = group * 2; 1139 pnum = block / blocks_per_page; 1140 poff = block % blocks_per_page; 1141 1142 /* we could use find_or_create_page(), but it locks page 1143 * what we'd like to avoid in fast path ... */ 1144 page = find_get_page(inode->i_mapping, pnum); 1145 if (page == NULL || !PageUptodate(page)) { 1146 if (page) 1147 /* 1148 * drop the page reference and try 1149 * to get the page with lock. If we 1150 * are not uptodate that implies 1151 * somebody just created the page but 1152 * is yet to initialize the same. So 1153 * wait for it to initialize. 
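 * (find_or_create_page() below takes the page lock, so once we get the
 * page back any concurrent initializer has finished; if the page is
 * still not uptodate at that point we initialize it ourselves.)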
1154 */ 1155 page_cache_release(page); 1156 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1157 if (page) { 1158 BUG_ON(page->mapping != inode->i_mapping); 1159 if (!PageUptodate(page)) { 1160 ret = ext4_mb_init_cache(page, NULL); 1161 if (ret) { 1162 unlock_page(page); 1163 goto err; 1164 } 1165 mb_cmp_bitmaps(e4b, page_address(page) + 1166 (poff * sb->s_blocksize)); 1167 } 1168 unlock_page(page); 1169 } 1170 } 1171 if (page == NULL || !PageUptodate(page)) { 1172 ret = -EIO; 1173 goto err; 1174 } 1175 e4b->bd_bitmap_page = page; 1176 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1177 mark_page_accessed(page); 1178 1179 block++; 1180 pnum = block / blocks_per_page; 1181 poff = block % blocks_per_page; 1182 1183 page = find_get_page(inode->i_mapping, pnum); 1184 if (page == NULL || !PageUptodate(page)) { 1185 if (page) 1186 page_cache_release(page); 1187 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1188 if (page) { 1189 BUG_ON(page->mapping != inode->i_mapping); 1190 if (!PageUptodate(page)) { 1191 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1192 if (ret) { 1193 unlock_page(page); 1194 goto err; 1195 } 1196 } 1197 unlock_page(page); 1198 } 1199 } 1200 if (page == NULL || !PageUptodate(page)) { 1201 ret = -EIO; 1202 goto err; 1203 } 1204 e4b->bd_buddy_page = page; 1205 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1206 mark_page_accessed(page); 1207 1208 BUG_ON(e4b->bd_bitmap_page == NULL); 1209 BUG_ON(e4b->bd_buddy_page == NULL); 1210 1211 return 0; 1212 1213 err: 1214 if (page) 1215 page_cache_release(page); 1216 if (e4b->bd_bitmap_page) 1217 page_cache_release(e4b->bd_bitmap_page); 1218 if (e4b->bd_buddy_page) 1219 page_cache_release(e4b->bd_buddy_page); 1220 e4b->bd_buddy = NULL; 1221 e4b->bd_bitmap = NULL; 1222 return ret; 1223 } 1224 1225 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1226 { 1227 if (e4b->bd_bitmap_page) 1228 page_cache_release(e4b->bd_bitmap_page); 1229 if (e4b->bd_buddy_page) 1230 page_cache_release(e4b->bd_buddy_page); 1231 } 1232 1233 1234 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1235 { 1236 int order = 1; 1237 void *bb; 1238 1239 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1240 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1241 1242 bb = e4b->bd_buddy; 1243 while (order <= e4b->bd_blkbits + 1) { 1244 block = block >> 1; 1245 if (!mb_test_bit(block, bb)) { 1246 /* this block is part of buddy of order 'order' */ 1247 return order; 1248 } 1249 bb += 1 << (e4b->bd_blkbits - order); 1250 order++; 1251 } 1252 return 0; 1253 } 1254 1255 static void mb_clear_bits(void *bm, int cur, int len) 1256 { 1257 __u32 *addr; 1258 1259 len = cur + len; 1260 while (cur < len) { 1261 if ((cur & 31) == 0 && (len - cur) >= 32) { 1262 /* fast path: clear whole word at once */ 1263 addr = bm + (cur >> 3); 1264 *addr = 0; 1265 cur += 32; 1266 continue; 1267 } 1268 mb_clear_bit(cur, bm); 1269 cur++; 1270 } 1271 } 1272 1273 /* clear bits in given range 1274 * will return first found zero bit if any, -1 otherwise 1275 */ 1276 static int mb_test_and_clear_bits(void *bm, int cur, int len) 1277 { 1278 __u32 *addr; 1279 int zero_bit = -1; 1280 1281 len = cur + len; 1282 while (cur < len) { 1283 if ((cur & 31) == 0 && (len - cur) >= 32) { 1284 /* fast path: clear whole word at once */ 1285 addr = bm + (cur >> 3); 1286 if (*addr != (__u32)(-1) && zero_bit == -1) 1287 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 1288 *addr = 0; 1289 cur += 32; 1290 continue; 1291 } 1292 if 
(!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) 1293 zero_bit = cur; 1294 cur++; 1295 } 1296 1297 return zero_bit; 1298 } 1299 1300 void ext4_set_bits(void *bm, int cur, int len) 1301 { 1302 __u32 *addr; 1303 1304 len = cur + len; 1305 while (cur < len) { 1306 if ((cur & 31) == 0 && (len - cur) >= 32) { 1307 /* fast path: set whole word at once */ 1308 addr = bm + (cur >> 3); 1309 *addr = 0xffffffff; 1310 cur += 32; 1311 continue; 1312 } 1313 mb_set_bit(cur, bm); 1314 cur++; 1315 } 1316 } 1317 1318 /* 1319 * _________________________________________________________________ */ 1320 1321 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) 1322 { 1323 if (mb_test_bit(*bit + side, bitmap)) { 1324 mb_clear_bit(*bit, bitmap); 1325 (*bit) -= side; 1326 return 1; 1327 } 1328 else { 1329 (*bit) += side; 1330 mb_set_bit(*bit, bitmap); 1331 return -1; 1332 } 1333 } 1334 1335 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) 1336 { 1337 int max; 1338 int order = 1; 1339 void *buddy = mb_find_buddy(e4b, order, &max); 1340 1341 while (buddy) { 1342 void *buddy2; 1343 1344 /* Bits in range [first; last] are known to be set since 1345 * corresponding blocks were allocated. Bits in range 1346 * (first; last) will stay set because they form buddies on 1347 * upper layer. We just deal with borders if they don't 1348 * align with upper layer and then go up. 1349 * Releasing entire group is all about clearing 1350 * single bit of highest order buddy. 1351 */ 1352 1353 /* Example: 1354 * --------------------------------- 1355 * | 1 | 1 | 1 | 1 | 1356 * --------------------------------- 1357 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1358 * --------------------------------- 1359 * 0 1 2 3 4 5 6 7 1360 * \_____________________/ 1361 * 1362 * Neither [1] nor [6] is aligned to above layer. 1363 * Left neighbour [0] is free, so mark it busy, 1364 * decrease bb_counters and extend range to 1365 * [0; 6] 1366 * Right neighbour [7] is busy. It can't be coaleasced with [6], so 1367 * mark [6] free, increase bb_counters and shrink range to 1368 * [0; 5]. 1369 * Then shift range to [0; 2], go up and do the same. 1370 */ 1371 1372 1373 if (first & 1) 1374 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 1375 if (!(last & 1)) 1376 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 1377 if (first > last) 1378 break; 1379 order++; 1380 1381 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { 1382 mb_clear_bits(buddy, first, last - first + 1); 1383 e4b->bd_info->bb_counters[order - 1] += last - first + 1; 1384 break; 1385 } 1386 first >>= 1; 1387 last >>= 1; 1388 buddy = buddy2; 1389 } 1390 } 1391 1392 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1393 int first, int count) 1394 { 1395 int left_is_free = 0; 1396 int right_is_free = 0; 1397 int block; 1398 int last = first + count - 1; 1399 struct super_block *sb = e4b->bd_sb; 1400 1401 BUG_ON(last >= (sb->s_blocksize << 3)); 1402 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1403 /* Don't bother if the block group is corrupt. 
*/ 1404 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) 1405 return; 1406 1407 mb_check_buddy(e4b); 1408 mb_free_blocks_double(inode, e4b, first, count); 1409 1410 e4b->bd_info->bb_free += count; 1411 if (first < e4b->bd_info->bb_first_free) 1412 e4b->bd_info->bb_first_free = first; 1413 1414 /* access memory sequentially: check left neighbour, 1415 * clear range and then check right neighbour 1416 */ 1417 if (first != 0) 1418 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 1419 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 1420 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 1421 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 1422 1423 if (unlikely(block != -1)) { 1424 ext4_fsblk_t blocknr; 1425 1426 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1427 blocknr += EXT4_C2B(EXT4_SB(sb), block); 1428 ext4_grp_locked_error(sb, e4b->bd_group, 1429 inode ? inode->i_ino : 0, 1430 blocknr, 1431 "freeing already freed block " 1432 "(bit %u); block bitmap corrupt.", 1433 block); 1434 /* Mark the block group as corrupt. */ 1435 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 1436 &e4b->bd_info->bb_state); 1437 mb_regenerate_buddy(e4b); 1438 goto done; 1439 } 1440 1441 /* let's maintain fragments counter */ 1442 if (left_is_free && right_is_free) 1443 e4b->bd_info->bb_fragments--; 1444 else if (!left_is_free && !right_is_free) 1445 e4b->bd_info->bb_fragments++; 1446 1447 /* buddy[0] == bd_bitmap is a special case, so handle 1448 * it right away and let mb_buddy_mark_free stay free of 1449 * zero order checks. 1450 * Check if neighbours are to be coaleasced, 1451 * adjust bitmap bb_counters and borders appropriately. 1452 */ 1453 if (first & 1) { 1454 first += !left_is_free; 1455 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; 1456 } 1457 if (!(last & 1)) { 1458 last -= !right_is_free; 1459 e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; 1460 } 1461 1462 if (first <= last) 1463 mb_buddy_mark_free(e4b, first >> 1, last >> 1); 1464 1465 done: 1466 mb_set_largest_free_order(sb, e4b->bd_info); 1467 mb_check_buddy(e4b); 1468 } 1469 1470 static int mb_find_extent(struct ext4_buddy *e4b, int block, 1471 int needed, struct ext4_free_extent *ex) 1472 { 1473 int next = block; 1474 int max, order; 1475 void *buddy; 1476 1477 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1478 BUG_ON(ex == NULL); 1479 1480 buddy = mb_find_buddy(e4b, 0, &max); 1481 BUG_ON(buddy == NULL); 1482 BUG_ON(block >= max); 1483 if (mb_test_bit(block, buddy)) { 1484 ex->fe_len = 0; 1485 ex->fe_start = 0; 1486 ex->fe_group = 0; 1487 return 0; 1488 } 1489 1490 /* find actual order */ 1491 order = mb_find_order_for_block(e4b, block); 1492 block = block >> order; 1493 1494 ex->fe_len = 1 << order; 1495 ex->fe_start = block << order; 1496 ex->fe_group = e4b->bd_group; 1497 1498 /* calc difference from given start */ 1499 next = next - ex->fe_start; 1500 ex->fe_len -= next; 1501 ex->fe_start += next; 1502 1503 while (needed > ex->fe_len && 1504 mb_find_buddy(e4b, order, &max)) { 1505 1506 if (block + 1 >= max) 1507 break; 1508 1509 next = (block + 1) * (1 << order); 1510 if (mb_test_bit(next, e4b->bd_bitmap)) 1511 break; 1512 1513 order = mb_find_order_for_block(e4b, next); 1514 1515 block = next >> order; 1516 ex->fe_len += 1 << order; 1517 } 1518 1519 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); 1520 return ex->fe_len; 1521 } 1522 1523 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1524 { 1525 int ord; 1526 int mlen = 0; 1527 int max = 0; 1528 int cur; 1529 int start = ex->fe_start; 1530 int len = ex->fe_len; 1531 unsigned ret = 0; 1532 int len0 = len; 1533 void *buddy; 1534 1535 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1536 BUG_ON(e4b->bd_group != ex->fe_group); 1537 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1538 mb_check_buddy(e4b); 1539 mb_mark_used_double(e4b, start, len); 1540 1541 e4b->bd_info->bb_free -= len; 1542 if (e4b->bd_info->bb_first_free == start) 1543 e4b->bd_info->bb_first_free += len; 1544 1545 /* let's maintain fragments counter */ 1546 if (start != 0) 1547 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 1548 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1549 max = !mb_test_bit(start + len, e4b->bd_bitmap); 1550 if (mlen && max) 1551 e4b->bd_info->bb_fragments++; 1552 else if (!mlen && !max) 1553 e4b->bd_info->bb_fragments--; 1554 1555 /* let's maintain buddy itself */ 1556 while (len) { 1557 ord = mb_find_order_for_block(e4b, start); 1558 1559 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1560 /* the whole chunk may be allocated at once! 
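 * e.g. with ord == 4, start == 64 and len >= 16: start is aligned to
 * 1 << ord, so setting bit (start >> ord) == 4 in the order-4 buddy
 * marks clusters 64..79 used in one step, with no need to split the
 * buddy into smaller chunks.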
*/ 1561 mlen = 1 << ord; 1562 buddy = mb_find_buddy(e4b, ord, &max); 1563 BUG_ON((start >> ord) >= max); 1564 mb_set_bit(start >> ord, buddy); 1565 e4b->bd_info->bb_counters[ord]--; 1566 start += mlen; 1567 len -= mlen; 1568 BUG_ON(len < 0); 1569 continue; 1570 } 1571 1572 /* store for history */ 1573 if (ret == 0) 1574 ret = len | (ord << 16); 1575 1576 /* we have to split large buddy */ 1577 BUG_ON(ord <= 0); 1578 buddy = mb_find_buddy(e4b, ord, &max); 1579 mb_set_bit(start >> ord, buddy); 1580 e4b->bd_info->bb_counters[ord]--; 1581 1582 ord--; 1583 cur = (start >> ord) & ~1U; 1584 buddy = mb_find_buddy(e4b, ord, &max); 1585 mb_clear_bit(cur, buddy); 1586 mb_clear_bit(cur + 1, buddy); 1587 e4b->bd_info->bb_counters[ord]++; 1588 e4b->bd_info->bb_counters[ord]++; 1589 } 1590 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1591 1592 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); 1593 mb_check_buddy(e4b); 1594 1595 return ret; 1596 } 1597 1598 /* 1599 * Must be called under group lock! 1600 */ 1601 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1602 struct ext4_buddy *e4b) 1603 { 1604 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1605 int ret; 1606 1607 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1608 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1609 1610 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1611 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1612 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1613 1614 /* preallocation can change ac_b_ex, thus we store actually 1615 * allocated blocks for history */ 1616 ac->ac_f_ex = ac->ac_b_ex; 1617 1618 ac->ac_status = AC_STATUS_FOUND; 1619 ac->ac_tail = ret & 0xffff; 1620 ac->ac_buddy = ret >> 16; 1621 1622 /* 1623 * take the page reference. We want the page to be pinned 1624 * so that we don't get a ext4_mb_init_cache_call for this 1625 * group until we update the bitmap. That would mean we 1626 * double allocate blocks. 
The reference is dropped 1627 * in ext4_mb_release_context 1628 */ 1629 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1630 get_page(ac->ac_bitmap_page); 1631 ac->ac_buddy_page = e4b->bd_buddy_page; 1632 get_page(ac->ac_buddy_page); 1633 /* store last allocated for subsequent stream allocation */ 1634 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1635 spin_lock(&sbi->s_md_lock); 1636 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1637 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1638 spin_unlock(&sbi->s_md_lock); 1639 } 1640 } 1641 1642 /* 1643 * regular allocator, for general purposes allocation 1644 */ 1645 1646 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 1647 struct ext4_buddy *e4b, 1648 int finish_group) 1649 { 1650 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1651 struct ext4_free_extent *bex = &ac->ac_b_ex; 1652 struct ext4_free_extent *gex = &ac->ac_g_ex; 1653 struct ext4_free_extent ex; 1654 int max; 1655 1656 if (ac->ac_status == AC_STATUS_FOUND) 1657 return; 1658 /* 1659 * We don't want to scan for a whole year 1660 */ 1661 if (ac->ac_found > sbi->s_mb_max_to_scan && 1662 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1663 ac->ac_status = AC_STATUS_BREAK; 1664 return; 1665 } 1666 1667 /* 1668 * Haven't found good chunk so far, let's continue 1669 */ 1670 if (bex->fe_len < gex->fe_len) 1671 return; 1672 1673 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 1674 && bex->fe_group == e4b->bd_group) { 1675 /* recheck chunk's availability - we don't know 1676 * when it was found (within this lock-unlock 1677 * period or not) */ 1678 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); 1679 if (max >= gex->fe_len) { 1680 ext4_mb_use_best_found(ac, e4b); 1681 return; 1682 } 1683 } 1684 } 1685 1686 /* 1687 * The routine checks whether found extent is good enough. If it is, 1688 * then the extent gets marked used and flag is set to the context 1689 * to stop scanning. Otherwise, the extent is compared with the 1690 * previous found extent and if new one is better, then it's stored 1691 * in the context. Later, the best found extent will be used, if 1692 * mballoc can't find good enough extent. 1693 * 1694 * FIXME: real allocation policy is to be designed yet! 
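 *
 * In short, the policy implemented below is:
 *  - an extent exactly as long as the goal: use it immediately;
 *  - best found so far shorter than the goal: any longer extent wins;
 *  - best found already >= the goal: a new extent wins only if it is
 *    still longer than the goal but shorter than the current best,
 *    i.e. we prefer the smallest extent that satisfies the request.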
1695 */ 1696 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 1697 struct ext4_free_extent *ex, 1698 struct ext4_buddy *e4b) 1699 { 1700 struct ext4_free_extent *bex = &ac->ac_b_ex; 1701 struct ext4_free_extent *gex = &ac->ac_g_ex; 1702 1703 BUG_ON(ex->fe_len <= 0); 1704 BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 1705 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 1706 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1707 1708 ac->ac_found++; 1709 1710 /* 1711 * The special case - take what you catch first 1712 */ 1713 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1714 *bex = *ex; 1715 ext4_mb_use_best_found(ac, e4b); 1716 return; 1717 } 1718 1719 /* 1720 * Let's check whether the chuck is good enough 1721 */ 1722 if (ex->fe_len == gex->fe_len) { 1723 *bex = *ex; 1724 ext4_mb_use_best_found(ac, e4b); 1725 return; 1726 } 1727 1728 /* 1729 * If this is first found extent, just store it in the context 1730 */ 1731 if (bex->fe_len == 0) { 1732 *bex = *ex; 1733 return; 1734 } 1735 1736 /* 1737 * If new found extent is better, store it in the context 1738 */ 1739 if (bex->fe_len < gex->fe_len) { 1740 /* if the request isn't satisfied, any found extent 1741 * larger than previous best one is better */ 1742 if (ex->fe_len > bex->fe_len) 1743 *bex = *ex; 1744 } else if (ex->fe_len > gex->fe_len) { 1745 /* if the request is satisfied, then we try to find 1746 * an extent that still satisfy the request, but is 1747 * smaller than previous one */ 1748 if (ex->fe_len < bex->fe_len) 1749 *bex = *ex; 1750 } 1751 1752 ext4_mb_check_limits(ac, e4b, 0); 1753 } 1754 1755 static noinline_for_stack 1756 int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1757 struct ext4_buddy *e4b) 1758 { 1759 struct ext4_free_extent ex = ac->ac_b_ex; 1760 ext4_group_t group = ex.fe_group; 1761 int max; 1762 int err; 1763 1764 BUG_ON(ex.fe_len <= 0); 1765 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1766 if (err) 1767 return err; 1768 1769 ext4_lock_group(ac->ac_sb, group); 1770 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); 1771 1772 if (max > 0) { 1773 ac->ac_b_ex = ex; 1774 ext4_mb_use_best_found(ac, e4b); 1775 } 1776 1777 ext4_unlock_group(ac->ac_sb, group); 1778 ext4_mb_unload_buddy(e4b); 1779 1780 return 0; 1781 } 1782 1783 static noinline_for_stack 1784 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1785 struct ext4_buddy *e4b) 1786 { 1787 ext4_group_t group = ac->ac_g_ex.fe_group; 1788 int max; 1789 int err; 1790 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1791 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1792 struct ext4_free_extent ex; 1793 1794 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1795 return 0; 1796 if (grp->bb_free == 0) 1797 return 0; 1798 1799 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1800 if (err) 1801 return err; 1802 1803 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { 1804 ext4_mb_unload_buddy(e4b); 1805 return 0; 1806 } 1807 1808 ext4_lock_group(ac->ac_sb, group); 1809 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, 1810 ac->ac_g_ex.fe_len, &ex); 1811 ex.fe_logical = 0xDEADFA11; /* debug value */ 1812 1813 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1814 ext4_fsblk_t start; 1815 1816 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + 1817 ex.fe_start; 1818 /* use do_div to get remainder (would be 64-bit modulo) */ 1819 if (do_div(start, sbi->s_stripe) == 0) { 1820 ac->ac_found++; 1821 ac->ac_b_ex = ex; 1822 
ext4_mb_use_best_found(ac, e4b); 1823 } 1824 } else if (max >= ac->ac_g_ex.fe_len) { 1825 BUG_ON(ex.fe_len <= 0); 1826 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1827 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1828 ac->ac_found++; 1829 ac->ac_b_ex = ex; 1830 ext4_mb_use_best_found(ac, e4b); 1831 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 1832 /* Sometimes, caller may want to merge even small 1833 * number of blocks to an existing extent */ 1834 BUG_ON(ex.fe_len <= 0); 1835 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1836 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1837 ac->ac_found++; 1838 ac->ac_b_ex = ex; 1839 ext4_mb_use_best_found(ac, e4b); 1840 } 1841 ext4_unlock_group(ac->ac_sb, group); 1842 ext4_mb_unload_buddy(e4b); 1843 1844 return 0; 1845 } 1846 1847 /* 1848 * The routine scans buddy structures (not bitmap!) from given order 1849 * to max order and tries to find big enough chunk to satisfy the req 1850 */ 1851 static noinline_for_stack 1852 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1853 struct ext4_buddy *e4b) 1854 { 1855 struct super_block *sb = ac->ac_sb; 1856 struct ext4_group_info *grp = e4b->bd_info; 1857 void *buddy; 1858 int i; 1859 int k; 1860 int max; 1861 1862 BUG_ON(ac->ac_2order <= 0); 1863 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1864 if (grp->bb_counters[i] == 0) 1865 continue; 1866 1867 buddy = mb_find_buddy(e4b, i, &max); 1868 BUG_ON(buddy == NULL); 1869 1870 k = mb_find_next_zero_bit(buddy, max, 0); 1871 BUG_ON(k >= max); 1872 1873 ac->ac_found++; 1874 1875 ac->ac_b_ex.fe_len = 1 << i; 1876 ac->ac_b_ex.fe_start = k << i; 1877 ac->ac_b_ex.fe_group = e4b->bd_group; 1878 1879 ext4_mb_use_best_found(ac, e4b); 1880 1881 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 1882 1883 if (EXT4_SB(sb)->s_mb_stats) 1884 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 1885 1886 break; 1887 } 1888 } 1889 1890 /* 1891 * The routine scans the group and measures all found extents. 1892 * In order to optimize scanning, caller must pass number of 1893 * free blocks in the group, so the routine can know upper limit. 1894 */ 1895 static noinline_for_stack 1896 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1897 struct ext4_buddy *e4b) 1898 { 1899 struct super_block *sb = ac->ac_sb; 1900 void *bitmap = e4b->bd_bitmap; 1901 struct ext4_free_extent ex; 1902 int i; 1903 int free; 1904 1905 free = e4b->bd_info->bb_free; 1906 BUG_ON(free <= 0); 1907 1908 i = e4b->bd_info->bb_first_free; 1909 1910 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1911 i = mb_find_next_zero_bit(bitmap, 1912 EXT4_CLUSTERS_PER_GROUP(sb), i); 1913 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { 1914 /* 1915 * IF we have corrupt bitmap, we won't find any 1916 * free blocks even though group info says we 1917 * we have free blocks 1918 */ 1919 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1920 "%d free clusters as per " 1921 "group info. But bitmap says 0", 1922 free); 1923 break; 1924 } 1925 1926 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); 1927 BUG_ON(ex.fe_len <= 0); 1928 if (free < ex.fe_len) { 1929 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1930 "%d free clusters as per " 1931 "group info. But got %d blocks", 1932 free, ex.fe_len); 1933 /* 1934 * The number of free blocks differs. This mostly 1935 * indicate that the bitmap is corrupt. So exit 1936 * without claiming the space. 
1937 */ 1938 break; 1939 } 1940 ex.fe_logical = 0xDEADC0DE; /* debug value */ 1941 ext4_mb_measure_extent(ac, &ex, e4b); 1942 1943 i += ex.fe_len; 1944 free -= ex.fe_len; 1945 } 1946 1947 ext4_mb_check_limits(ac, e4b, 1); 1948 } 1949 1950 /* 1951 * This is a special case for storages like raid5 1952 * we try to find stripe-aligned chunks for stripe-size-multiple requests 1953 */ 1954 static noinline_for_stack 1955 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1956 struct ext4_buddy *e4b) 1957 { 1958 struct super_block *sb = ac->ac_sb; 1959 struct ext4_sb_info *sbi = EXT4_SB(sb); 1960 void *bitmap = e4b->bd_bitmap; 1961 struct ext4_free_extent ex; 1962 ext4_fsblk_t first_group_block; 1963 ext4_fsblk_t a; 1964 ext4_grpblk_t i; 1965 int max; 1966 1967 BUG_ON(sbi->s_stripe == 0); 1968 1969 /* find first stripe-aligned block in group */ 1970 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); 1971 1972 a = first_group_block + sbi->s_stripe - 1; 1973 do_div(a, sbi->s_stripe); 1974 i = (a * sbi->s_stripe) - first_group_block; 1975 1976 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 1977 if (!mb_test_bit(i, bitmap)) { 1978 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); 1979 if (max >= sbi->s_stripe) { 1980 ac->ac_found++; 1981 ex.fe_logical = 0xDEADF00D; /* debug value */ 1982 ac->ac_b_ex = ex; 1983 ext4_mb_use_best_found(ac, e4b); 1984 break; 1985 } 1986 } 1987 i += sbi->s_stripe; 1988 } 1989 } 1990 1991 /* This is now called BEFORE we load the buddy bitmap. */ 1992 static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1993 ext4_group_t group, int cr) 1994 { 1995 unsigned free, fragments; 1996 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1997 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1998 1999 BUG_ON(cr < 0 || cr >= 4); 2000 2001 free = grp->bb_free; 2002 if (free == 0) 2003 return 0; 2004 if (cr <= 2 && free < ac->ac_g_ex.fe_len) 2005 return 0; 2006 2007 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) 2008 return 0; 2009 2010 /* We only do this if the grp has never been initialized */ 2011 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2012 int ret = ext4_mb_init_group(ac->ac_sb, group); 2013 if (ret) 2014 return 0; 2015 } 2016 2017 fragments = grp->bb_fragments; 2018 if (fragments == 0) 2019 return 0; 2020 2021 switch (cr) { 2022 case 0: 2023 BUG_ON(ac->ac_2order == 0); 2024 2025 /* Avoid using the first bg of a flexgroup for data files */ 2026 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 2027 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 2028 ((group % flex_size) == 0)) 2029 return 0; 2030 2031 if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || 2032 (free / fragments) >= ac->ac_g_ex.fe_len) 2033 return 1; 2034 2035 if (grp->bb_largest_free_order < ac->ac_2order) 2036 return 0; 2037 2038 return 1; 2039 case 1: 2040 if ((free / fragments) >= ac->ac_g_ex.fe_len) 2041 return 1; 2042 break; 2043 case 2: 2044 if (free >= ac->ac_g_ex.fe_len) 2045 return 1; 2046 break; 2047 case 3: 2048 return 1; 2049 default: 2050 BUG(); 2051 } 2052 2053 return 0; 2054 } 2055 2056 static noinline_for_stack int 2057 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2058 { 2059 ext4_group_t ngroups, group, i; 2060 int cr; 2061 int err = 0; 2062 struct ext4_sb_info *sbi; 2063 struct super_block *sb; 2064 struct ext4_buddy e4b; 2065 2066 sb = ac->ac_sb; 2067 sbi = EXT4_SB(sb); 2068 ngroups = ext4_get_groups_count(sb); 2069 /* non-extent files are limited to low blocks/groups */ 2070 if (!(ext4_test_inode_flag(ac->ac_inode, 
EXT4_INODE_EXTENTS))) 2071 ngroups = sbi->s_blockfile_groups; 2072 2073 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2074 2075 /* first, try the goal */ 2076 err = ext4_mb_find_by_goal(ac, &e4b); 2077 if (err || ac->ac_status == AC_STATUS_FOUND) 2078 goto out; 2079 2080 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2081 goto out; 2082 2083 /* 2084 * ac->ac2_order is set only if the fe_len is a power of 2 2085 * if ac2_order is set we also set criteria to 0 so that we 2086 * try exact allocation using buddy. 2087 */ 2088 i = fls(ac->ac_g_ex.fe_len); 2089 ac->ac_2order = 0; 2090 /* 2091 * We search using buddy data only if the order of the request 2092 * is greater than equal to the sbi_s_mb_order2_reqs 2093 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 2094 */ 2095 if (i >= sbi->s_mb_order2_reqs) { 2096 /* 2097 * This should tell if fe_len is exactly power of 2 2098 */ 2099 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2100 ac->ac_2order = i - 1; 2101 } 2102 2103 /* if stream allocation is enabled, use global goal */ 2104 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2105 /* TBD: may be hot point */ 2106 spin_lock(&sbi->s_md_lock); 2107 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2108 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2109 spin_unlock(&sbi->s_md_lock); 2110 } 2111 2112 /* Let's just scan groups to find more-less suitable blocks */ 2113 cr = ac->ac_2order ? 0 : 1; 2114 /* 2115 * cr == 0 try to get exact allocation, 2116 * cr == 3 try to get anything 2117 */ 2118 repeat: 2119 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2120 ac->ac_criteria = cr; 2121 /* 2122 * searching for the right group start 2123 * from the goal value specified 2124 */ 2125 group = ac->ac_g_ex.fe_group; 2126 2127 for (i = 0; i < ngroups; group++, i++) { 2128 cond_resched(); 2129 /* 2130 * Artificially restricted ngroups for non-extent 2131 * files makes group > ngroups possible on first loop. 2132 */ 2133 if (group >= ngroups) 2134 group = 0; 2135 2136 /* This now checks without needing the buddy page */ 2137 if (!ext4_mb_good_group(ac, group, cr)) 2138 continue; 2139 2140 err = ext4_mb_load_buddy(sb, group, &e4b); 2141 if (err) 2142 goto out; 2143 2144 ext4_lock_group(sb, group); 2145 2146 /* 2147 * We need to check again after locking the 2148 * block group 2149 */ 2150 if (!ext4_mb_good_group(ac, group, cr)) { 2151 ext4_unlock_group(sb, group); 2152 ext4_mb_unload_buddy(&e4b); 2153 continue; 2154 } 2155 2156 ac->ac_groups_scanned++; 2157 if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) 2158 ext4_mb_simple_scan_group(ac, &e4b); 2159 else if (cr == 1 && sbi->s_stripe && 2160 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2161 ext4_mb_scan_aligned(ac, &e4b); 2162 else 2163 ext4_mb_complex_scan_group(ac, &e4b); 2164 2165 ext4_unlock_group(sb, group); 2166 ext4_mb_unload_buddy(&e4b); 2167 2168 if (ac->ac_status != AC_STATUS_CONTINUE) 2169 break; 2170 } 2171 } 2172 2173 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2174 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2175 /* 2176 * We've been searching too long. Let's try to allocate 2177 * the best chunk we've found so far 2178 */ 2179 2180 ext4_mb_try_best_found(ac, &e4b); 2181 if (ac->ac_status != AC_STATUS_FOUND) { 2182 /* 2183 * Someone more lucky has already allocated it. 
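* (That is, between dropping the group lock during the scan and
* re-checking in ext4_mb_try_best_found(), another process grabbed
* the extent we had remembered in ac_b_ex.)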
2184 * The only thing we can do is just take first 2185 * found block(s) 2186 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 2187 */ 2188 ac->ac_b_ex.fe_group = 0; 2189 ac->ac_b_ex.fe_start = 0; 2190 ac->ac_b_ex.fe_len = 0; 2191 ac->ac_status = AC_STATUS_CONTINUE; 2192 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2193 cr = 3; 2194 atomic_inc(&sbi->s_mb_lost_chunks); 2195 goto repeat; 2196 } 2197 } 2198 out: 2199 return err; 2200 } 2201 2202 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2203 { 2204 struct super_block *sb = seq->private; 2205 ext4_group_t group; 2206 2207 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2208 return NULL; 2209 group = *pos + 1; 2210 return (void *) ((unsigned long) group); 2211 } 2212 2213 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2214 { 2215 struct super_block *sb = seq->private; 2216 ext4_group_t group; 2217 2218 ++*pos; 2219 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2220 return NULL; 2221 group = *pos + 1; 2222 return (void *) ((unsigned long) group); 2223 } 2224 2225 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2226 { 2227 struct super_block *sb = seq->private; 2228 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2229 int i; 2230 int err, buddy_loaded = 0; 2231 struct ext4_buddy e4b; 2232 struct ext4_group_info *grinfo; 2233 struct sg { 2234 struct ext4_group_info info; 2235 ext4_grpblk_t counters[16]; 2236 } sg; 2237 2238 group--; 2239 if (group == 0) 2240 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2241 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2242 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2243 "group", "free", "frags", "first", 2244 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2245 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2246 2247 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2248 sizeof(struct ext4_group_info); 2249 grinfo = ext4_get_group_info(sb, group); 2250 /* Load the group info in memory only if not already loaded. */ 2251 if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { 2252 err = ext4_mb_load_buddy(sb, group, &e4b); 2253 if (err) { 2254 seq_printf(seq, "#%-5u: I/O error\n", group); 2255 return 0; 2256 } 2257 buddy_loaded = 1; 2258 } 2259 2260 memcpy(&sg, ext4_get_group_info(sb, group), i); 2261 2262 if (buddy_loaded) 2263 ext4_mb_unload_buddy(&e4b); 2264 2265 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2266 sg.info.bb_fragments, sg.info.bb_first_free); 2267 for (i = 0; i <= 13; i++) 2268 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
2269 sg.info.bb_counters[i] : 0); 2270 seq_printf(seq, " ]\n"); 2271 2272 return 0; 2273 } 2274 2275 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2276 { 2277 } 2278 2279 static const struct seq_operations ext4_mb_seq_groups_ops = { 2280 .start = ext4_mb_seq_groups_start, 2281 .next = ext4_mb_seq_groups_next, 2282 .stop = ext4_mb_seq_groups_stop, 2283 .show = ext4_mb_seq_groups_show, 2284 }; 2285 2286 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2287 { 2288 struct super_block *sb = PDE_DATA(inode); 2289 int rc; 2290 2291 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2292 if (rc == 0) { 2293 struct seq_file *m = file->private_data; 2294 m->private = sb; 2295 } 2296 return rc; 2297 2298 } 2299 2300 static const struct file_operations ext4_mb_seq_groups_fops = { 2301 .owner = THIS_MODULE, 2302 .open = ext4_mb_seq_groups_open, 2303 .read = seq_read, 2304 .llseek = seq_lseek, 2305 .release = seq_release, 2306 }; 2307 2308 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 2309 { 2310 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2311 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 2312 2313 BUG_ON(!cachep); 2314 return cachep; 2315 } 2316 2317 /* 2318 * Allocate the top-level s_group_info array for the specified number 2319 * of groups 2320 */ 2321 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) 2322 { 2323 struct ext4_sb_info *sbi = EXT4_SB(sb); 2324 unsigned size; 2325 struct ext4_group_info ***new_groupinfo; 2326 2327 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> 2328 EXT4_DESC_PER_BLOCK_BITS(sb); 2329 if (size <= sbi->s_group_info_size) 2330 return 0; 2331 2332 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); 2333 new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); 2334 if (!new_groupinfo) { 2335 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); 2336 return -ENOMEM; 2337 } 2338 if (sbi->s_group_info) { 2339 memcpy(new_groupinfo, sbi->s_group_info, 2340 sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 2341 ext4_kvfree(sbi->s_group_info); 2342 } 2343 sbi->s_group_info = new_groupinfo; 2344 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 2345 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 2346 sbi->s_group_info_size); 2347 return 0; 2348 } 2349 2350 /* Create and initialize ext4_group_info data for the given group. */ 2351 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2352 struct ext4_group_desc *desc) 2353 { 2354 int i; 2355 int metalen = 0; 2356 struct ext4_sb_info *sbi = EXT4_SB(sb); 2357 struct ext4_group_info **meta_group_info; 2358 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2359 2360 /* 2361 * First check if this group is the first of a reserved block. 
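* (i.e. group % EXT4_DESC_PER_BLOCK(sb) == 0.  With 4 KiB blocks and
* 32-byte descriptors that is every 128th group, so e.g. group 300
* lives in table 2, slot 44: s_group_info[300 >> 7][300 & 127].)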
2362 * If it's true, we have to allocate a new table of pointers 2363 * to ext4_group_info structures 2364 */ 2365 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2366 metalen = sizeof(*meta_group_info) << 2367 EXT4_DESC_PER_BLOCK_BITS(sb); 2368 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2369 if (meta_group_info == NULL) { 2370 ext4_msg(sb, KERN_ERR, "can't allocate mem " 2371 "for a buddy group"); 2372 goto exit_meta_group_info; 2373 } 2374 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2375 meta_group_info; 2376 } 2377 2378 meta_group_info = 2379 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2380 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2381 2382 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); 2383 if (meta_group_info[i] == NULL) { 2384 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2385 goto exit_group_info; 2386 } 2387 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2388 &(meta_group_info[i]->bb_state)); 2389 2390 /* 2391 * initialize bb_free to be able to skip 2392 * empty groups without initialization 2393 */ 2394 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2395 meta_group_info[i]->bb_free = 2396 ext4_free_clusters_after_init(sb, group, desc); 2397 } else { 2398 meta_group_info[i]->bb_free = 2399 ext4_free_group_clusters(sb, desc); 2400 } 2401 2402 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2403 init_rwsem(&meta_group_info[i]->alloc_sem); 2404 meta_group_info[i]->bb_free_root = RB_ROOT; 2405 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 2406 2407 #ifdef DOUBLE_CHECK 2408 { 2409 struct buffer_head *bh; 2410 meta_group_info[i]->bb_bitmap = 2411 kmalloc(sb->s_blocksize, GFP_KERNEL); 2412 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2413 bh = ext4_read_block_bitmap(sb, group); 2414 BUG_ON(bh == NULL); 2415 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2416 sb->s_blocksize); 2417 put_bh(bh); 2418 } 2419 #endif 2420 2421 return 0; 2422 2423 exit_group_info: 2424 /* If a meta_group_info table has been allocated, release it now */ 2425 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2426 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2427 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; 2428 } 2429 exit_meta_group_info: 2430 return -ENOMEM; 2431 } /* ext4_mb_add_groupinfo */ 2432 2433 static int ext4_mb_init_backend(struct super_block *sb) 2434 { 2435 ext4_group_t ngroups = ext4_get_groups_count(sb); 2436 ext4_group_t i; 2437 struct ext4_sb_info *sbi = EXT4_SB(sb); 2438 int err; 2439 struct ext4_group_desc *desc; 2440 struct kmem_cache *cachep; 2441 2442 err = ext4_mb_alloc_groupinfo(sb, ngroups); 2443 if (err) 2444 return err; 2445 2446 sbi->s_buddy_cache = new_inode(sb); 2447 if (sbi->s_buddy_cache == NULL) { 2448 ext4_msg(sb, KERN_ERR, "can't get new inode"); 2449 goto err_freesgi; 2450 } 2451 /* To avoid potentially colliding with an valid on-disk inode number, 2452 * use EXT4_BAD_INO for the buddy cache inode number. This inode is 2453 * not in the inode hash, so it should never be found by iget(), but 2454 * this will avoid confusion if it ever shows up during debugging. 
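* (EXT4_BAD_INO is reserved inode 1, which is never handed out for
* regular files, so reusing it for this purely in-memory inode can
* not clash with real user data.)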
*/ 2455 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; 2456 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2457 for (i = 0; i < ngroups; i++) { 2458 desc = ext4_get_group_desc(sb, i, NULL); 2459 if (desc == NULL) { 2460 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); 2461 goto err_freebuddy; 2462 } 2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2464 goto err_freebuddy; 2465 } 2466 2467 return 0; 2468 2469 err_freebuddy: 2470 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2471 while (i-- > 0) 2472 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2473 i = sbi->s_group_info_size; 2474 while (i-- > 0) 2475 kfree(sbi->s_group_info[i]); 2476 iput(sbi->s_buddy_cache); 2477 err_freesgi: 2478 ext4_kvfree(sbi->s_group_info); 2479 return -ENOMEM; 2480 } 2481 2482 static void ext4_groupinfo_destroy_slabs(void) 2483 { 2484 int i; 2485 2486 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 2487 if (ext4_groupinfo_caches[i]) 2488 kmem_cache_destroy(ext4_groupinfo_caches[i]); 2489 ext4_groupinfo_caches[i] = NULL; 2490 } 2491 } 2492 2493 static int ext4_groupinfo_create_slab(size_t size) 2494 { 2495 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 2496 int slab_size; 2497 int blocksize_bits = order_base_2(size); 2498 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2499 struct kmem_cache *cachep; 2500 2501 if (cache_index >= NR_GRPINFO_CACHES) 2502 return -EINVAL; 2503 2504 if (unlikely(cache_index < 0)) 2505 cache_index = 0; 2506 2507 mutex_lock(&ext4_grpinfo_slab_create_mutex); 2508 if (ext4_groupinfo_caches[cache_index]) { 2509 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2510 return 0; /* Already created */ 2511 } 2512 2513 slab_size = offsetof(struct ext4_group_info, 2514 bb_counters[blocksize_bits + 2]); 2515 2516 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 2517 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2518 NULL); 2519 2520 ext4_groupinfo_caches[cache_index] = cachep; 2521 2522 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2523 if (!cachep) { 2524 printk(KERN_EMERG 2525 "EXT4-fs: no memory for groupinfo slab cache\n"); 2526 return -ENOMEM; 2527 } 2528 2529 return 0; 2530 } 2531 2532 int ext4_mb_init(struct super_block *sb) 2533 { 2534 struct ext4_sb_info *sbi = EXT4_SB(sb); 2535 unsigned i, j; 2536 unsigned offset; 2537 unsigned max; 2538 int ret; 2539 2540 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2541 2542 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2543 if (sbi->s_mb_offsets == NULL) { 2544 ret = -ENOMEM; 2545 goto out; 2546 } 2547 2548 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2549 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2550 if (sbi->s_mb_maxs == NULL) { 2551 ret = -ENOMEM; 2552 goto out; 2553 } 2554 2555 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 2556 if (ret < 0) 2557 goto out; 2558 2559 /* order 0 is regular bitmap */ 2560 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2561 sbi->s_mb_offsets[0] = 0; 2562 2563 i = 1; 2564 offset = 0; 2565 max = sb->s_blocksize << 2; 2566 do { 2567 sbi->s_mb_offsets[i] = offset; 2568 sbi->s_mb_maxs[i] = max; 2569 offset += 1 << (sb->s_blocksize_bits - i); 2570 max = max >> 1; 2571 i++; 2572 } while (i <= sb->s_blocksize_bits + 1); 2573 2574 spin_lock_init(&sbi->s_md_lock); 2575 spin_lock_init(&sbi->s_bal_lock); 2576 2577 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2578 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2579 sbi->s_mb_stats = MB_DEFAULT_STATS; 2580 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2581 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2582 /* 2583 
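* A rough sketch of the tables built above, assuming 4 KiB blocks
* (blocksize_bits == 12): s_mb_maxs[0] is the plain bitmap with
* 32768 bits; the order-1 buddy map starts at byte offset 0 of the
* buddy block and holds at most 16384 bits, order 2 starts at byte
* 2048 with 8192 bits, order 3 at byte 3072 with 4096 bits, and so
* on, halving each time up to order blocksize_bits + 1.
*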
* The default group preallocation is 512, which for 4k block 2584 * sizes translates to 2 megabytes. However for bigalloc file 2585 * systems, this is probably too big (i.e, if the cluster size 2586 * is 1 megabyte, then group preallocation size becomes half a 2587 * gigabyte!). As a default, we will keep a two megabyte 2588 * group pralloc size for cluster sizes up to 64k, and after 2589 * that, we will force a minimum group preallocation size of 2590 * 32 clusters. This translates to 8 megs when the cluster 2591 * size is 256k, and 32 megs when the cluster size is 1 meg, 2592 * which seems reasonable as a default. 2593 */ 2594 sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> 2595 sbi->s_cluster_bits, 32); 2596 /* 2597 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc 2598 * to the lowest multiple of s_stripe which is bigger than 2599 * the s_mb_group_prealloc as determined above. We want 2600 * the preallocation size to be an exact multiple of the 2601 * RAID stripe size so that preallocations don't fragment 2602 * the stripes. 2603 */ 2604 if (sbi->s_stripe > 1) { 2605 sbi->s_mb_group_prealloc = roundup( 2606 sbi->s_mb_group_prealloc, sbi->s_stripe); 2607 } 2608 2609 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2610 if (sbi->s_locality_groups == NULL) { 2611 ret = -ENOMEM; 2612 goto out_free_groupinfo_slab; 2613 } 2614 for_each_possible_cpu(i) { 2615 struct ext4_locality_group *lg; 2616 lg = per_cpu_ptr(sbi->s_locality_groups, i); 2617 mutex_init(&lg->lg_mutex); 2618 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2619 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2620 spin_lock_init(&lg->lg_prealloc_lock); 2621 } 2622 2623 /* init file for buddy data */ 2624 ret = ext4_mb_init_backend(sb); 2625 if (ret != 0) 2626 goto out_free_locality_groups; 2627 2628 if (sbi->s_proc) 2629 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2630 &ext4_mb_seq_groups_fops, sb); 2631 2632 return 0; 2633 2634 out_free_locality_groups: 2635 free_percpu(sbi->s_locality_groups); 2636 sbi->s_locality_groups = NULL; 2637 out_free_groupinfo_slab: 2638 ext4_groupinfo_destroy_slabs(); 2639 out: 2640 kfree(sbi->s_mb_offsets); 2641 sbi->s_mb_offsets = NULL; 2642 kfree(sbi->s_mb_maxs); 2643 sbi->s_mb_maxs = NULL; 2644 return ret; 2645 } 2646 2647 /* need to called with the ext4 group lock held */ 2648 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2649 { 2650 struct ext4_prealloc_space *pa; 2651 struct list_head *cur, *tmp; 2652 int count = 0; 2653 2654 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2655 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2656 list_del(&pa->pa_group_list); 2657 count++; 2658 kmem_cache_free(ext4_pspace_cachep, pa); 2659 } 2660 if (count) 2661 mb_debug(1, "mballoc: %u PAs left\n", count); 2662 2663 } 2664 2665 int ext4_mb_release(struct super_block *sb) 2666 { 2667 ext4_group_t ngroups = ext4_get_groups_count(sb); 2668 ext4_group_t i; 2669 int num_meta_group_infos; 2670 struct ext4_group_info *grinfo; 2671 struct ext4_sb_info *sbi = EXT4_SB(sb); 2672 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2673 2674 if (sbi->s_proc) 2675 remove_proc_entry("mb_groups", sbi->s_proc); 2676 2677 if (sbi->s_group_info) { 2678 for (i = 0; i < ngroups; i++) { 2679 grinfo = ext4_get_group_info(sb, i); 2680 #ifdef DOUBLE_CHECK 2681 kfree(grinfo->bb_bitmap); 2682 #endif 2683 ext4_lock_group(sb, i); 2684 ext4_mb_cleanup_pa(grinfo); 2685 ext4_unlock_group(sb, i); 2686 kmem_cache_free(cachep, grinfo); 2687 } 2688 
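/*
 * All per-group structures are gone; only the two-level
 * s_group_info array itself remains.  The number of second-level
 * tables below is ngroups rounded up to descriptor-block
 * granularity, e.g. 300 groups with 128 descriptors per block
 * means three tables to free.
 */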
num_meta_group_infos = (ngroups + 2689 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2690 EXT4_DESC_PER_BLOCK_BITS(sb); 2691 for (i = 0; i < num_meta_group_infos; i++) 2692 kfree(sbi->s_group_info[i]); 2693 ext4_kvfree(sbi->s_group_info); 2694 } 2695 kfree(sbi->s_mb_offsets); 2696 kfree(sbi->s_mb_maxs); 2697 if (sbi->s_buddy_cache) 2698 iput(sbi->s_buddy_cache); 2699 if (sbi->s_mb_stats) { 2700 ext4_msg(sb, KERN_INFO, 2701 "mballoc: %u blocks %u reqs (%u success)", 2702 atomic_read(&sbi->s_bal_allocated), 2703 atomic_read(&sbi->s_bal_reqs), 2704 atomic_read(&sbi->s_bal_success)); 2705 ext4_msg(sb, KERN_INFO, 2706 "mballoc: %u extents scanned, %u goal hits, " 2707 "%u 2^N hits, %u breaks, %u lost", 2708 atomic_read(&sbi->s_bal_ex_scanned), 2709 atomic_read(&sbi->s_bal_goals), 2710 atomic_read(&sbi->s_bal_2orders), 2711 atomic_read(&sbi->s_bal_breaks), 2712 atomic_read(&sbi->s_mb_lost_chunks)); 2713 ext4_msg(sb, KERN_INFO, 2714 "mballoc: %lu generated and it took %Lu", 2715 sbi->s_mb_buddies_generated, 2716 sbi->s_mb_generation_time); 2717 ext4_msg(sb, KERN_INFO, 2718 "mballoc: %u preallocated, %u discarded", 2719 atomic_read(&sbi->s_mb_preallocated), 2720 atomic_read(&sbi->s_mb_discarded)); 2721 } 2722 2723 free_percpu(sbi->s_locality_groups); 2724 2725 return 0; 2726 } 2727 2728 static inline int ext4_issue_discard(struct super_block *sb, 2729 ext4_group_t block_group, ext4_grpblk_t cluster, int count) 2730 { 2731 ext4_fsblk_t discard_block; 2732 2733 discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + 2734 ext4_group_first_block_no(sb, block_group)); 2735 count = EXT4_C2B(EXT4_SB(sb), count); 2736 trace_ext4_discard_blocks(sb, 2737 (unsigned long long) discard_block, count); 2738 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2739 } 2740 2741 /* 2742 * This function is called by the jbd2 layer once the commit has finished, 2743 * so we know we can free the blocks that were released with that commit. 2744 */ 2745 static void ext4_free_data_callback(struct super_block *sb, 2746 struct ext4_journal_cb_entry *jce, 2747 int rc) 2748 { 2749 struct ext4_free_data *entry = (struct ext4_free_data *)jce; 2750 struct ext4_buddy e4b; 2751 struct ext4_group_info *db; 2752 int err, count = 0, count2 = 0; 2753 2754 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2755 entry->efd_count, entry->efd_group, entry); 2756 2757 if (test_opt(sb, DISCARD)) { 2758 err = ext4_issue_discard(sb, entry->efd_group, 2759 entry->efd_start_cluster, 2760 entry->efd_count); 2761 if (err && err != -EOPNOTSUPP) 2762 ext4_msg(sb, KERN_WARNING, "discard request in" 2763 " group:%d block:%d count:%d failed" 2764 " with %d", entry->efd_group, 2765 entry->efd_start_cluster, 2766 entry->efd_count, err); 2767 } 2768 2769 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 2770 /* we expect to find existing buddy because it's pinned */ 2771 BUG_ON(err != 0); 2772 2773 2774 db = e4b.bd_info; 2775 /* there are blocks to put in buddy to make them really free */ 2776 count += entry->efd_count; 2777 count2++; 2778 ext4_lock_group(sb, entry->efd_group); 2779 /* Take it out of per group rb tree */ 2780 rb_erase(&entry->efd_node, &(db->bb_free_root)); 2781 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); 2782 2783 /* 2784 * Clear the trimmed flag for the group so that the next 2785 * ext4_trim_fs can trim it. 2786 * If the volume is mounted with -o discard, online discard 2787 * is supported and the free blocks will be trimmed online. 
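* (Hence the flag is cleared only in the !DISCARD case below; with
* -o discard the range has already been passed to the device via
* ext4_issue_discard() above.)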
2788 */ 2789 if (!test_opt(sb, DISCARD)) 2790 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2791 2792 if (!db->bb_free_root.rb_node) { 2793 /* No more items in the per group rb tree 2794 * balance refcounts from ext4_mb_free_metadata() 2795 */ 2796 page_cache_release(e4b.bd_buddy_page); 2797 page_cache_release(e4b.bd_bitmap_page); 2798 } 2799 ext4_unlock_group(sb, entry->efd_group); 2800 kmem_cache_free(ext4_free_data_cachep, entry); 2801 ext4_mb_unload_buddy(&e4b); 2802 2803 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2804 } 2805 2806 int __init ext4_init_mballoc(void) 2807 { 2808 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 2809 SLAB_RECLAIM_ACCOUNT); 2810 if (ext4_pspace_cachep == NULL) 2811 return -ENOMEM; 2812 2813 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 2814 SLAB_RECLAIM_ACCOUNT); 2815 if (ext4_ac_cachep == NULL) { 2816 kmem_cache_destroy(ext4_pspace_cachep); 2817 return -ENOMEM; 2818 } 2819 2820 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, 2821 SLAB_RECLAIM_ACCOUNT); 2822 if (ext4_free_data_cachep == NULL) { 2823 kmem_cache_destroy(ext4_pspace_cachep); 2824 kmem_cache_destroy(ext4_ac_cachep); 2825 return -ENOMEM; 2826 } 2827 return 0; 2828 } 2829 2830 void ext4_exit_mballoc(void) 2831 { 2832 /* 2833 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2834 * before destroying the slab cache. 2835 */ 2836 rcu_barrier(); 2837 kmem_cache_destroy(ext4_pspace_cachep); 2838 kmem_cache_destroy(ext4_ac_cachep); 2839 kmem_cache_destroy(ext4_free_data_cachep); 2840 ext4_groupinfo_destroy_slabs(); 2841 } 2842 2843 2844 /* 2845 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 2846 * Returns 0 if success or error code 2847 */ 2848 static noinline_for_stack int 2849 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2850 handle_t *handle, unsigned int reserv_clstrs) 2851 { 2852 struct buffer_head *bitmap_bh = NULL; 2853 struct ext4_group_desc *gdp; 2854 struct buffer_head *gdp_bh; 2855 struct ext4_sb_info *sbi; 2856 struct super_block *sb; 2857 ext4_fsblk_t block; 2858 int err, len; 2859 2860 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 2861 BUG_ON(ac->ac_b_ex.fe_len <= 0); 2862 2863 sb = ac->ac_sb; 2864 sbi = EXT4_SB(sb); 2865 2866 err = -EIO; 2867 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2868 if (!bitmap_bh) 2869 goto out_err; 2870 2871 err = ext4_journal_get_write_access(handle, bitmap_bh); 2872 if (err) 2873 goto out_err; 2874 2875 err = -EIO; 2876 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 2877 if (!gdp) 2878 goto out_err; 2879 2880 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2881 ext4_free_group_clusters(sb, gdp)); 2882 2883 err = ext4_journal_get_write_access(handle, gdp_bh); 2884 if (err) 2885 goto out_err; 2886 2887 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2888 2889 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2890 if (!ext4_data_block_valid(sbi, block, len)) { 2891 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2892 "fs metadata", block, block+len); 2893 /* File system mounted not to panic on error 2894 * Fix the bitmap and repeat the block allocation 2895 * We leak some of the blocks here. 
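* (The overlapping range is marked in-use in the bitmap below and
* -EAGAIN is returned, so the caller retries the allocation
* elsewhere; those clusters are simply never handed out again,
* which is the leak mentioned above.)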
2896 */ 2897 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2898 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2899 ac->ac_b_ex.fe_len); 2900 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2901 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2902 if (!err) 2903 err = -EAGAIN; 2904 goto out_err; 2905 } 2906 2907 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2908 #ifdef AGGRESSIVE_CHECK 2909 { 2910 int i; 2911 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 2912 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 2913 bitmap_bh->b_data)); 2914 } 2915 } 2916 #endif 2917 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2918 ac->ac_b_ex.fe_len); 2919 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2920 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2921 ext4_free_group_clusters_set(sb, gdp, 2922 ext4_free_clusters_after_init(sb, 2923 ac->ac_b_ex.fe_group, gdp)); 2924 } 2925 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 2926 ext4_free_group_clusters_set(sb, gdp, len); 2927 ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); 2928 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 2929 2930 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2931 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 2932 /* 2933 * Now reduce the dirty cluster count as well; it should not go negative 2934 */ 2935 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2936 /* release all the reserved blocks if non delalloc */ 2937 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 2938 reserv_clstrs); 2939 2940 if (sbi->s_log_groups_per_flex) { 2941 ext4_group_t flex_group = ext4_flex_group(sbi, 2942 ac->ac_b_ex.fe_group); 2943 atomic64_sub(ac->ac_b_ex.fe_len, 2944 &sbi->s_flex_groups[flex_group].free_clusters); 2945 } 2946 2947 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2948 if (err) 2949 goto out_err; 2950 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2951 2952 out_err: 2953 brelse(bitmap_bh); 2954 return err; 2955 } 2956 2957 /* 2958 * here we normalize the request for a locality group 2959 * Group requests are normalized to s_mb_group_prealloc, which is 2960 * rounded up to a multiple of s_stripe if a stripe size is set via 2961 * the mount option. s_mb_group_prealloc can be configured via 2962 * /sys/fs/ext4/<partition>/mb_group_prealloc 2963 * 2964 * XXX: should we try to preallocate more than the group has now?
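* For example, with the defaults on a 4 KiB-block, non-bigalloc
* filesystem every group request below is simply bumped to
* s_mb_group_prealloc (512 clusters, i.e. 2 MiB); with s_stripe == 24
* it would instead become 528, the rounded-up value computed at mount
* time in ext4_mb_init().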
2965 */ 2966 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 2967 { 2968 struct super_block *sb = ac->ac_sb; 2969 struct ext4_locality_group *lg = ac->ac_lg; 2970 2971 BUG_ON(lg == NULL); 2972 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 2973 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2974 current->pid, ac->ac_g_ex.fe_len); 2975 } 2976 2977 /* 2978 * Normalization means making request better in terms of 2979 * size and alignment 2980 */ 2981 static noinline_for_stack void 2982 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2983 struct ext4_allocation_request *ar) 2984 { 2985 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2986 int bsbits, max; 2987 ext4_lblk_t end; 2988 loff_t size, start_off; 2989 loff_t orig_size __maybe_unused; 2990 ext4_lblk_t start; 2991 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2992 struct ext4_prealloc_space *pa; 2993 2994 /* do normalize only data requests, metadata requests 2995 do not need preallocation */ 2996 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 2997 return; 2998 2999 /* sometime caller may want exact blocks */ 3000 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3001 return; 3002 3003 /* caller may indicate that preallocation isn't 3004 * required (it's a tail, for example) */ 3005 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 3006 return; 3007 3008 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 3009 ext4_mb_normalize_group_request(ac); 3010 return ; 3011 } 3012 3013 bsbits = ac->ac_sb->s_blocksize_bits; 3014 3015 /* first, let's learn actual file size 3016 * given current request is allocated */ 3017 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 3018 size = size << bsbits; 3019 if (size < i_size_read(ac->ac_inode)) 3020 size = i_size_read(ac->ac_inode); 3021 orig_size = size; 3022 3023 /* max size of free chunks */ 3024 max = 2 << bsbits; 3025 3026 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 3027 (req <= (size) || max <= (chunk_size)) 3028 3029 /* first, try to predict filesize */ 3030 /* XXX: should this table be tunable? 
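* (Rough examples of the buckets below: a predicted size of 20K is
* bumped to a 32K goal; around 3MB it falls into the 2MB bucket, so
* the goal becomes 2MB aligned on a 2MB boundary; anything past the
* last bucket keeps its original length and logical start.)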
*/ 3031 start_off = 0; 3032 if (size <= 16 * 1024) { 3033 size = 16 * 1024; 3034 } else if (size <= 32 * 1024) { 3035 size = 32 * 1024; 3036 } else if (size <= 64 * 1024) { 3037 size = 64 * 1024; 3038 } else if (size <= 128 * 1024) { 3039 size = 128 * 1024; 3040 } else if (size <= 256 * 1024) { 3041 size = 256 * 1024; 3042 } else if (size <= 512 * 1024) { 3043 size = 512 * 1024; 3044 } else if (size <= 1024 * 1024) { 3045 size = 1024 * 1024; 3046 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 3047 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3048 (21 - bsbits)) << 21; 3049 size = 2 * 1024 * 1024; 3050 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 3051 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3052 (22 - bsbits)) << 22; 3053 size = 4 * 1024 * 1024; 3054 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 3055 (8<<20)>>bsbits, max, 8 * 1024)) { 3056 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3057 (23 - bsbits)) << 23; 3058 size = 8 * 1024 * 1024; 3059 } else { 3060 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3061 size = ac->ac_o_ex.fe_len << bsbits; 3062 } 3063 size = size >> bsbits; 3064 start = start_off >> bsbits; 3065 3066 /* don't cover already allocated blocks in selected range */ 3067 if (ar->pleft && start <= ar->lleft) { 3068 size -= ar->lleft + 1 - start; 3069 start = ar->lleft + 1; 3070 } 3071 if (ar->pright && start + size - 1 >= ar->lright) 3072 size -= start + size - ar->lright; 3073 3074 end = start + size; 3075 3076 /* check we don't cross already preallocated blocks */ 3077 rcu_read_lock(); 3078 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3079 ext4_lblk_t pa_end; 3080 3081 if (pa->pa_deleted) 3082 continue; 3083 spin_lock(&pa->pa_lock); 3084 if (pa->pa_deleted) { 3085 spin_unlock(&pa->pa_lock); 3086 continue; 3087 } 3088 3089 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 3090 pa->pa_len); 3091 3092 /* PA must not overlap original request */ 3093 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3094 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3095 3096 /* skip PAs this normalized request doesn't overlap with */ 3097 if (pa->pa_lstart >= end || pa_end <= start) { 3098 spin_unlock(&pa->pa_lock); 3099 continue; 3100 } 3101 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3102 3103 /* adjust start or end to be adjacent to this pa */ 3104 if (pa_end <= ac->ac_o_ex.fe_logical) { 3105 BUG_ON(pa_end < start); 3106 start = pa_end; 3107 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3108 BUG_ON(pa->pa_lstart > end); 3109 end = pa->pa_lstart; 3110 } 3111 spin_unlock(&pa->pa_lock); 3112 } 3113 rcu_read_unlock(); 3114 size = end - start; 3115 3116 /* XXX: extra loop to check we really don't overlap preallocations */ 3117 rcu_read_lock(); 3118 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3119 ext4_lblk_t pa_end; 3120 3121 spin_lock(&pa->pa_lock); 3122 if (pa->pa_deleted == 0) { 3123 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), 3124 pa->pa_len); 3125 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3126 } 3127 spin_unlock(&pa->pa_lock); 3128 } 3129 rcu_read_unlock(); 3130 3131 if (start + size <= ac->ac_o_ex.fe_logical && 3132 start > ac->ac_o_ex.fe_logical) { 3133 ext4_msg(ac->ac_sb, KERN_ERR, 3134 "start %lu, size %lu, fe_logical %lu", 3135 (unsigned long) start, (unsigned long) size, 3136 (unsigned long) ac->ac_o_ex.fe_logical); 3137 } 3138 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3139 start > ac->ac_o_ex.fe_logical); 3140 BUG_ON(size <= 0 || size > 
EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 3141 3142 /* now prepare goal request */ 3143 3144 /* XXX: is it better to align blocks WRT to logical 3145 * placement or satisfy big request as is */ 3146 ac->ac_g_ex.fe_logical = start; 3147 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 3148 3149 /* define goal start in order to merge */ 3150 if (ar->pright && (ar->lright == (start + size))) { 3151 /* merge to the right */ 3152 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3153 &ac->ac_f_ex.fe_group, 3154 &ac->ac_f_ex.fe_start); 3155 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3156 } 3157 if (ar->pleft && (ar->lleft + 1 == start)) { 3158 /* merge to the left */ 3159 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3160 &ac->ac_f_ex.fe_group, 3161 &ac->ac_f_ex.fe_start); 3162 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3163 } 3164 3165 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, 3166 (unsigned) orig_size, (unsigned) start); 3167 } 3168 3169 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3170 { 3171 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3172 3173 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3174 atomic_inc(&sbi->s_bal_reqs); 3175 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3176 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 3177 atomic_inc(&sbi->s_bal_success); 3178 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3179 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3180 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3181 atomic_inc(&sbi->s_bal_goals); 3182 if (ac->ac_found > sbi->s_mb_max_to_scan) 3183 atomic_inc(&sbi->s_bal_breaks); 3184 } 3185 3186 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 3187 trace_ext4_mballoc_alloc(ac); 3188 else 3189 trace_ext4_mballoc_prealloc(ac); 3190 } 3191 3192 /* 3193 * Called on failure; free up any blocks from the inode PA for this 3194 * context. We don't need this for MB_GROUP_PA because we only change 3195 * pa_free in ext4_mb_release_context(), but on failure, we've already 3196 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
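* For an inode PA the rollback is simply the inverse of
* ext4_mb_use_inode_pa(): the fe_len clusters subtracted from pa_free
* when the PA was claimed are added back.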
3197 */ 3198 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3199 { 3200 struct ext4_prealloc_space *pa = ac->ac_pa; 3201 3202 if (pa && pa->pa_type == MB_INODE_PA) 3203 pa->pa_free += ac->ac_b_ex.fe_len; 3204 } 3205 3206 /* 3207 * use blocks preallocated to the inode 3208 */ 3209 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3210 struct ext4_prealloc_space *pa) 3211 { 3212 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3213 ext4_fsblk_t start; 3214 ext4_fsblk_t end; 3215 int len; 3216 3217 /* found preallocated blocks, use them */ 3218 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3219 end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), 3220 start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); 3221 len = EXT4_NUM_B2C(sbi, end - start); 3222 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3223 &ac->ac_b_ex.fe_start); 3224 ac->ac_b_ex.fe_len = len; 3225 ac->ac_status = AC_STATUS_FOUND; 3226 ac->ac_pa = pa; 3227 3228 BUG_ON(start < pa->pa_pstart); 3229 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 3230 BUG_ON(pa->pa_free < len); 3231 pa->pa_free -= len; 3232 3233 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); 3234 } 3235 3236 /* 3237 * use blocks preallocated to the locality group 3238 */ 3239 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3240 struct ext4_prealloc_space *pa) 3241 { 3242 unsigned int len = ac->ac_o_ex.fe_len; 3243 3244 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3245 &ac->ac_b_ex.fe_group, 3246 &ac->ac_b_ex.fe_start); 3247 ac->ac_b_ex.fe_len = len; 3248 ac->ac_status = AC_STATUS_FOUND; 3249 ac->ac_pa = pa; 3250 3251 /* we don't correct pa_pstart or pa_len here to avoid a 3252 * possible race when the group is being loaded concurrently; 3253 * instead we correct the pa later, after blocks are marked 3254 * in the on-disk bitmap -- see ext4_mb_release_context(). 3255 * Other CPUs are prevented from allocating from this pa by lg_mutex 3256 */ 3257 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3258 } 3259 3260 /* 3261 * Return the prealloc space that has the minimal distance 3262 * from the goal block. @cpa is the prealloc 3263 * space with the currently known minimal distance 3264 * from the goal block.
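* The helper also manages references: it takes a pa_count reference
* on the pa it returns and drops the one held on the candidate it
* replaces, so the caller always ends up holding exactly one
* reference, on the closest preallocation seen so far.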
3265 */ 3266 static struct ext4_prealloc_space * 3267 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 3268 struct ext4_prealloc_space *pa, 3269 struct ext4_prealloc_space *cpa) 3270 { 3271 ext4_fsblk_t cur_distance, new_distance; 3272 3273 if (cpa == NULL) { 3274 atomic_inc(&pa->pa_count); 3275 return pa; 3276 } 3277 cur_distance = abs(goal_block - cpa->pa_pstart); 3278 new_distance = abs(goal_block - pa->pa_pstart); 3279 3280 if (cur_distance <= new_distance) 3281 return cpa; 3282 3283 /* drop the previous reference */ 3284 atomic_dec(&cpa->pa_count); 3285 atomic_inc(&pa->pa_count); 3286 return pa; 3287 } 3288 3289 /* 3290 * search goal blocks in preallocated space 3291 */ 3292 static noinline_for_stack int 3293 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3294 { 3295 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3296 int order, i; 3297 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3298 struct ext4_locality_group *lg; 3299 struct ext4_prealloc_space *pa, *cpa = NULL; 3300 ext4_fsblk_t goal_block; 3301 3302 /* only data can be preallocated */ 3303 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3304 return 0; 3305 3306 /* first, try per-file preallocation */ 3307 rcu_read_lock(); 3308 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3309 3310 /* all fields in this condition don't change, 3311 * so we can skip locking for them */ 3312 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3313 ac->ac_o_ex.fe_logical >= (pa->pa_lstart + 3314 EXT4_C2B(sbi, pa->pa_len))) 3315 continue; 3316 3317 /* non-extent files can't have physical blocks past 2^32 */ 3318 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3319 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > 3320 EXT4_MAX_BLOCK_FILE_PHYS)) 3321 continue; 3322 3323 /* found preallocated blocks, use them */ 3324 spin_lock(&pa->pa_lock); 3325 if (pa->pa_deleted == 0 && pa->pa_free) { 3326 atomic_inc(&pa->pa_count); 3327 ext4_mb_use_inode_pa(ac, pa); 3328 spin_unlock(&pa->pa_lock); 3329 ac->ac_criteria = 10; 3330 rcu_read_unlock(); 3331 return 1; 3332 } 3333 spin_unlock(&pa->pa_lock); 3334 } 3335 rcu_read_unlock(); 3336 3337 /* can we use group allocation? */ 3338 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3339 return 0; 3340 3341 /* inode may have no locality group for some reason */ 3342 lg = ac->ac_lg; 3343 if (lg == NULL) 3344 return 0; 3345 order = fls(ac->ac_o_ex.fe_len) - 1; 3346 if (order > PREALLOC_TB_SIZE - 1) 3347 /* The max size of hash table is PREALLOC_TB_SIZE */ 3348 order = PREALLOC_TB_SIZE - 1; 3349 3350 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 3351 /* 3352 * search for the prealloc space that is having 3353 * minimal distance from the goal block. 3354 */ 3355 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3356 rcu_read_lock(); 3357 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3358 pa_inode_list) { 3359 spin_lock(&pa->pa_lock); 3360 if (pa->pa_deleted == 0 && 3361 pa->pa_free >= ac->ac_o_ex.fe_len) { 3362 3363 cpa = ext4_mb_check_group_pa(goal_block, 3364 pa, cpa); 3365 } 3366 spin_unlock(&pa->pa_lock); 3367 } 3368 rcu_read_unlock(); 3369 } 3370 if (cpa) { 3371 ext4_mb_use_group_pa(ac, cpa); 3372 ac->ac_criteria = 20; 3373 return 1; 3374 } 3375 return 0; 3376 } 3377 3378 /* 3379 * the function goes through all block freed in the group 3380 * but not yet committed and marks them used in in-core bitmap. 
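* (these are the ext4_free_data extents parked on the group's
* bb_free_root rb-tree; they stay busy in the buddy until the jbd2
* commit callback really releases them)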
3381 * buddy must be generated from this bitmap 3382 * Need to be called with the ext4 group lock held 3383 */ 3384 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3385 ext4_group_t group) 3386 { 3387 struct rb_node *n; 3388 struct ext4_group_info *grp; 3389 struct ext4_free_data *entry; 3390 3391 grp = ext4_get_group_info(sb, group); 3392 n = rb_first(&(grp->bb_free_root)); 3393 3394 while (n) { 3395 entry = rb_entry(n, struct ext4_free_data, efd_node); 3396 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); 3397 n = rb_next(n); 3398 } 3399 return; 3400 } 3401 3402 /* 3403 * the function goes through all preallocation in this group and marks them 3404 * used in in-core bitmap. buddy must be generated from this bitmap 3405 * Need to be called with ext4 group lock held 3406 */ 3407 static noinline_for_stack 3408 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3409 ext4_group_t group) 3410 { 3411 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3412 struct ext4_prealloc_space *pa; 3413 struct list_head *cur; 3414 ext4_group_t groupnr; 3415 ext4_grpblk_t start; 3416 int preallocated = 0; 3417 int len; 3418 3419 /* all form of preallocation discards first load group, 3420 * so the only competing code is preallocation use. 3421 * we don't need any locking here 3422 * notice we do NOT ignore preallocations with pa_deleted 3423 * otherwise we could leave used blocks available for 3424 * allocation in buddy when concurrent ext4_mb_put_pa() 3425 * is dropping preallocation 3426 */ 3427 list_for_each(cur, &grp->bb_prealloc_list) { 3428 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3429 spin_lock(&pa->pa_lock); 3430 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3431 &groupnr, &start); 3432 len = pa->pa_len; 3433 spin_unlock(&pa->pa_lock); 3434 if (unlikely(len == 0)) 3435 continue; 3436 BUG_ON(groupnr != group); 3437 ext4_set_bits(bitmap, start, len); 3438 preallocated += len; 3439 } 3440 mb_debug(1, "prellocated %u for group %u\n", preallocated, group); 3441 } 3442 3443 static void ext4_mb_pa_callback(struct rcu_head *head) 3444 { 3445 struct ext4_prealloc_space *pa; 3446 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3447 3448 BUG_ON(atomic_read(&pa->pa_count)); 3449 BUG_ON(pa->pa_deleted == 0); 3450 kmem_cache_free(ext4_pspace_cachep, pa); 3451 } 3452 3453 /* 3454 * drops a reference to preallocated space descriptor 3455 * if this was the last reference and the space is consumed 3456 */ 3457 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3458 struct super_block *sb, struct ext4_prealloc_space *pa) 3459 { 3460 ext4_group_t grp; 3461 ext4_fsblk_t grp_blk; 3462 3463 /* in this short window concurrent discard can set pa_deleted */ 3464 spin_lock(&pa->pa_lock); 3465 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { 3466 spin_unlock(&pa->pa_lock); 3467 return; 3468 } 3469 3470 if (pa->pa_deleted == 1) { 3471 spin_unlock(&pa->pa_lock); 3472 return; 3473 } 3474 3475 pa->pa_deleted = 1; 3476 spin_unlock(&pa->pa_lock); 3477 3478 grp_blk = pa->pa_pstart; 3479 /* 3480 * If doing group-based preallocation, pa_pstart may be in the 3481 * next group when pa is used up 3482 */ 3483 if (pa->pa_type == MB_GROUP_PA) 3484 grp_blk--; 3485 3486 grp = ext4_get_group_number(sb, grp_blk); 3487 3488 /* 3489 * possible race: 3490 * 3491 * P1 (buddy init) P2 (regular allocation) 3492 * find block B in PA 3493 * copy on-disk bitmap to buddy 3494 * mark B in on-disk bitmap 3495 * 
drop PA from group 3496 * mark all PAs in buddy 3497 * 3498 * thus, P1 initializes buddy with B available. to prevent this 3499 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3500 * against that pair 3501 */ 3502 ext4_lock_group(sb, grp); 3503 list_del(&pa->pa_group_list); 3504 ext4_unlock_group(sb, grp); 3505 3506 spin_lock(pa->pa_obj_lock); 3507 list_del_rcu(&pa->pa_inode_list); 3508 spin_unlock(pa->pa_obj_lock); 3509 3510 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3511 } 3512 3513 /* 3514 * creates new preallocated space for given inode 3515 */ 3516 static noinline_for_stack int 3517 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3518 { 3519 struct super_block *sb = ac->ac_sb; 3520 struct ext4_sb_info *sbi = EXT4_SB(sb); 3521 struct ext4_prealloc_space *pa; 3522 struct ext4_group_info *grp; 3523 struct ext4_inode_info *ei; 3524 3525 /* preallocate only when found space is larger then requested */ 3526 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3527 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3528 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3529 3530 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3531 if (pa == NULL) 3532 return -ENOMEM; 3533 3534 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3535 int winl; 3536 int wins; 3537 int win; 3538 int offs; 3539 3540 /* we can't allocate as much as normalizer wants. 3541 * so, found space must get proper lstart 3542 * to cover original request */ 3543 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3544 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3545 3546 /* we're limited by original request in that 3547 * logical block must be covered any way 3548 * winl is window we can move our chunk within */ 3549 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3550 3551 /* also, we should cover whole original request */ 3552 wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); 3553 3554 /* the smallest one defines real window */ 3555 win = min(winl, wins); 3556 3557 offs = ac->ac_o_ex.fe_logical % 3558 EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 3559 if (offs && offs < win) 3560 win = offs; 3561 3562 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - 3563 EXT4_NUM_B2C(sbi, win); 3564 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3565 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3566 } 3567 3568 /* preallocation can change ac_b_ex, thus we store actually 3569 * allocated blocks for history */ 3570 ac->ac_f_ex = ac->ac_b_ex; 3571 3572 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3573 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3574 pa->pa_len = ac->ac_b_ex.fe_len; 3575 pa->pa_free = pa->pa_len; 3576 atomic_set(&pa->pa_count, 1); 3577 spin_lock_init(&pa->pa_lock); 3578 INIT_LIST_HEAD(&pa->pa_inode_list); 3579 INIT_LIST_HEAD(&pa->pa_group_list); 3580 pa->pa_deleted = 0; 3581 pa->pa_type = MB_INODE_PA; 3582 3583 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, 3584 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3585 trace_ext4_mb_new_inode_pa(ac, pa); 3586 3587 ext4_mb_use_inode_pa(ac, pa); 3588 atomic_add(pa->pa_free, &sbi->s_mb_preallocated); 3589 3590 ei = EXT4_I(ac->ac_inode); 3591 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3592 3593 pa->pa_obj_lock = &ei->i_prealloc_lock; 3594 pa->pa_inode = ac->ac_inode; 3595 3596 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3597 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3598 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3599 3600 spin_lock(pa->pa_obj_lock); 3601 list_add_rcu(&pa->pa_inode_list, 
&ei->i_prealloc_list); 3602 spin_unlock(pa->pa_obj_lock); 3603 3604 return 0; 3605 } 3606 3607 /* 3608 * creates new preallocated space for locality group inodes belongs to 3609 */ 3610 static noinline_for_stack int 3611 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3612 { 3613 struct super_block *sb = ac->ac_sb; 3614 struct ext4_locality_group *lg; 3615 struct ext4_prealloc_space *pa; 3616 struct ext4_group_info *grp; 3617 3618 /* preallocate only when found space is larger then requested */ 3619 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3620 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3621 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3622 3623 BUG_ON(ext4_pspace_cachep == NULL); 3624 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3625 if (pa == NULL) 3626 return -ENOMEM; 3627 3628 /* preallocation can change ac_b_ex, thus we store actually 3629 * allocated blocks for history */ 3630 ac->ac_f_ex = ac->ac_b_ex; 3631 3632 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3633 pa->pa_lstart = pa->pa_pstart; 3634 pa->pa_len = ac->ac_b_ex.fe_len; 3635 pa->pa_free = pa->pa_len; 3636 atomic_set(&pa->pa_count, 1); 3637 spin_lock_init(&pa->pa_lock); 3638 INIT_LIST_HEAD(&pa->pa_inode_list); 3639 INIT_LIST_HEAD(&pa->pa_group_list); 3640 pa->pa_deleted = 0; 3641 pa->pa_type = MB_GROUP_PA; 3642 3643 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, 3644 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3645 trace_ext4_mb_new_group_pa(ac, pa); 3646 3647 ext4_mb_use_group_pa(ac, pa); 3648 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3649 3650 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3651 lg = ac->ac_lg; 3652 BUG_ON(lg == NULL); 3653 3654 pa->pa_obj_lock = &lg->lg_prealloc_lock; 3655 pa->pa_inode = NULL; 3656 3657 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3658 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3659 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3660 3661 /* 3662 * We will later add the new pa to the right bucket 3663 * after updating the pa_free in ext4_mb_release_context 3664 */ 3665 return 0; 3666 } 3667 3668 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3669 { 3670 int err; 3671 3672 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3673 err = ext4_mb_new_group_pa(ac); 3674 else 3675 err = ext4_mb_new_inode_pa(ac); 3676 return err; 3677 } 3678 3679 /* 3680 * finds all unused blocks in on-disk bitmap, frees them in 3681 * in-core bitmap and buddy. 3682 * @pa must be unlinked from inode and group lists, so that 3683 * nobody else can find/use it. 3684 * the caller MUST hold group/inode locks. 
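* The loop below walks the on-disk bitmap across the PA's range,
* returns every run of still-clear (unused) bits to the buddy via
* mb_free_blocks(), and cross-checks the running total against
* pa_free, complaining loudly if the two disagree.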
3685 * TODO: optimize the case when there are no in-core structures yet 3686 */ 3687 static noinline_for_stack int 3688 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3689 struct ext4_prealloc_space *pa) 3690 { 3691 struct super_block *sb = e4b->bd_sb; 3692 struct ext4_sb_info *sbi = EXT4_SB(sb); 3693 unsigned int end; 3694 unsigned int next; 3695 ext4_group_t group; 3696 ext4_grpblk_t bit; 3697 unsigned long long grp_blk_start; 3698 int err = 0; 3699 int free = 0; 3700 3701 BUG_ON(pa->pa_deleted == 0); 3702 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3703 grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 3704 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3705 end = bit + pa->pa_len; 3706 3707 while (bit < end) { 3708 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3709 if (bit >= end) 3710 break; 3711 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3712 mb_debug(1, " free preallocated %u/%u in group %u\n", 3713 (unsigned) ext4_group_first_block_no(sb, group) + bit, 3714 (unsigned) next - bit, (unsigned) group); 3715 free += next - bit; 3716 3717 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3718 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + 3719 EXT4_C2B(sbi, bit)), 3720 next - bit); 3721 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3722 bit = next + 1; 3723 } 3724 if (free != pa->pa_free) { 3725 ext4_msg(e4b->bd_sb, KERN_CRIT, 3726 "pa %p: logic %lu, phys. %lu, len %lu", 3727 pa, (unsigned long) pa->pa_lstart, 3728 (unsigned long) pa->pa_pstart, 3729 (unsigned long) pa->pa_len); 3730 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3731 free, pa->pa_free); 3732 /* 3733 * pa is already deleted so we use the value obtained 3734 * from the bitmap and continue. 
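* (s_mb_discarded is therefore charged with the clusters actually
* found free in the bitmap, not with the possibly stale pa_free.)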
3735 */ 3736 } 3737 atomic_add(free, &sbi->s_mb_discarded); 3738 3739 return err; 3740 } 3741 3742 static noinline_for_stack int 3743 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3744 struct ext4_prealloc_space *pa) 3745 { 3746 struct super_block *sb = e4b->bd_sb; 3747 ext4_group_t group; 3748 ext4_grpblk_t bit; 3749 3750 trace_ext4_mb_release_group_pa(sb, pa); 3751 BUG_ON(pa->pa_deleted == 0); 3752 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3753 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3754 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3755 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3756 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 3757 3758 return 0; 3759 } 3760 3761 /* 3762 * releases all preallocations in given group 3763 * 3764 * first, we need to decide discard policy: 3765 * - when do we discard 3766 * 1) ENOSPC 3767 * - how many do we discard 3768 * 1) how many requested 3769 */ 3770 static noinline_for_stack int 3771 ext4_mb_discard_group_preallocations(struct super_block *sb, 3772 ext4_group_t group, int needed) 3773 { 3774 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3775 struct buffer_head *bitmap_bh = NULL; 3776 struct ext4_prealloc_space *pa, *tmp; 3777 struct list_head list; 3778 struct ext4_buddy e4b; 3779 int err; 3780 int busy = 0; 3781 int free = 0; 3782 3783 mb_debug(1, "discard preallocation for group %u\n", group); 3784 3785 if (list_empty(&grp->bb_prealloc_list)) 3786 return 0; 3787 3788 bitmap_bh = ext4_read_block_bitmap(sb, group); 3789 if (bitmap_bh == NULL) { 3790 ext4_error(sb, "Error reading block bitmap for %u", group); 3791 return 0; 3792 } 3793 3794 err = ext4_mb_load_buddy(sb, group, &e4b); 3795 if (err) { 3796 ext4_error(sb, "Error loading buddy information for %u", group); 3797 put_bh(bitmap_bh); 3798 return 0; 3799 } 3800 3801 if (needed == 0) 3802 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 3803 3804 INIT_LIST_HEAD(&list); 3805 repeat: 3806 ext4_lock_group(sb, group); 3807 list_for_each_entry_safe(pa, tmp, 3808 &grp->bb_prealloc_list, pa_group_list) { 3809 spin_lock(&pa->pa_lock); 3810 if (atomic_read(&pa->pa_count)) { 3811 spin_unlock(&pa->pa_lock); 3812 busy = 1; 3813 continue; 3814 } 3815 if (pa->pa_deleted) { 3816 spin_unlock(&pa->pa_lock); 3817 continue; 3818 } 3819 3820 /* seems this one can be freed ... */ 3821 pa->pa_deleted = 1; 3822 3823 /* we can trust pa_free ... */ 3824 free += pa->pa_free; 3825 3826 spin_unlock(&pa->pa_lock); 3827 3828 list_del(&pa->pa_group_list); 3829 list_add(&pa->u.pa_tmp_list, &list); 3830 } 3831 3832 /* if we still need more blocks and some PAs were used, try again */ 3833 if (free < needed && busy) { 3834 busy = 0; 3835 ext4_unlock_group(sb, group); 3836 cond_resched(); 3837 goto repeat; 3838 } 3839 3840 /* found anything to free? 
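* (PAs that were busy -- pa_count still held by a racing allocation --
* were skipped above and, while more space was needed, retried after
* dropping the group lock; whatever made it onto the local list is
* now safe to release.)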
*/ 3841 if (list_empty(&list)) { 3842 BUG_ON(free != 0); 3843 goto out; 3844 } 3845 3846 /* now free all selected PAs */ 3847 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3848 3849 /* remove from object (inode or locality group) */ 3850 spin_lock(pa->pa_obj_lock); 3851 list_del_rcu(&pa->pa_inode_list); 3852 spin_unlock(pa->pa_obj_lock); 3853 3854 if (pa->pa_type == MB_GROUP_PA) 3855 ext4_mb_release_group_pa(&e4b, pa); 3856 else 3857 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 3858 3859 list_del(&pa->u.pa_tmp_list); 3860 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3861 } 3862 3863 out: 3864 ext4_unlock_group(sb, group); 3865 ext4_mb_unload_buddy(&e4b); 3866 put_bh(bitmap_bh); 3867 return free; 3868 } 3869 3870 /* 3871 * releases all non-used preallocated blocks for given inode 3872 * 3873 * It's important to discard preallocations under i_data_sem 3874 * We don't want another block to be served from the prealloc 3875 * space when we are discarding the inode prealloc space. 3876 * 3877 * FIXME!! Make sure it is valid at all the call sites 3878 */ 3879 void ext4_discard_preallocations(struct inode *inode) 3880 { 3881 struct ext4_inode_info *ei = EXT4_I(inode); 3882 struct super_block *sb = inode->i_sb; 3883 struct buffer_head *bitmap_bh = NULL; 3884 struct ext4_prealloc_space *pa, *tmp; 3885 ext4_group_t group = 0; 3886 struct list_head list; 3887 struct ext4_buddy e4b; 3888 int err; 3889 3890 if (!S_ISREG(inode->i_mode)) { 3891 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3892 return; 3893 } 3894 3895 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); 3896 trace_ext4_discard_preallocations(inode); 3897 3898 INIT_LIST_HEAD(&list); 3899 3900 repeat: 3901 /* first, collect all pa's in the inode */ 3902 spin_lock(&ei->i_prealloc_lock); 3903 while (!list_empty(&ei->i_prealloc_list)) { 3904 pa = list_entry(ei->i_prealloc_list.next, 3905 struct ext4_prealloc_space, pa_inode_list); 3906 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 3907 spin_lock(&pa->pa_lock); 3908 if (atomic_read(&pa->pa_count)) { 3909 /* this shouldn't happen often - nobody should 3910 * use preallocation while we're discarding it */ 3911 spin_unlock(&pa->pa_lock); 3912 spin_unlock(&ei->i_prealloc_lock); 3913 ext4_msg(sb, KERN_ERR, 3914 "uh-oh! used pa while discarding"); 3915 WARN_ON(1); 3916 schedule_timeout_uninterruptible(HZ); 3917 goto repeat; 3918 3919 } 3920 if (pa->pa_deleted == 0) { 3921 pa->pa_deleted = 1; 3922 spin_unlock(&pa->pa_lock); 3923 list_del_rcu(&pa->pa_inode_list); 3924 list_add(&pa->u.pa_tmp_list, &list); 3925 continue; 3926 } 3927 3928 /* someone is deleting pa right now */ 3929 spin_unlock(&pa->pa_lock); 3930 spin_unlock(&ei->i_prealloc_lock); 3931 3932 /* we have to wait here because pa_deleted 3933 * doesn't mean pa is already unlinked from 3934 * the list. 
as we might be called from 3935 * ->clear_inode() the inode will get freed 3936 * and concurrent thread which is unlinking 3937 * pa from inode's list may access already 3938 * freed memory, bad-bad-bad */ 3939 3940 /* XXX: if this happens too often, we can 3941 * add a flag to force wait only in case 3942 * of ->clear_inode(), but not in case of 3943 * regular truncate */ 3944 schedule_timeout_uninterruptible(HZ); 3945 goto repeat; 3946 } 3947 spin_unlock(&ei->i_prealloc_lock); 3948 3949 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3950 BUG_ON(pa->pa_type != MB_INODE_PA); 3951 group = ext4_get_group_number(sb, pa->pa_pstart); 3952 3953 err = ext4_mb_load_buddy(sb, group, &e4b); 3954 if (err) { 3955 ext4_error(sb, "Error loading buddy information for %u", 3956 group); 3957 continue; 3958 } 3959 3960 bitmap_bh = ext4_read_block_bitmap(sb, group); 3961 if (bitmap_bh == NULL) { 3962 ext4_error(sb, "Error reading block bitmap for %u", 3963 group); 3964 ext4_mb_unload_buddy(&e4b); 3965 continue; 3966 } 3967 3968 ext4_lock_group(sb, group); 3969 list_del(&pa->pa_group_list); 3970 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 3971 ext4_unlock_group(sb, group); 3972 3973 ext4_mb_unload_buddy(&e4b); 3974 put_bh(bitmap_bh); 3975 3976 list_del(&pa->u.pa_tmp_list); 3977 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3978 } 3979 } 3980 3981 #ifdef CONFIG_EXT4_DEBUG 3982 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3983 { 3984 struct super_block *sb = ac->ac_sb; 3985 ext4_group_t ngroups, i; 3986 3987 if (!ext4_mballoc_debug || 3988 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3989 return; 3990 3991 ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" 3992 " Allocation context details:"); 3993 ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", 3994 ac->ac_status, ac->ac_flags); 3995 ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " 3996 "goal %lu/%lu/%lu@%lu, " 3997 "best %lu/%lu/%lu@%lu cr %d", 3998 (unsigned long)ac->ac_o_ex.fe_group, 3999 (unsigned long)ac->ac_o_ex.fe_start, 4000 (unsigned long)ac->ac_o_ex.fe_len, 4001 (unsigned long)ac->ac_o_ex.fe_logical, 4002 (unsigned long)ac->ac_g_ex.fe_group, 4003 (unsigned long)ac->ac_g_ex.fe_start, 4004 (unsigned long)ac->ac_g_ex.fe_len, 4005 (unsigned long)ac->ac_g_ex.fe_logical, 4006 (unsigned long)ac->ac_b_ex.fe_group, 4007 (unsigned long)ac->ac_b_ex.fe_start, 4008 (unsigned long)ac->ac_b_ex.fe_len, 4009 (unsigned long)ac->ac_b_ex.fe_logical, 4010 (int)ac->ac_criteria); 4011 ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); 4012 ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); 4013 ngroups = ext4_get_groups_count(sb); 4014 for (i = 0; i < ngroups; i++) { 4015 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4016 struct ext4_prealloc_space *pa; 4017 ext4_grpblk_t start; 4018 struct list_head *cur; 4019 ext4_lock_group(sb, i); 4020 list_for_each(cur, &grp->bb_prealloc_list) { 4021 pa = list_entry(cur, struct ext4_prealloc_space, 4022 pa_group_list); 4023 spin_lock(&pa->pa_lock); 4024 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4025 NULL, &start); 4026 spin_unlock(&pa->pa_lock); 4027 printk(KERN_ERR "PA:%u:%d:%u \n", i, 4028 start, pa->pa_len); 4029 } 4030 ext4_unlock_group(sb, i); 4031 4032 if (grp->bb_free == 0) 4033 continue; 4034 printk(KERN_ERR "%u: %d/%d \n", 4035 i, grp->bb_free, grp->bb_fragments); 4036 } 4037 printk(KERN_ERR "\n"); 4038 } 4039 #else 4040 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4041 { 4042 return; 4043 } 4044 #endif 4045 4046 /* 4047 * We use 
locality group preallocation for small size file. The size of the 4048 * file is determined by the current size or the resulting size after 4049 * allocation which ever is larger 4050 * 4051 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 4052 */ 4053 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4054 { 4055 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4056 int bsbits = ac->ac_sb->s_blocksize_bits; 4057 loff_t size, isize; 4058 4059 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4060 return; 4061 4062 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4063 return; 4064 4065 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 4066 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 4067 >> bsbits; 4068 4069 if ((size == isize) && 4070 !ext4_fs_is_busy(sbi) && 4071 (atomic_read(&ac->ac_inode->i_writecount) == 0)) { 4072 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 4073 return; 4074 } 4075 4076 if (sbi->s_mb_group_prealloc <= 0) { 4077 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 4078 return; 4079 } 4080 4081 /* don't use group allocation for large files */ 4082 size = max(size, isize); 4083 if (size > sbi->s_mb_stream_request) { 4084 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 4085 return; 4086 } 4087 4088 BUG_ON(ac->ac_lg != NULL); 4089 /* 4090 * locality group prealloc space are per cpu. The reason for having 4091 * per cpu locality group is to reduce the contention between block 4092 * request from multiple CPUs. 4093 */ 4094 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); 4095 4096 /* we're going to use group allocation */ 4097 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4098 4099 /* serialize all allocations in the group */ 4100 mutex_lock(&ac->ac_lg->lg_mutex); 4101 } 4102 4103 static noinline_for_stack int 4104 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 4105 struct ext4_allocation_request *ar) 4106 { 4107 struct super_block *sb = ar->inode->i_sb; 4108 struct ext4_sb_info *sbi = EXT4_SB(sb); 4109 struct ext4_super_block *es = sbi->s_es; 4110 ext4_group_t group; 4111 unsigned int len; 4112 ext4_fsblk_t goal; 4113 ext4_grpblk_t block; 4114 4115 /* we can't allocate > group size */ 4116 len = ar->len; 4117 4118 /* just a dirty hack to filter too big requests */ 4119 if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) 4120 len = EXT4_CLUSTERS_PER_GROUP(sb); 4121 4122 /* start searching from the goal */ 4123 goal = ar->goal; 4124 if (goal < le32_to_cpu(es->s_first_data_block) || 4125 goal >= ext4_blocks_count(es)) 4126 goal = le32_to_cpu(es->s_first_data_block); 4127 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4128 4129 /* set up allocation goals */ 4130 ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); 4131 ac->ac_status = AC_STATUS_CONTINUE; 4132 ac->ac_sb = sb; 4133 ac->ac_inode = ar->inode; 4134 ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; 4135 ac->ac_o_ex.fe_group = group; 4136 ac->ac_o_ex.fe_start = block; 4137 ac->ac_o_ex.fe_len = len; 4138 ac->ac_g_ex = ac->ac_o_ex; 4139 ac->ac_flags = ar->flags; 4140 4141 /* we have to define context: we'll we work with a file or 4142 * locality group. 
this is a policy, actually */ 4143 ext4_mb_group_or_file(ac); 4144 4145 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4146 "left: %u/%u, right %u/%u to %swritable\n", 4147 (unsigned) ar->len, (unsigned) ar->logical, 4148 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4149 (unsigned) ar->lleft, (unsigned) ar->pleft, 4150 (unsigned) ar->lright, (unsigned) ar->pright, 4151 atomic_read(&ar->inode->i_writecount) ? "" : "non-"); 4152 return 0; 4153 4154 } 4155 4156 static noinline_for_stack void 4157 ext4_mb_discard_lg_preallocations(struct super_block *sb, 4158 struct ext4_locality_group *lg, 4159 int order, int total_entries) 4160 { 4161 ext4_group_t group = 0; 4162 struct ext4_buddy e4b; 4163 struct list_head discard_list; 4164 struct ext4_prealloc_space *pa, *tmp; 4165 4166 mb_debug(1, "discard locality group preallocation\n"); 4167 4168 INIT_LIST_HEAD(&discard_list); 4169 4170 spin_lock(&lg->lg_prealloc_lock); 4171 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4172 pa_inode_list) { 4173 spin_lock(&pa->pa_lock); 4174 if (atomic_read(&pa->pa_count)) { 4175 /* 4176 * This is the pa that we just used 4177 * for block allocation. So don't 4178 * free it. 4179 */ 4180 spin_unlock(&pa->pa_lock); 4181 continue; 4182 } 4183 if (pa->pa_deleted) { 4184 spin_unlock(&pa->pa_lock); 4185 continue; 4186 } 4187 /* only lg prealloc space */ 4188 BUG_ON(pa->pa_type != MB_GROUP_PA); 4189 4190 /* seems this one can be freed ... */ 4191 pa->pa_deleted = 1; 4192 spin_unlock(&pa->pa_lock); 4193 4194 list_del_rcu(&pa->pa_inode_list); 4195 list_add(&pa->u.pa_tmp_list, &discard_list); 4196 4197 total_entries--; 4198 if (total_entries <= 5) { 4199 /* 4200 * we want to keep only 5 entries 4201 * allowing it to grow to 8. This 4202 * makes sure we don't call discard 4203 * again soon for this list. 4204 */ 4205 break; 4206 } 4207 } 4208 spin_unlock(&lg->lg_prealloc_lock); 4209 4210 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4211 4212 group = ext4_get_group_number(sb, pa->pa_pstart); 4213 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4214 ext4_error(sb, "Error loading buddy information for %u", 4215 group); 4216 continue; 4217 } 4218 ext4_lock_group(sb, group); 4219 list_del(&pa->pa_group_list); 4220 ext4_mb_release_group_pa(&e4b, pa); 4221 ext4_unlock_group(sb, group); 4222 4223 ext4_mb_unload_buddy(&e4b); 4224 list_del(&pa->u.pa_tmp_list); 4225 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4226 } 4227 } 4228 4229 /* 4230 * We have incremented pa_count. So it cannot be freed at this 4231 * point. Also we hold lg_mutex. So no parallel allocation is 4232 * possible from this lg. That means pa_free cannot be updated. 4233 * 4234 * A parallel ext4_mb_discard_group_preallocations is possible, 4235 * which can cause the lg_prealloc_list to be updated.
4236 */ 4237 4238 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 4239 { 4240 int order, added = 0, lg_prealloc_count = 1; 4241 struct super_block *sb = ac->ac_sb; 4242 struct ext4_locality_group *lg = ac->ac_lg; 4243 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 4244 4245 order = fls(pa->pa_free) - 1; 4246 if (order > PREALLOC_TB_SIZE - 1) 4247 /* The max size of hash table is PREALLOC_TB_SIZE */ 4248 order = PREALLOC_TB_SIZE - 1; 4249 /* Add the prealloc space to lg */ 4250 spin_lock(&lg->lg_prealloc_lock); 4251 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 4252 pa_inode_list) { 4253 spin_lock(&tmp_pa->pa_lock); 4254 if (tmp_pa->pa_deleted) { 4255 spin_unlock(&tmp_pa->pa_lock); 4256 continue; 4257 } 4258 if (!added && pa->pa_free < tmp_pa->pa_free) { 4259 /* Add to the tail of the previous entry */ 4260 list_add_tail_rcu(&pa->pa_inode_list, 4261 &tmp_pa->pa_inode_list); 4262 added = 1; 4263 /* 4264 * we want to count the total 4265 * number of entries in the list 4266 */ 4267 } 4268 spin_unlock(&tmp_pa->pa_lock); 4269 lg_prealloc_count++; 4270 } 4271 if (!added) 4272 list_add_tail_rcu(&pa->pa_inode_list, 4273 &lg->lg_prealloc_list[order]); 4274 spin_unlock(&lg->lg_prealloc_lock); 4275 4276 /* Now trim the list to be not more than 8 elements */ 4277 if (lg_prealloc_count > 8) { 4278 ext4_mb_discard_lg_preallocations(sb, lg, 4279 order, lg_prealloc_count); 4280 return; 4281 } 4282 return ; 4283 } 4284 4285 /* 4286 * release all resource we used in allocation 4287 */ 4288 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4289 { 4290 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4291 struct ext4_prealloc_space *pa = ac->ac_pa; 4292 if (pa) { 4293 if (pa->pa_type == MB_GROUP_PA) { 4294 /* see comment in ext4_mb_use_group_pa() */ 4295 spin_lock(&pa->pa_lock); 4296 pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4297 pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4298 pa->pa_free -= ac->ac_b_ex.fe_len; 4299 pa->pa_len -= ac->ac_b_ex.fe_len; 4300 spin_unlock(&pa->pa_lock); 4301 } 4302 } 4303 if (pa) { 4304 /* 4305 * We want to add the pa to the right bucket. 4306 * Remove it from the list and while adding 4307 * make sure the list to which we are adding 4308 * doesn't grow big. 
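* (ext4_mb_add_n_trim() picks the bucket as order = fls(pa_free) - 1, capped at PREALLOC_TB_SIZE - 1, so a pa with, for instance, pa_free == 100 clusters is filed on lg_prealloc_list[6].)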
4309 */ 4310 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4311 spin_lock(pa->pa_obj_lock); 4312 list_del_rcu(&pa->pa_inode_list); 4313 spin_unlock(pa->pa_obj_lock); 4314 ext4_mb_add_n_trim(ac); 4315 } 4316 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4317 } 4318 if (ac->ac_bitmap_page) 4319 page_cache_release(ac->ac_bitmap_page); 4320 if (ac->ac_buddy_page) 4321 page_cache_release(ac->ac_buddy_page); 4322 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4323 mutex_unlock(&ac->ac_lg->lg_mutex); 4324 ext4_mb_collect_stats(ac); 4325 return 0; 4326 } 4327 4328 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4329 { 4330 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4331 int ret; 4332 int freed = 0; 4333 4334 trace_ext4_mb_discard_preallocations(sb, needed); 4335 for (i = 0; i < ngroups && needed > 0; i++) { 4336 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4337 freed += ret; 4338 needed -= ret; 4339 } 4340 4341 return freed; 4342 } 4343 4344 /* 4345 * Main entry point into mballoc to allocate blocks 4346 * it tries to use preallocation first, then falls back 4347 * to usual allocation 4348 */ 4349 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4350 struct ext4_allocation_request *ar, int *errp) 4351 { 4352 int freed; 4353 struct ext4_allocation_context *ac = NULL; 4354 struct ext4_sb_info *sbi; 4355 struct super_block *sb; 4356 ext4_fsblk_t block = 0; 4357 unsigned int inquota = 0; 4358 unsigned int reserv_clstrs = 0; 4359 4360 might_sleep(); 4361 sb = ar->inode->i_sb; 4362 sbi = EXT4_SB(sb); 4363 4364 trace_ext4_request_blocks(ar); 4365 4366 /* Allow to use superuser reservation for quota file */ 4367 if (IS_NOQUOTA(ar->inode)) 4368 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 4369 4370 /* 4371 * For delayed allocation, we could skip the ENOSPC and 4372 * EDQUOT check, as blocks and quotas have been already 4373 * reserved when data being copied into pagecache. 4374 */ 4375 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) 4376 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4377 else { 4378 /* Without delayed allocation we need to verify 4379 * there is enough free blocks to do block allocation 4380 * and verify allocation doesn't exceed the quota limits. 
4381 */ 4382 while (ar->len && 4383 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { 4384 4385 /* let others to free the space */ 4386 cond_resched(); 4387 ar->len = ar->len >> 1; 4388 } 4389 if (!ar->len) { 4390 *errp = -ENOSPC; 4391 return 0; 4392 } 4393 reserv_clstrs = ar->len; 4394 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { 4395 dquot_alloc_block_nofail(ar->inode, 4396 EXT4_C2B(sbi, ar->len)); 4397 } else { 4398 while (ar->len && 4399 dquot_alloc_block(ar->inode, 4400 EXT4_C2B(sbi, ar->len))) { 4401 4402 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4403 ar->len--; 4404 } 4405 } 4406 inquota = ar->len; 4407 if (ar->len == 0) { 4408 *errp = -EDQUOT; 4409 goto out; 4410 } 4411 } 4412 4413 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); 4414 if (!ac) { 4415 ar->len = 0; 4416 *errp = -ENOMEM; 4417 goto out; 4418 } 4419 4420 *errp = ext4_mb_initialize_context(ac, ar); 4421 if (*errp) { 4422 ar->len = 0; 4423 goto out; 4424 } 4425 4426 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4427 if (!ext4_mb_use_preallocated(ac)) { 4428 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4429 ext4_mb_normalize_request(ac, ar); 4430 repeat: 4431 /* allocate space in core */ 4432 *errp = ext4_mb_regular_allocator(ac); 4433 if (*errp) 4434 goto discard_and_exit; 4435 4436 /* as we've just preallocated more space than 4437 * user requested originally, we store allocated 4438 * space in a special descriptor */ 4439 if (ac->ac_status == AC_STATUS_FOUND && 4440 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4441 *errp = ext4_mb_new_preallocation(ac); 4442 if (*errp) { 4443 discard_and_exit: 4444 ext4_discard_allocated_blocks(ac); 4445 goto errout; 4446 } 4447 } 4448 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4449 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4450 if (*errp == -EAGAIN) { 4451 /* 4452 * drop the reference that we took 4453 * in ext4_mb_use_best_found 4454 */ 4455 ext4_mb_release_context(ac); 4456 ac->ac_b_ex.fe_group = 0; 4457 ac->ac_b_ex.fe_start = 0; 4458 ac->ac_b_ex.fe_len = 0; 4459 ac->ac_status = AC_STATUS_CONTINUE; 4460 goto repeat; 4461 } else if (*errp) { 4462 ext4_discard_allocated_blocks(ac); 4463 goto errout; 4464 } else { 4465 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4466 ar->len = ac->ac_b_ex.fe_len; 4467 } 4468 } else { 4469 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 4470 if (freed) 4471 goto repeat; 4472 *errp = -ENOSPC; 4473 } 4474 4475 errout: 4476 if (*errp) { 4477 ac->ac_b_ex.fe_len = 0; 4478 ar->len = 0; 4479 ext4_mb_show_ac(ac); 4480 } 4481 ext4_mb_release_context(ac); 4482 out: 4483 if (ac) 4484 kmem_cache_free(ext4_ac_cachep, ac); 4485 if (inquota && ar->len < inquota) 4486 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 4487 if (!ar->len) { 4488 if (!ext4_test_inode_state(ar->inode, 4489 EXT4_STATE_DELALLOC_RESERVED)) 4490 /* release all the reserved blocks if non delalloc */ 4491 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 4492 reserv_clstrs); 4493 } 4494 4495 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4496 4497 return block; 4498 } 4499 4500 /* 4501 * We can merge two free data extents only if the physical blocks 4502 * are contiguous, AND the extents were freed by the same transaction, 4503 * AND the blocks are associated with the same group. 
4504 */ 4505 static int can_merge(struct ext4_free_data *entry1, 4506 struct ext4_free_data *entry2) 4507 { 4508 if ((entry1->efd_tid == entry2->efd_tid) && 4509 (entry1->efd_group == entry2->efd_group) && 4510 ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) 4511 return 1; 4512 return 0; 4513 } 4514 4515 static noinline_for_stack int 4516 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4517 struct ext4_free_data *new_entry) 4518 { 4519 ext4_group_t group = e4b->bd_group; 4520 ext4_grpblk_t cluster; 4521 struct ext4_free_data *entry; 4522 struct ext4_group_info *db = e4b->bd_info; 4523 struct super_block *sb = e4b->bd_sb; 4524 struct ext4_sb_info *sbi = EXT4_SB(sb); 4525 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4526 struct rb_node *parent = NULL, *new_node; 4527 4528 BUG_ON(!ext4_handle_valid(handle)); 4529 BUG_ON(e4b->bd_bitmap_page == NULL); 4530 BUG_ON(e4b->bd_buddy_page == NULL); 4531 4532 new_node = &new_entry->efd_node; 4533 cluster = new_entry->efd_start_cluster; 4534 4535 if (!*n) { 4536 /* first free block exent. We need to 4537 protect buddy cache from being freed, 4538 * otherwise we'll refresh it from 4539 * on-disk bitmap and lose not-yet-available 4540 * blocks */ 4541 page_cache_get(e4b->bd_buddy_page); 4542 page_cache_get(e4b->bd_bitmap_page); 4543 } 4544 while (*n) { 4545 parent = *n; 4546 entry = rb_entry(parent, struct ext4_free_data, efd_node); 4547 if (cluster < entry->efd_start_cluster) 4548 n = &(*n)->rb_left; 4549 else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) 4550 n = &(*n)->rb_right; 4551 else { 4552 ext4_grp_locked_error(sb, group, 0, 4553 ext4_group_first_block_no(sb, group) + 4554 EXT4_C2B(sbi, cluster), 4555 "Block already on to-be-freed list"); 4556 return 0; 4557 } 4558 } 4559 4560 rb_link_node(new_node, parent, n); 4561 rb_insert_color(new_node, &db->bb_free_root); 4562 4563 /* Now try to see the extent can be merged to left and right */ 4564 node = rb_prev(new_node); 4565 if (node) { 4566 entry = rb_entry(node, struct ext4_free_data, efd_node); 4567 if (can_merge(entry, new_entry) && 4568 ext4_journal_callback_try_del(handle, &entry->efd_jce)) { 4569 new_entry->efd_start_cluster = entry->efd_start_cluster; 4570 new_entry->efd_count += entry->efd_count; 4571 rb_erase(node, &(db->bb_free_root)); 4572 kmem_cache_free(ext4_free_data_cachep, entry); 4573 } 4574 } 4575 4576 node = rb_next(new_node); 4577 if (node) { 4578 entry = rb_entry(node, struct ext4_free_data, efd_node); 4579 if (can_merge(new_entry, entry) && 4580 ext4_journal_callback_try_del(handle, &entry->efd_jce)) { 4581 new_entry->efd_count += entry->efd_count; 4582 rb_erase(node, &(db->bb_free_root)); 4583 kmem_cache_free(ext4_free_data_cachep, entry); 4584 } 4585 } 4586 /* Add the extent to transaction's private list */ 4587 ext4_journal_callback_add(handle, ext4_free_data_callback, 4588 &new_entry->efd_jce); 4589 return 0; 4590 } 4591 4592 /** 4593 * ext4_free_blocks() -- Free given blocks and update quota 4594 * @handle: handle for this transaction 4595 * @inode: inode 4596 * @block: start physical block to free 4597 * @count: number of blocks to count 4598 * @flags: flags used by ext4_free_blocks 4599 */ 4600 void ext4_free_blocks(handle_t *handle, struct inode *inode, 4601 struct buffer_head *bh, ext4_fsblk_t block, 4602 unsigned long count, int flags) 4603 { 4604 struct buffer_head *bitmap_bh = NULL; 4605 struct super_block *sb = inode->i_sb; 4606 struct ext4_group_desc *gdp; 4607 unsigned int overflow; 4608 
ext4_grpblk_t bit; 4609 struct buffer_head *gd_bh; 4610 ext4_group_t block_group; 4611 struct ext4_sb_info *sbi; 4612 struct ext4_inode_info *ei = EXT4_I(inode); 4613 struct ext4_buddy e4b; 4614 unsigned int count_clusters; 4615 int err = 0; 4616 int ret; 4617 4618 might_sleep(); 4619 if (bh) { 4620 if (block) 4621 BUG_ON(block != bh->b_blocknr); 4622 else 4623 block = bh->b_blocknr; 4624 } 4625 4626 sbi = EXT4_SB(sb); 4627 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4628 !ext4_data_block_valid(sbi, block, count)) { 4629 ext4_error(sb, "Freeing blocks not in datazone - " 4630 "block = %llu, count = %lu", block, count); 4631 goto error_return; 4632 } 4633 4634 ext4_debug("freeing block %llu\n", block); 4635 trace_ext4_free_blocks(inode, block, count, flags); 4636 4637 if (flags & EXT4_FREE_BLOCKS_FORGET) { 4638 struct buffer_head *tbh = bh; 4639 int i; 4640 4641 BUG_ON(bh && (count > 1)); 4642 4643 for (i = 0; i < count; i++) { 4644 cond_resched(); 4645 if (!bh) 4646 tbh = sb_find_get_block(inode->i_sb, 4647 block + i); 4648 if (!tbh) 4649 continue; 4650 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4651 inode, tbh, block + i); 4652 } 4653 } 4654 4655 /* 4656 * We need to make sure we don't reuse the freed block until 4657 * after the transaction is committed, which we can do by 4658 * treating the block as metadata, below. We make an 4659 * exception if the inode is to be written in writeback mode 4660 * since writeback mode has weak data consistency guarantees. 4661 */ 4662 if (!ext4_should_writeback_data(inode)) 4663 flags |= EXT4_FREE_BLOCKS_METADATA; 4664 4665 /* 4666 * If the extent to be freed does not begin on a cluster 4667 * boundary, we need to deal with partial clusters at the 4668 * beginning and end of the extent. Normally we will free 4669 * blocks at the beginning or the end unless we are explicitly 4670 * requested to avoid doing so. 4671 */ 4672 overflow = EXT4_PBLK_COFF(sbi, block); 4673 if (overflow) { 4674 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { 4675 overflow = sbi->s_cluster_ratio - overflow; 4676 block += overflow; 4677 if (count > overflow) 4678 count -= overflow; 4679 else 4680 return; 4681 } else { 4682 block -= overflow; 4683 count += overflow; 4684 } 4685 } 4686 overflow = EXT4_LBLK_COFF(sbi, count); 4687 if (overflow) { 4688 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { 4689 if (count > overflow) 4690 count -= overflow; 4691 else 4692 return; 4693 } else 4694 count += sbi->s_cluster_ratio - overflow; 4695 } 4696 4697 do_more: 4698 overflow = 0; 4699 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4700 4701 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( 4702 ext4_get_group_info(sb, block_group)))) 4703 return; 4704 4705 /* 4706 * Check to see if we are freeing blocks across a group 4707 * boundary. 
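* (For example, with a 1:1 cluster-to-block ratio and 32768 blocks per group, freeing 100 blocks that start 50 blocks before the end of a group frees the 50 blocks in this group first; the remaining 50 are picked up by the do_more loop in the following group.)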
4708 */ 4709 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4710 overflow = EXT4_C2B(sbi, bit) + count - 4711 EXT4_BLOCKS_PER_GROUP(sb); 4712 count -= overflow; 4713 } 4714 count_clusters = EXT4_NUM_B2C(sbi, count); 4715 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4716 if (!bitmap_bh) { 4717 err = -EIO; 4718 goto error_return; 4719 } 4720 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4721 if (!gdp) { 4722 err = -EIO; 4723 goto error_return; 4724 } 4725 4726 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4727 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4728 in_range(block, ext4_inode_table(sb, gdp), 4729 EXT4_SB(sb)->s_itb_per_group) || 4730 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4731 EXT4_SB(sb)->s_itb_per_group)) { 4732 4733 ext4_error(sb, "Freeing blocks in system zone - " 4734 "Block = %llu, count = %lu", block, count); 4735 /* err = 0. ext4_std_error should be a no op */ 4736 goto error_return; 4737 } 4738 4739 BUFFER_TRACE(bitmap_bh, "getting write access"); 4740 err = ext4_journal_get_write_access(handle, bitmap_bh); 4741 if (err) 4742 goto error_return; 4743 4744 /* 4745 * We are about to modify some metadata. Call the journal APIs 4746 * to unshare ->b_data if a currently-committing transaction is 4747 * using it 4748 */ 4749 BUFFER_TRACE(gd_bh, "get_write_access"); 4750 err = ext4_journal_get_write_access(handle, gd_bh); 4751 if (err) 4752 goto error_return; 4753 #ifdef AGGRESSIVE_CHECK 4754 { 4755 int i; 4756 for (i = 0; i < count_clusters; i++) 4757 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4758 } 4759 #endif 4760 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 4761 4762 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4763 if (err) 4764 goto error_return; 4765 4766 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4767 struct ext4_free_data *new_entry; 4768 /* 4769 * blocks being freed are metadata. these blocks shouldn't 4770 * be used until this transaction is committed 4771 */ 4772 retry: 4773 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4774 if (!new_entry) { 4775 /* 4776 * We use a retry loop because 4777 * ext4_free_blocks() is not allowed to fail. 4778 */ 4779 cond_resched(); 4780 congestion_wait(BLK_RW_ASYNC, HZ/50); 4781 goto retry; 4782 } 4783 new_entry->efd_start_cluster = bit; 4784 new_entry->efd_group = block_group; 4785 new_entry->efd_count = count_clusters; 4786 new_entry->efd_tid = handle->h_transaction->t_tid; 4787 4788 ext4_lock_group(sb, block_group); 4789 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4790 ext4_mb_free_metadata(handle, &e4b, new_entry); 4791 } else { 4792 /* need to update group_info->bb_free and bitmap 4793 * with group lock held. 
generate_buddy looks at 4794 * them with the group lock held 4795 */ 4796 if (test_opt(sb, DISCARD)) { 4797 err = ext4_issue_discard(sb, block_group, bit, count); 4798 if (err && err != -EOPNOTSUPP) 4799 ext4_msg(sb, KERN_WARNING, "discard request in" 4800 " group:%d block:%d count:%lu failed" 4801 " with %d", block_group, bit, count, 4802 err); 4803 } else 4804 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 4805 4806 ext4_lock_group(sb, block_group); 4807 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4808 mb_free_blocks(inode, &e4b, bit, count_clusters); 4809 } 4810 4811 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 4812 ext4_free_group_clusters_set(sb, gdp, ret); 4813 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); 4814 ext4_group_desc_csum_set(sb, block_group, gdp); 4815 ext4_unlock_group(sb, block_group); 4816 4817 if (sbi->s_log_groups_per_flex) { 4818 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4819 atomic64_add(count_clusters, 4820 &sbi->s_flex_groups[flex_group].free_clusters); 4821 } 4822 4823 if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { 4824 percpu_counter_add(&sbi->s_dirtyclusters_counter, 4825 count_clusters); 4826 spin_lock(&ei->i_block_reservation_lock); 4827 if (flags & EXT4_FREE_BLOCKS_METADATA) 4828 ei->i_reserved_meta_blocks += count_clusters; 4829 else 4830 ei->i_reserved_data_blocks += count_clusters; 4831 spin_unlock(&ei->i_block_reservation_lock); 4832 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 4833 dquot_reclaim_block(inode, 4834 EXT4_C2B(sbi, count_clusters)); 4835 } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 4836 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 4837 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); 4838 4839 ext4_mb_unload_buddy(&e4b); 4840 4841 /* We dirtied the bitmap block */ 4842 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4843 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4844 4845 /* And the group descriptor block */ 4846 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4847 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4848 if (!err) 4849 err = ret; 4850 4851 if (overflow && !err) { 4852 block += count; 4853 count = overflow; 4854 put_bh(bitmap_bh); 4855 goto do_more; 4856 } 4857 error_return: 4858 brelse(bitmap_bh); 4859 ext4_std_error(sb, err); 4860 return; 4861 } 4862 4863 /** 4864 * ext4_group_add_blocks() -- Add given blocks to an existing group 4865 * @handle: handle to this transaction 4866 * @sb: super block 4867 * @block: start physical block to add to the block group 4868 * @count: number of blocks to add 4869 * 4870 * This marks the blocks as free in the bitmap and buddy. 4871 */ 4872 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 4873 ext4_fsblk_t block, unsigned long count) 4874 { 4875 struct buffer_head *bitmap_bh = NULL; 4876 struct buffer_head *gd_bh; 4877 ext4_group_t block_group; 4878 ext4_grpblk_t bit; 4879 unsigned int i; 4880 struct ext4_group_desc *desc; 4881 struct ext4_sb_info *sbi = EXT4_SB(sb); 4882 struct ext4_buddy e4b; 4883 int err = 0, ret, blk_free_count; 4884 ext4_grpblk_t blocks_freed; 4885 4886 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 4887 4888 if (count == 0) 4889 return 0; 4890 4891 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4892 /* 4893 * Check to see if we are adding blocks across a group 4894 * boundary. 4895 */ 4896 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4897 ext4_warning(sb, "too many blocks added to group %u\n", 4898 block_group); 4899 err = -EINVAL; 4900 goto error_return; 4901 } 4902 4903 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4904 if (!bitmap_bh) { 4905 err = -EIO; 4906 goto error_return; 4907 } 4908 4909 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 4910 if (!desc) { 4911 err = -EIO; 4912 goto error_return; 4913 } 4914 4915 if (in_range(ext4_block_bitmap(sb, desc), block, count) || 4916 in_range(ext4_inode_bitmap(sb, desc), block, count) || 4917 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 4918 in_range(block + count - 1, ext4_inode_table(sb, desc), 4919 sbi->s_itb_per_group)) { 4920 ext4_error(sb, "Adding blocks in system zones - " 4921 "Block = %llu, count = %lu", 4922 block, count); 4923 err = -EINVAL; 4924 goto error_return; 4925 } 4926 4927 BUFFER_TRACE(bitmap_bh, "getting write access"); 4928 err = ext4_journal_get_write_access(handle, bitmap_bh); 4929 if (err) 4930 goto error_return; 4931 4932 /* 4933 * We are about to modify some metadata. Call the journal APIs 4934 * to unshare ->b_data if a currently-committing transaction is 4935 * using it 4936 */ 4937 BUFFER_TRACE(gd_bh, "get_write_access"); 4938 err = ext4_journal_get_write_access(handle, gd_bh); 4939 if (err) 4940 goto error_return; 4941 4942 for (i = 0, blocks_freed = 0; i < count; i++) { 4943 BUFFER_TRACE(bitmap_bh, "clear bit"); 4944 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 4945 ext4_error(sb, "bit already cleared for block %llu", 4946 (ext4_fsblk_t)(block + i)); 4947 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 4948 } else { 4949 blocks_freed++; 4950 } 4951 } 4952 4953 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4954 if (err) 4955 goto error_return; 4956 4957 /* 4958 * need to update group_info->bb_free and bitmap 4959 * with group lock held. generate_buddy looks at 4960 * them with the group lock held 4961 */ 4962 ext4_lock_group(sb, block_group); 4963 mb_clear_bits(bitmap_bh->b_data, bit, count); 4964 mb_free_blocks(NULL, &e4b, bit, count); 4965 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); 4966 ext4_free_group_clusters_set(sb, desc, blk_free_count); 4967 ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); 4968 ext4_group_desc_csum_set(sb, block_group, desc); 4969 ext4_unlock_group(sb, block_group); 4970 percpu_counter_add(&sbi->s_freeclusters_counter, 4971 EXT4_NUM_B2C(sbi, blocks_freed)); 4972 4973 if (sbi->s_log_groups_per_flex) { 4974 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4975 atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), 4976 &sbi->s_flex_groups[flex_group].free_clusters); 4977 } 4978 4979 ext4_mb_unload_buddy(&e4b); 4980 4981 /* We dirtied the bitmap block */ 4982 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4983 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4984 4985 /* And the group descriptor block */ 4986 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4987 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4988 if (!err) 4989 err = ret; 4990 4991 error_return: 4992 brelse(bitmap_bh); 4993 ext4_std_error(sb, err); 4994 return err; 4995 } 4996 4997 /** 4998 * ext4_trim_extent -- function to TRIM one single free extent in the group 4999 * @sb: super block for the file system 5000 * @start: starting block of the free extent in the alloc. group 5001 * @count: number of blocks to TRIM 5002 * @group: alloc. group we are working with 5003 * @e4b: ext4 buddy for the group 5004 * 5005 * Trim "count" blocks starting at "start" in the "group". To assure that no 5006 * one will allocate those blocks, mark them as used in the buddy bitmap. This must 5007 * be called under the group lock. 5008 */ 5009 static int ext4_trim_extent(struct super_block *sb, int start, int count, 5010 ext4_group_t group, struct ext4_buddy *e4b) 5011 { 5012 struct ext4_free_extent ex; 5013 int ret = 0; 5014 5015 trace_ext4_trim_extent(sb, group, start, count); 5016 5017 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 5018 5019 ex.fe_start = start; 5020 ex.fe_group = group; 5021 ex.fe_len = count; 5022 5023 /* 5024 * Mark blocks used, so no one can reuse them while 5025 * being trimmed. 5026 */ 5027 mb_mark_used(e4b, &ex); 5028 ext4_unlock_group(sb, group); 5029 ret = ext4_issue_discard(sb, group, start, count); 5030 ext4_lock_group(sb, group); 5031 mb_free_blocks(NULL, e4b, start, ex.fe_len); 5032 return ret; 5033 } 5034 5035 /** 5036 * ext4_trim_all_free -- function to trim all free space in alloc. group 5037 * @sb: super block for file system 5038 * @group: group to be trimmed 5039 * @start: first group block to examine 5040 * @max: last group block to examine 5041 * @minblocks: minimum extent block count 5042 * 5043 * ext4_trim_all_free walks through group's buddy bitmap searching for free 5044 * extents. When a free extent is found, ext4_trim_extent is called to TRIM 5045 * the extent. 5046 * 5047 * 5048 * Before an extent is trimmed it is marked as used in the group buddy 5049 * bitmap so that nobody can allocate it; once the TRIM command has been 5050 * issued, the extent is freed again in the buddy bitmap. This is repeated 5051 * until the whole group has been scanned. 5052 */ 5053 static ext4_grpblk_t 5054 ext4_trim_all_free(struct super_block *sb, ext4_group_t group, 5055 ext4_grpblk_t start, ext4_grpblk_t max, 5056 ext4_grpblk_t minblocks) 5057 { 5058 void *bitmap; 5059 ext4_grpblk_t next, count = 0, free_count = 0; 5060 struct ext4_buddy e4b; 5061 int ret = 0; 5062 5063 trace_ext4_trim_all_free(sb, group, start, max); 5064 5065 ret = ext4_mb_load_buddy(sb, group, &e4b); 5066 if (ret) { 5067 ext4_error(sb, "Error in loading buddy " 5068 "information for %u", group); 5069 return ret; 5070 } 5071 bitmap = e4b.bd_bitmap; 5072 5073 ext4_lock_group(sb, group); 5074 if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && 5075 minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) 5076 goto out; 5077 5078 start = (e4b.bd_info->bb_first_free > start) ?
5079 e4b.bd_info->bb_first_free : start; 5080 5081 while (start <= max) { 5082 start = mb_find_next_zero_bit(bitmap, max + 1, start); 5083 if (start > max) 5084 break; 5085 next = mb_find_next_bit(bitmap, max + 1, start); 5086 5087 if ((next - start) >= minblocks) { 5088 ret = ext4_trim_extent(sb, start, 5089 next - start, group, &e4b); 5090 if (ret && ret != -EOPNOTSUPP) 5091 break; 5092 ret = 0; 5093 count += next - start; 5094 } 5095 free_count += next - start; 5096 start = next + 1; 5097 5098 if (fatal_signal_pending(current)) { 5099 count = -ERESTARTSYS; 5100 break; 5101 } 5102 5103 if (need_resched()) { 5104 ext4_unlock_group(sb, group); 5105 cond_resched(); 5106 ext4_lock_group(sb, group); 5107 } 5108 5109 if ((e4b.bd_info->bb_free - free_count) < minblocks) 5110 break; 5111 } 5112 5113 if (!ret) { 5114 ret = count; 5115 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 5116 } 5117 out: 5118 ext4_unlock_group(sb, group); 5119 ext4_mb_unload_buddy(&e4b); 5120 5121 ext4_debug("trimmed %d blocks in the group %d\n", 5122 count, group); 5123 5124 return ret; 5125 } 5126 5127 /** 5128 * ext4_trim_fs() -- trim ioctl handle function 5129 * @sb: superblock for filesystem 5130 * @range: fstrim_range structure 5131 * 5132 * start: First Byte to trim 5133 * len: number of Bytes to trim from start 5134 * minlen: minimum extent length in Bytes 5135 * ext4_trim_fs goes through all allocation groups containing Bytes from 5136 * start to start+len. For each such a group ext4_trim_all_free function 5137 * is invoked to trim all free space. 5138 */ 5139 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 5140 { 5141 struct ext4_group_info *grp; 5142 ext4_group_t group, first_group, last_group; 5143 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 5144 uint64_t start, end, minlen, trimmed = 0; 5145 ext4_fsblk_t first_data_blk = 5146 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 5147 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); 5148 int ret = 0; 5149 5150 start = range->start >> sb->s_blocksize_bits; 5151 end = start + (range->len >> sb->s_blocksize_bits) - 1; 5152 minlen = EXT4_NUM_B2C(EXT4_SB(sb), 5153 range->minlen >> sb->s_blocksize_bits); 5154 5155 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || 5156 start >= max_blks || 5157 range->len < sb->s_blocksize) 5158 return -EINVAL; 5159 if (end >= max_blks) 5160 end = max_blks - 1; 5161 if (end <= first_data_blk) 5162 goto out; 5163 if (start < first_data_blk) 5164 start = first_data_blk; 5165 5166 /* Determine first and last group to examine based on start and end */ 5167 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 5168 &first_group, &first_cluster); 5169 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, 5170 &last_group, &last_cluster); 5171 5172 /* end now represents the last cluster to discard in this group */ 5173 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; 5174 5175 for (group = first_group; group <= last_group; group++) { 5176 grp = ext4_get_group_info(sb, group); 5177 /* We only do this if the grp has never been initialized */ 5178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 5179 ret = ext4_mb_init_group(sb, group); 5180 if (ret) 5181 break; 5182 } 5183 5184 /* 5185 * For all the groups except the last one, last cluster will 5186 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to 5187 * change it for the last group, note that last_cluster is 5188 * already computed earlier by ext4_get_group_no_and_offset() 5189 */ 5190 if (group == last_group) 5191 end = last_cluster; 5192 5193 if (grp->bb_free >= 
minlen) { 5194 cnt = ext4_trim_all_free(sb, group, first_cluster, 5195 end, minlen); 5196 if (cnt < 0) { 5197 ret = cnt; 5198 break; 5199 } 5200 trimmed += cnt; 5201 } 5202 5203 /* 5204 * For every group except the first one, we are sure 5205 * that the first cluster to discard will be cluster #0. 5206 */ 5207 first_cluster = 0; 5208 } 5209 5210 if (!ret) 5211 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5212 5213 out: 5214 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; 5215 return ret; 5216 } 5217
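/*
 * Illustrative sketch, not part of mballoc.c: ext4_trim_fs() above is
 * normally reached from user space through the FITRIM ioctl, using the
 * byte-based start/len/minlen semantics described in the comment before it.
 * Assuming argv[1] names a file or directory on a mounted ext4 filesystem
 * (and the caller has the required privileges), a minimal caller could look
 * like this:
 */
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.start = 0;		/* first byte to trim */
	range.len = ULLONG_MAX;		/* trim up to the end of the filesystem */
	range.minlen = 0;		/* accept free extents of any length */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	/* On return, range.len holds the number of bytes actually trimmed,
	 * as stored into range->len at the end of ext4_trim_fs(). */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}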