/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 */


/*
 * mballoc.c contains the multi-block allocation routines
 */

#include "mballoc.h"
/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in a few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * The allocation request involves a request for multiple blocks near to
 * the goal (block) value specified.
 *
 * During the initialization phase of the allocator we decide to use group
 * preallocation or inode preallocation depending on the file size.  The
 * size of the file could be the resulting file size we would have after
 * allocation, or the current file size, whichever is larger.  If the size
 * is less than sbi->s_mb_stream_request we select group preallocation.
 * The default value of s_mb_stream_request is 16 blocks.  This can also
 * be tuned via /proc/fs/ext4/<partition>/stream_req.  The value is
 * represented in terms of number of blocks.
 *
 * The main motivation for having small files use group preallocation is
 * to ensure that small files are placed close together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode.  The inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length of this prealloc space
 * pa_free   -> free space available in this prealloc space
 *
 * The inode preallocation space is used based on the _logical_ start
 * block.  Only if the logical file block falls within the range of the
 * prealloc space do we consume that particular prealloc space.  This
 * makes sure that we have contiguous physical blocks representing the
 * file blocks.
 *
 * The important thing to note for inode prealloc space is that we don't
 * modify the values associated with the inode prealloc space except
 * pa_free.
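 *
 * For example (hypothetical numbers): a PA with pa_lstart = 100,
 * pa_pstart = 5000 and pa_len = 16 covers logical blocks 100..115 of the
 * file and maps them to physical blocks 5000..5015; a request for logical
 * block 107 would be served from physical block 5000 + (107 - 100) = 5007,
 * and pa_free is simply decremented by the number of blocks taken.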
 *
 * If we are not able to find blocks in the inode prealloc space, and if
 * we have the group allocation flag set, then we look at the locality
 * group prealloc space.  These are per-CPU prealloc lists represented as:
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per-CPU locality group is to reduce the
 * contention between CPUs.  It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via the inode prealloc and/or locality group
 * prealloc, then we look at the buddy cache.  The buddy cache is
 * represented by ext4_sb_info.s_buddy_cache (struct inode) whose file
 * offsets get mapped to the buddy and bitmap information of the different
 * groups.  The buddy information is attached to the buddy cache inode so
 * that we can access it through the page cache.  The information for each
 * group is loaded via ext4_mb_load_buddy and consists of the block bitmap
 * and the buddy information.  It is stored in the inode as:
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 * one block each for the bitmap and the buddy information.  So for each
 * group we take up 2 blocks.  A page can contain blocks_per_page
 * (PAGE_CACHE_SIZE / blocksize) blocks, so it can hold information for
 * groups_per_page groups, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk.  The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache.  If we were able
 * to locate that many free blocks we return with additional information
 * regarding the rest of the contiguous physical blocks available.
 *
 * Before allocating blocks via the buddy cache we normalize the request
 * blocks.  This ensures we ask for more blocks than we need.  The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list.  In case of inode preallocation we follow a list of heuristics
 * based on file size.  This can be found in ext4_mb_normalize_request.  If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 * 512 blocks.  This can be tuned via
 * /proc/fs/ext4/<partition>/group_prealloc.  The value is represented in
 * terms of number of blocks.  If we have mounted the filesystem with the
 * -o stripe=<value> option, the group prealloc request is normalized to
 * the stripe value (sbi->s_stripe).
 *
 * The regular allocator (using the buddy cache) supports a few tunables:
 *
 * /proc/fs/ext4/<partition>/min_to_scan
 * /proc/fs/ext4/<partition>/max_to_scan
 * /proc/fs/ext4/<partition>/order2_req
 *
 * The regular allocator uses the buddy scan only if the request length is
 * a power of 2 blocks and the order of the allocation is >=
 * sbi->s_mb_order2_reqs.  The value of s_mb_order2_reqs can be tuned via
 * /proc/fs/ext4/<partition>/order2_req.  If the request length is equal to
 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
 * in stripe-size units.  This should result in better allocation on RAID
 * setups.  If not, we search in the specific group using the bitmap for
 * the best extents.  The tunables min_to_scan and max_to_scan control the
 * behaviour here.  min_to_scan indicates how long mballoc __must__ look
 * for a best extent and max_to_scan indicates how long mballoc __can__
 * look for a best extent among the found extents.  Searching for the
 * blocks starts with the group specified as the goal value in the
 * allocation context via ac_g_ex.  Each group is first checked based on
 * the criteria of whether it can be used for allocation.
 * ext4_mb_good_group explains how the groups are checked.
 *
 * Both of the prealloc spaces are populated as above.  So for the first
 * request we will hit the buddy cache, which will result in this prealloc
 * space getting filled.  The prealloc space is then later used for
 * subsequent requests.
 */
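
/*
 * Purely illustrative sketch (compiled out): how a group number is turned
 * into a page index and offset inside the buddy cache inode, using the
 * layout described above.  The helper name and its return type are made
 * up for this example; the real lookups are done inline in
 * ext4_mb_load_buddy() and ext4_mb_init_group() below.
 */
#if 0
struct mb_cache_pos {
	pgoff_t	pnum;	/* page index in s_buddy_cache */
	int	poff;	/* block offset within that page */
};

static struct mb_cache_pos mb_locate_in_cache(struct super_block *sb,
					      ext4_group_t group, int buddy)
{
	struct mb_cache_pos pos;
	int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
	/* bitmap lives in block 2*group, buddy in block 2*group + 1 */
	int block = group * 2 + (buddy ? 1 : 0);

	pos.pnum = block / blocks_per_page;
	pos.poff = block % blocks_per_page;
	return pos;
}
#endif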

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes a part of the inode's space preallocated to specific
 *    physical blocks.  any block from that preallocation can be used
 *    independently.  the descriptor just tracks the number of blocks left
 *    unused.  so, before taking some block from a descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet.  this
 *    also means that freeing any block within a descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to a
 *    permanent set of inodes: an inode can join and leave a group.  space
 *    from this type of preallocation can be used for any inode.  thus
 *    it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in the mballoc world means that at any time a block is
 * either free or used in ALL structures.  notice: "any time" should not
 * be read literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count the
 * number of blocks: how many blocks are marked used/free in the on-disk
 * bitmap, the buddy and the PAs.
 *
 * all operations can be expressed as:
 *  - init buddy:                  buddy = on-disk + PAs
 *  - new PA:                      buddy += N; PA = N
 *  - use inode PA:                on-disk += N; PA -= N
 *  - discard inode PA:            buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:       on-disk += N; PA -= N
 *  - discard locality group PA:   buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is what is used in the real operation, because we can't know the
 *        actual used bits from the PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be
 * atomic.  given some of them can block, we'd have to use something like
 * semaphores, killing performance on high-end SMP hardware.  let's try to
 * relax it using the following knowledge:
 *  1) if the buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'.  if the
 *     on-disk bitmap has a bit set and a PA claims the same block, it's
 *     OK.  IOW, one can set a bit in the on-disk bitmap if the buddy has
 *     the same bit set and/or a PA covers the corresponding block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for the PA are allocated in the buddy, the buddy must be
 *      referenced until the PA is linked to the allocation group to avoid
 *      concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either the on-disk bitmap or the PA has
 *      uptodate data.  given (3) we care that the PA-=N operation doesn't
 *      interfere with init
 *    - discard inode PA
 *      the simplest way would be to have the buddy initialized by the
 *      discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have the buddy initialized by the
 *      discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until the PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until the PA isn't used by another process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until the PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until the PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA is referenced until its blocks are marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap does
 *  - discard must not compete with init.  either init is done before
 *    any discard, or they're serialized somehow
 *  - buddy init as a sum of the on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness: no need to modify
 * the buddy in this case, but we should still care about concurrent init
 *
 */
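
/*
 * A worked example of the accounting above, with hypothetical numbers for
 * one group: suppose the on-disk bitmap has 100 blocks marked used and
 * there are no PAs, so the in-core buddy shows 100 used blocks.  Creating
 * a new locality group PA of 16 blocks gives buddy = 116, PA = 16.  Using
 * 4 blocks from that PA marks them on disk (on-disk = 104, PA = 12) while
 * the buddy stays at 116.  Discarding the remaining PA does buddy -= PA,
 * i.e. buddy = 104, PA = 0, and all three views agree again.
 */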

/*
 * Logic in few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group        (group)
 *  - object (inode/locality)   (object)
 *  - per-pa lock               (pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *    pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *    pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *    pa
 *    group
 *    object
 *
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);



static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
	*bit += ((unsigned long) addr & 7UL) << 3;
	addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
	*bit += ((unsigned long) addr & 3UL) << 3;
	addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
356 #endif 357 return addr; 358 } 359 360 static inline int mb_test_bit(int bit, void *addr) 361 { 362 /* 363 * ext4_test_bit on architecture like powerpc 364 * needs unsigned long aligned address 365 */ 366 addr = mb_correct_addr_and_bit(&bit, addr); 367 return ext4_test_bit(bit, addr); 368 } 369 370 static inline void mb_set_bit(int bit, void *addr) 371 { 372 addr = mb_correct_addr_and_bit(&bit, addr); 373 ext4_set_bit(bit, addr); 374 } 375 376 static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) 377 { 378 addr = mb_correct_addr_and_bit(&bit, addr); 379 ext4_set_bit_atomic(lock, bit, addr); 380 } 381 382 static inline void mb_clear_bit(int bit, void *addr) 383 { 384 addr = mb_correct_addr_and_bit(&bit, addr); 385 ext4_clear_bit(bit, addr); 386 } 387 388 static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) 389 { 390 addr = mb_correct_addr_and_bit(&bit, addr); 391 ext4_clear_bit_atomic(lock, bit, addr); 392 } 393 394 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 395 { 396 int fix = 0, ret, tmpmax; 397 addr = mb_correct_addr_and_bit(&fix, addr); 398 tmpmax = max + fix; 399 start += fix; 400 401 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 402 if (ret > max) 403 return max; 404 return ret; 405 } 406 407 static inline int mb_find_next_bit(void *addr, int max, int start) 408 { 409 int fix = 0, ret, tmpmax; 410 addr = mb_correct_addr_and_bit(&fix, addr); 411 tmpmax = max + fix; 412 start += fix; 413 414 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 415 if (ret > max) 416 return max; 417 return ret; 418 } 419 420 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 421 { 422 char *bb; 423 424 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 425 BUG_ON(max == NULL); 426 427 if (order > e4b->bd_blkbits + 1) { 428 *max = 0; 429 return NULL; 430 } 431 432 /* at order 0 we see each particular block */ 433 *max = 1 << (e4b->bd_blkbits + 3); 434 if (order == 0) 435 return EXT4_MB_BITMAP(e4b); 436 437 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 438 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 439 440 return bb; 441 } 442 443 #ifdef DOUBLE_CHECK 444 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 445 int first, int count) 446 { 447 int i; 448 struct super_block *sb = e4b->bd_sb; 449 450 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 451 return; 452 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 453 for (i = 0; i < count; i++) { 454 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 455 ext4_fsblk_t blocknr; 456 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 457 blocknr += first + i; 458 blocknr += 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 460 ext4_grp_locked_error(sb, e4b->bd_group, 461 __func__, "double-free of inode" 462 " %lu's block %llu(bit %u in group %u)", 463 inode ? 
inode->i_ino : 0, blocknr, 464 first + i, e4b->bd_group); 465 } 466 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 467 } 468 } 469 470 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 471 { 472 int i; 473 474 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 475 return; 476 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 477 for (i = 0; i < count; i++) { 478 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 479 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 480 } 481 } 482 483 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 484 { 485 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 486 unsigned char *b1, *b2; 487 int i; 488 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 489 b2 = (unsigned char *) bitmap; 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 491 if (b1[i] != b2[i]) { 492 printk(KERN_ERR "corruption in group %u " 493 "at byte %u(%u): %x in copy != %x " 494 "on disk/prealloc\n", 495 e4b->bd_group, i, i * 8, b1[i], b2[i]); 496 BUG(); 497 } 498 } 499 } 500 } 501 502 #else 503 static inline void mb_free_blocks_double(struct inode *inode, 504 struct ext4_buddy *e4b, int first, int count) 505 { 506 return; 507 } 508 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 509 int first, int count) 510 { 511 return; 512 } 513 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 514 { 515 return; 516 } 517 #endif 518 519 #ifdef AGGRESSIVE_CHECK 520 521 #define MB_CHECK_ASSERT(assert) \ 522 do { \ 523 if (!(assert)) { \ 524 printk(KERN_EMERG \ 525 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 526 function, file, line, # assert); \ 527 BUG(); \ 528 } \ 529 } while (0) 530 531 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 532 const char *function, int line) 533 { 534 struct super_block *sb = e4b->bd_sb; 535 int order = e4b->bd_blkbits + 1; 536 int max; 537 int max2; 538 int i; 539 int j; 540 int k; 541 int count; 542 struct ext4_group_info *grp; 543 int fragments = 0; 544 int fstart; 545 struct list_head *cur; 546 void *buddy; 547 void *buddy2; 548 549 { 550 static int mb_check_counter; 551 if (mb_check_counter++ % 100 != 0) 552 return 0; 553 } 554 555 while (order > 1) { 556 buddy = mb_find_buddy(e4b, order, &max); 557 MB_CHECK_ASSERT(buddy); 558 buddy2 = mb_find_buddy(e4b, order - 1, &max2); 559 MB_CHECK_ASSERT(buddy2); 560 MB_CHECK_ASSERT(buddy != buddy2); 561 MB_CHECK_ASSERT(max * 2 == max2); 562 563 count = 0; 564 for (i = 0; i < max; i++) { 565 566 if (mb_test_bit(i, buddy)) { 567 /* only single bit in buddy2 may be 1 */ 568 if (!mb_test_bit(i << 1, buddy2)) { 569 MB_CHECK_ASSERT( 570 mb_test_bit((i<<1)+1, buddy2)); 571 } else if (!mb_test_bit((i << 1) + 1, buddy2)) { 572 MB_CHECK_ASSERT( 573 mb_test_bit(i << 1, buddy2)); 574 } 575 continue; 576 } 577 578 /* both bits in buddy2 must be 0 */ 579 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 580 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 581 582 for (j = 0; j < (1 << order); j++) { 583 k = (i * (1 << order)) + j; 584 MB_CHECK_ASSERT( 585 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 586 } 587 count++; 588 } 589 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 590 order--; 591 } 592 593 fstart = -1; 594 buddy = mb_find_buddy(e4b, 0, &max); 595 for (i = 0; i < max; i++) { 596 if (!mb_test_bit(i, buddy)) { 597 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 598 if (fstart == -1) { 599 fragments++; 600 fstart = i; 601 } 602 continue; 603 } 604 fstart = -1; 605 /* check used bits only */ 
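		/*
		 * for a block that is in use, every order's buddy map must
		 * show the chunk covering it as "not free": bit (i >> j)
		 * must be set at each order j.
		 */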
606 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 607 buddy2 = mb_find_buddy(e4b, j, &max2); 608 k = i >> j; 609 MB_CHECK_ASSERT(k < max2); 610 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 611 } 612 } 613 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 614 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 615 616 grp = ext4_get_group_info(sb, e4b->bd_group); 617 buddy = mb_find_buddy(e4b, 0, &max); 618 list_for_each(cur, &grp->bb_prealloc_list) { 619 ext4_group_t groupnr; 620 struct ext4_prealloc_space *pa; 621 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 622 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 623 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 624 for (i = 0; i < pa->pa_len; i++) 625 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 626 } 627 return 0; 628 } 629 #undef MB_CHECK_ASSERT 630 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 631 __FILE__, __func__, __LINE__) 632 #else 633 #define mb_check_buddy(e4b) 634 #endif 635 636 /* FIXME!! need more doc */ 637 static void ext4_mb_mark_free_simple(struct super_block *sb, 638 void *buddy, unsigned first, int len, 639 struct ext4_group_info *grp) 640 { 641 struct ext4_sb_info *sbi = EXT4_SB(sb); 642 unsigned short min; 643 unsigned short max; 644 unsigned short chunk; 645 unsigned short border; 646 647 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 648 649 border = 2 << sb->s_blocksize_bits; 650 651 while (len > 0) { 652 /* find how many blocks can be covered since this position */ 653 max = ffs(first | border) - 1; 654 655 /* find how many blocks of power 2 we need to mark */ 656 min = fls(len) - 1; 657 658 if (max < min) 659 min = max; 660 chunk = 1 << min; 661 662 /* mark multiblock chunks only */ 663 grp->bb_counters[min]++; 664 if (min > 0) 665 mb_clear_bit(first >> min, 666 buddy + sbi->s_mb_offsets[min]); 667 668 len -= chunk; 669 first += chunk; 670 } 671 } 672 673 static void ext4_mb_generate_buddy(struct super_block *sb, 674 void *buddy, void *bitmap, ext4_group_t group) 675 { 676 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 677 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 678 unsigned short i = 0; 679 unsigned short first; 680 unsigned short len; 681 unsigned free = 0; 682 unsigned fragments = 0; 683 unsigned long long period = get_cycles(); 684 685 /* initialize buddy from bitmap which is aggregation 686 * of on-disk bitmap and preallocations */ 687 i = mb_find_next_zero_bit(bitmap, max, 0); 688 grp->bb_first_free = i; 689 while (i < max) { 690 fragments++; 691 first = i; 692 i = mb_find_next_bit(bitmap, max, i); 693 len = i - first; 694 free += len; 695 if (len > 1) 696 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 697 else 698 grp->bb_counters[0]++; 699 if (i < max) 700 i = mb_find_next_zero_bit(bitmap, max, i); 701 } 702 grp->bb_fragments = fragments; 703 704 if (free != grp->bb_free) { 705 ext4_grp_locked_error(sb, group, __func__, 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", 707 group, free, grp->bb_free); 708 /* 709 * If we intent to continue, we consider group descritor 710 * corrupt and update bb_free using bitmap value 711 */ 712 grp->bb_free = free; 713 } 714 715 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 716 717 period = get_cycles() - period; 718 spin_lock(&EXT4_SB(sb)->s_bal_lock); 719 EXT4_SB(sb)->s_mb_buddies_generated++; 720 EXT4_SB(sb)->s_mb_generation_time += period; 721 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 722 } 723 724 /* The buddy information is attached the buddy cache inode 725 * for convenience. 
The information regarding each group 726 * is loaded via ext4_mb_load_buddy. The information involve 727 * block bitmap and buddy information. The information are 728 * stored in the inode as 729 * 730 * { page } 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 732 * 733 * 734 * one block each for bitmap and buddy information. 735 * So for each group we take up 2 blocks. A page can 736 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 737 * So it can have information regarding groups_per_page which 738 * is blocks_per_page/2 739 */ 740 741 static int ext4_mb_init_cache(struct page *page, char *incore) 742 { 743 int blocksize; 744 int blocks_per_page; 745 int groups_per_page; 746 int err = 0; 747 int i; 748 ext4_group_t first_group; 749 int first_block; 750 struct super_block *sb; 751 struct buffer_head *bhs; 752 struct buffer_head **bh; 753 struct inode *inode; 754 char *data; 755 char *bitmap; 756 757 mb_debug("init page %lu\n", page->index); 758 759 inode = page->mapping->host; 760 sb = inode->i_sb; 761 blocksize = 1 << inode->i_blkbits; 762 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 763 764 groups_per_page = blocks_per_page >> 1; 765 if (groups_per_page == 0) 766 groups_per_page = 1; 767 768 /* allocate buffer_heads to read bitmaps */ 769 if (groups_per_page > 1) { 770 err = -ENOMEM; 771 i = sizeof(struct buffer_head *) * groups_per_page; 772 bh = kzalloc(i, GFP_NOFS); 773 if (bh == NULL) 774 goto out; 775 } else 776 bh = &bhs; 777 778 first_group = page->index * blocks_per_page / 2; 779 780 /* read all groups the page covers into the cache */ 781 for (i = 0; i < groups_per_page; i++) { 782 struct ext4_group_desc *desc; 783 784 if (first_group + i >= EXT4_SB(sb)->s_groups_count) 785 break; 786 787 err = -EIO; 788 desc = ext4_get_group_desc(sb, first_group + i, NULL); 789 if (desc == NULL) 790 goto out; 791 792 err = -ENOMEM; 793 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); 794 if (bh[i] == NULL) 795 goto out; 796 797 if (bitmap_uptodate(bh[i])) 798 continue; 799 800 lock_buffer(bh[i]); 801 if (bitmap_uptodate(bh[i])) { 802 unlock_buffer(bh[i]); 803 continue; 804 } 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 807 ext4_init_block_bitmap(sb, bh[i], 808 first_group + i, desc); 809 set_bitmap_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 812 unlock_buffer(bh[i]); 813 continue; 814 } 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 816 if (buffer_uptodate(bh[i])) { 817 /* 818 * if not uninit if bh is uptodate, 819 * bitmap is also uptodate 820 */ 821 set_bitmap_uptodate(bh[i]); 822 unlock_buffer(bh[i]); 823 continue; 824 } 825 get_bh(bh[i]); 826 /* 827 * submit the buffer_head for read. We can 828 * safely mark the bitmap as uptodate now. 829 * We do it here so the bitmap uptodate bit 830 * get set with buffer lock held. 
831 */ 832 set_bitmap_uptodate(bh[i]); 833 bh[i]->b_end_io = end_buffer_read_sync; 834 submit_bh(READ, bh[i]); 835 mb_debug("read bitmap for group %u\n", first_group + i); 836 } 837 838 /* wait for I/O completion */ 839 for (i = 0; i < groups_per_page && bh[i]; i++) 840 wait_on_buffer(bh[i]); 841 842 err = -EIO; 843 for (i = 0; i < groups_per_page && bh[i]; i++) 844 if (!buffer_uptodate(bh[i])) 845 goto out; 846 847 err = 0; 848 first_block = page->index * blocks_per_page; 849 /* init the page */ 850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE); 851 for (i = 0; i < blocks_per_page; i++) { 852 int group; 853 struct ext4_group_info *grinfo; 854 855 group = (first_block + i) >> 1; 856 if (group >= EXT4_SB(sb)->s_groups_count) 857 break; 858 859 /* 860 * data carry information regarding this 861 * particular group in the format specified 862 * above 863 * 864 */ 865 data = page_address(page) + (i * blocksize); 866 bitmap = bh[group - first_group]->b_data; 867 868 /* 869 * We place the buddy block and bitmap block 870 * close together 871 */ 872 if ((first_block + i) & 1) { 873 /* this is block of buddy */ 874 BUG_ON(incore == NULL); 875 mb_debug("put buddy for group %u in page %lu/%x\n", 876 group, page->index, i * blocksize); 877 grinfo = ext4_get_group_info(sb, group); 878 grinfo->bb_fragments = 0; 879 memset(grinfo->bb_counters, 0, 880 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 881 /* 882 * incore got set to the group block bitmap below 883 */ 884 ext4_lock_group(sb, group); 885 ext4_mb_generate_buddy(sb, data, incore, group); 886 ext4_unlock_group(sb, group); 887 incore = NULL; 888 } else { 889 /* this is block of bitmap */ 890 BUG_ON(incore != NULL); 891 mb_debug("put bitmap for group %u in page %lu/%x\n", 892 group, page->index, i * blocksize); 893 894 /* see comments in ext4_mb_put_pa() */ 895 ext4_lock_group(sb, group); 896 memcpy(data, bitmap, blocksize); 897 898 /* mark all preallocated blks used in in-core bitmap */ 899 ext4_mb_generate_from_pa(sb, data, group); 900 ext4_mb_generate_from_freelist(sb, data, group); 901 ext4_unlock_group(sb, group); 902 903 /* set incore so that the buddy information can be 904 * generated using this 905 */ 906 incore = data; 907 } 908 } 909 SetPageUptodate(page); 910 911 out: 912 if (bh) { 913 for (i = 0; i < groups_per_page && bh[i]; i++) 914 brelse(bh[i]); 915 if (bh != &bhs) 916 kfree(bh); 917 } 918 return err; 919 } 920 921 static noinline_for_stack int 922 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 923 struct ext4_buddy *e4b) 924 { 925 int blocks_per_page; 926 int block; 927 int pnum; 928 int poff; 929 struct page *page; 930 int ret; 931 struct ext4_group_info *grp; 932 struct ext4_sb_info *sbi = EXT4_SB(sb); 933 struct inode *inode = sbi->s_buddy_cache; 934 935 mb_debug("load group %u\n", group); 936 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 938 grp = ext4_get_group_info(sb, group); 939 940 e4b->bd_blkbits = sb->s_blocksize_bits; 941 e4b->bd_info = ext4_get_group_info(sb, group); 942 e4b->bd_sb = sb; 943 e4b->bd_group = group; 944 e4b->bd_buddy_page = NULL; 945 e4b->bd_bitmap_page = NULL; 946 e4b->alloc_semp = &grp->alloc_sem; 947 948 /* Take the read lock on the group alloc 949 * sem. This would make sure a parallel 950 * ext4_mb_init_group happening on other 951 * groups mapped by the page is blocked 952 * till we are done with allocation 953 */ 954 down_read(e4b->alloc_semp); 955 956 /* 957 * the buddy cache inode stores the block bitmap 958 * and buddy information in consecutive blocks. 
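	 * (e.g. with 4K pages and a 1K block size blocks_per_page is 4, so
	 * group 5's bitmap sits in cache block 10 -> page 2, offset 2, and
	 * its buddy in block 11 -> page 2, offset 3.)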
959 * So for each group we need two blocks. 960 */ 961 block = group * 2; 962 pnum = block / blocks_per_page; 963 poff = block % blocks_per_page; 964 965 /* we could use find_or_create_page(), but it locks page 966 * what we'd like to avoid in fast path ... */ 967 page = find_get_page(inode->i_mapping, pnum); 968 if (page == NULL || !PageUptodate(page)) { 969 if (page) 970 /* 971 * drop the page reference and try 972 * to get the page with lock. If we 973 * are not uptodate that implies 974 * somebody just created the page but 975 * is yet to initialize the same. So 976 * wait for it to initialize. 977 */ 978 page_cache_release(page); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 980 if (page) { 981 BUG_ON(page->mapping != inode->i_mapping); 982 if (!PageUptodate(page)) { 983 ret = ext4_mb_init_cache(page, NULL); 984 if (ret) { 985 unlock_page(page); 986 goto err; 987 } 988 mb_cmp_bitmaps(e4b, page_address(page) + 989 (poff * sb->s_blocksize)); 990 } 991 unlock_page(page); 992 } 993 } 994 if (page == NULL || !PageUptodate(page)) { 995 ret = -EIO; 996 goto err; 997 } 998 e4b->bd_bitmap_page = page; 999 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1000 mark_page_accessed(page); 1001 1002 block++; 1003 pnum = block / blocks_per_page; 1004 poff = block % blocks_per_page; 1005 1006 page = find_get_page(inode->i_mapping, pnum); 1007 if (page == NULL || !PageUptodate(page)) { 1008 if (page) 1009 page_cache_release(page); 1010 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1011 if (page) { 1012 BUG_ON(page->mapping != inode->i_mapping); 1013 if (!PageUptodate(page)) { 1014 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1015 if (ret) { 1016 unlock_page(page); 1017 goto err; 1018 } 1019 } 1020 unlock_page(page); 1021 } 1022 } 1023 if (page == NULL || !PageUptodate(page)) { 1024 ret = -EIO; 1025 goto err; 1026 } 1027 e4b->bd_buddy_page = page; 1028 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1029 mark_page_accessed(page); 1030 1031 BUG_ON(e4b->bd_bitmap_page == NULL); 1032 BUG_ON(e4b->bd_buddy_page == NULL); 1033 1034 return 0; 1035 1036 err: 1037 if (e4b->bd_bitmap_page) 1038 page_cache_release(e4b->bd_bitmap_page); 1039 if (e4b->bd_buddy_page) 1040 page_cache_release(e4b->bd_buddy_page); 1041 e4b->bd_buddy = NULL; 1042 e4b->bd_bitmap = NULL; 1043 1044 /* Done with the buddy cache */ 1045 up_read(e4b->alloc_semp); 1046 return ret; 1047 } 1048 1049 static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1050 { 1051 if (e4b->bd_bitmap_page) 1052 page_cache_release(e4b->bd_bitmap_page); 1053 if (e4b->bd_buddy_page) 1054 page_cache_release(e4b->bd_buddy_page); 1055 /* Done with the buddy cache */ 1056 if (e4b->alloc_semp) 1057 up_read(e4b->alloc_semp); 1058 } 1059 1060 1061 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1062 { 1063 int order = 1; 1064 void *bb; 1065 1066 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1067 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1068 1069 bb = EXT4_MB_BUDDY(e4b); 1070 while (order <= e4b->bd_blkbits + 1) { 1071 block = block >> 1; 1072 if (!mb_test_bit(block, bb)) { 1073 /* this block is part of buddy of order 'order' */ 1074 return order; 1075 } 1076 bb += 1 << (e4b->bd_blkbits - order); 1077 order++; 1078 } 1079 return 0; 1080 } 1081 1082 static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) 1083 { 1084 __u32 *addr; 1085 1086 len = cur + len; 1087 while (cur < len) { 1088 if ((cur & 31) == 0 && (len - cur) >= 32) { 1089 /* fast path: clear 
whole word at once */ 1090 addr = bm + (cur >> 3); 1091 *addr = 0; 1092 cur += 32; 1093 continue; 1094 } 1095 if (lock) 1096 mb_clear_bit_atomic(lock, cur, bm); 1097 else 1098 mb_clear_bit(cur, bm); 1099 cur++; 1100 } 1101 } 1102 1103 static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) 1104 { 1105 __u32 *addr; 1106 1107 len = cur + len; 1108 while (cur < len) { 1109 if ((cur & 31) == 0 && (len - cur) >= 32) { 1110 /* fast path: set whole word at once */ 1111 addr = bm + (cur >> 3); 1112 *addr = 0xffffffff; 1113 cur += 32; 1114 continue; 1115 } 1116 if (lock) 1117 mb_set_bit_atomic(lock, cur, bm); 1118 else 1119 mb_set_bit(cur, bm); 1120 cur++; 1121 } 1122 } 1123 1124 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1125 int first, int count) 1126 { 1127 int block = 0; 1128 int max = 0; 1129 int order; 1130 void *buddy; 1131 void *buddy2; 1132 struct super_block *sb = e4b->bd_sb; 1133 1134 BUG_ON(first + count > (sb->s_blocksize << 3)); 1135 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 1136 mb_check_buddy(e4b); 1137 mb_free_blocks_double(inode, e4b, first, count); 1138 1139 e4b->bd_info->bb_free += count; 1140 if (first < e4b->bd_info->bb_first_free) 1141 e4b->bd_info->bb_first_free = first; 1142 1143 /* let's maintain fragments counter */ 1144 if (first != 0) 1145 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1146 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1147 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1148 if (block && max) 1149 e4b->bd_info->bb_fragments--; 1150 else if (!block && !max) 1151 e4b->bd_info->bb_fragments++; 1152 1153 /* let's maintain buddy itself */ 1154 while (count-- > 0) { 1155 block = first++; 1156 order = 0; 1157 1158 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1159 ext4_fsblk_t blocknr; 1160 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1161 blocknr += block; 1162 blocknr += 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1164 ext4_grp_locked_error(sb, e4b->bd_group, 1165 __func__, "double-free of inode" 1166 " %lu's block %llu(bit %u in group %u)", 1167 inode ? 
inode->i_ino : 0, blocknr, block, 1168 e4b->bd_group); 1169 } 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1171 e4b->bd_info->bb_counters[order]++; 1172 1173 /* start of the buddy */ 1174 buddy = mb_find_buddy(e4b, order, &max); 1175 1176 do { 1177 block &= ~1UL; 1178 if (mb_test_bit(block, buddy) || 1179 mb_test_bit(block + 1, buddy)) 1180 break; 1181 1182 /* both the buddies are free, try to coalesce them */ 1183 buddy2 = mb_find_buddy(e4b, order + 1, &max); 1184 1185 if (!buddy2) 1186 break; 1187 1188 if (order > 0) { 1189 /* for special purposes, we don't set 1190 * free bits in bitmap */ 1191 mb_set_bit(block, buddy); 1192 mb_set_bit(block + 1, buddy); 1193 } 1194 e4b->bd_info->bb_counters[order]--; 1195 e4b->bd_info->bb_counters[order]--; 1196 1197 block = block >> 1; 1198 order++; 1199 e4b->bd_info->bb_counters[order]++; 1200 1201 mb_clear_bit(block, buddy2); 1202 buddy = buddy2; 1203 } while (1); 1204 } 1205 mb_check_buddy(e4b); 1206 } 1207 1208 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1209 int needed, struct ext4_free_extent *ex) 1210 { 1211 int next = block; 1212 int max; 1213 int ord; 1214 void *buddy; 1215 1216 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1217 BUG_ON(ex == NULL); 1218 1219 buddy = mb_find_buddy(e4b, order, &max); 1220 BUG_ON(buddy == NULL); 1221 BUG_ON(block >= max); 1222 if (mb_test_bit(block, buddy)) { 1223 ex->fe_len = 0; 1224 ex->fe_start = 0; 1225 ex->fe_group = 0; 1226 return 0; 1227 } 1228 1229 /* FIXME dorp order completely ? */ 1230 if (likely(order == 0)) { 1231 /* find actual order */ 1232 order = mb_find_order_for_block(e4b, block); 1233 block = block >> order; 1234 } 1235 1236 ex->fe_len = 1 << order; 1237 ex->fe_start = block << order; 1238 ex->fe_group = e4b->bd_group; 1239 1240 /* calc difference from given start */ 1241 next = next - ex->fe_start; 1242 ex->fe_len -= next; 1243 ex->fe_start += next; 1244 1245 while (needed > ex->fe_len && 1246 (buddy = mb_find_buddy(e4b, order, &max))) { 1247 1248 if (block + 1 >= max) 1249 break; 1250 1251 next = (block + 1) * (1 << order); 1252 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1253 break; 1254 1255 ord = mb_find_order_for_block(e4b, next); 1256 1257 order = ord; 1258 block = next >> order; 1259 ex->fe_len += 1 << order; 1260 } 1261 1262 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); 1263 return ex->fe_len; 1264 } 1265 1266 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1267 { 1268 int ord; 1269 int mlen = 0; 1270 int max = 0; 1271 int cur; 1272 int start = ex->fe_start; 1273 int len = ex->fe_len; 1274 unsigned ret = 0; 1275 int len0 = len; 1276 void *buddy; 1277 1278 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1279 BUG_ON(e4b->bd_group != ex->fe_group); 1280 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1281 mb_check_buddy(e4b); 1282 mb_mark_used_double(e4b, start, len); 1283 1284 e4b->bd_info->bb_free -= len; 1285 if (e4b->bd_info->bb_first_free == start) 1286 e4b->bd_info->bb_first_free += len; 1287 1288 /* let's maintain fragments counter */ 1289 if (start != 0) 1290 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1291 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1292 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1293 if (mlen && max) 1294 e4b->bd_info->bb_fragments++; 1295 else if (!mlen && !max) 1296 e4b->bd_info->bb_fragments--; 1297 1298 /* let's maintain buddy itself */ 1299 while (len) { 1300 ord = mb_find_order_for_block(e4b, start); 1301 1302 if 
(((start >> ord) << ord) == start && len >= (1 << ord)) { 1303 /* the whole chunk may be allocated at once! */ 1304 mlen = 1 << ord; 1305 buddy = mb_find_buddy(e4b, ord, &max); 1306 BUG_ON((start >> ord) >= max); 1307 mb_set_bit(start >> ord, buddy); 1308 e4b->bd_info->bb_counters[ord]--; 1309 start += mlen; 1310 len -= mlen; 1311 BUG_ON(len < 0); 1312 continue; 1313 } 1314 1315 /* store for history */ 1316 if (ret == 0) 1317 ret = len | (ord << 16); 1318 1319 /* we have to split large buddy */ 1320 BUG_ON(ord <= 0); 1321 buddy = mb_find_buddy(e4b, ord, &max); 1322 mb_set_bit(start >> ord, buddy); 1323 e4b->bd_info->bb_counters[ord]--; 1324 1325 ord--; 1326 cur = (start >> ord) & ~1U; 1327 buddy = mb_find_buddy(e4b, ord, &max); 1328 mb_clear_bit(cur, buddy); 1329 mb_clear_bit(cur + 1, buddy); 1330 e4b->bd_info->bb_counters[ord]++; 1331 e4b->bd_info->bb_counters[ord]++; 1332 } 1333 1334 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), 1335 EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1336 mb_check_buddy(e4b); 1337 1338 return ret; 1339 } 1340 1341 /* 1342 * Must be called under group lock! 1343 */ 1344 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1345 struct ext4_buddy *e4b) 1346 { 1347 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1348 int ret; 1349 1350 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1351 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1352 1353 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1354 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1355 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1356 1357 /* preallocation can change ac_b_ex, thus we store actually 1358 * allocated blocks for history */ 1359 ac->ac_f_ex = ac->ac_b_ex; 1360 1361 ac->ac_status = AC_STATUS_FOUND; 1362 ac->ac_tail = ret & 0xffff; 1363 ac->ac_buddy = ret >> 16; 1364 1365 /* 1366 * take the page reference. We want the page to be pinned 1367 * so that we don't get a ext4_mb_init_cache_call for this 1368 * group until we update the bitmap. That would mean we 1369 * double allocate blocks. 
The reference is dropped 1370 * in ext4_mb_release_context 1371 */ 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1373 get_page(ac->ac_bitmap_page); 1374 ac->ac_buddy_page = e4b->bd_buddy_page; 1375 get_page(ac->ac_buddy_page); 1376 /* on allocation we use ac to track the held semaphore */ 1377 ac->alloc_semp = e4b->alloc_semp; 1378 e4b->alloc_semp = NULL; 1379 /* store last allocated for subsequent stream allocation */ 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1381 spin_lock(&sbi->s_md_lock); 1382 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1383 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1384 spin_unlock(&sbi->s_md_lock); 1385 } 1386 } 1387 1388 /* 1389 * regular allocator, for general purposes allocation 1390 */ 1391 1392 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 1393 struct ext4_buddy *e4b, 1394 int finish_group) 1395 { 1396 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1397 struct ext4_free_extent *bex = &ac->ac_b_ex; 1398 struct ext4_free_extent *gex = &ac->ac_g_ex; 1399 struct ext4_free_extent ex; 1400 int max; 1401 1402 if (ac->ac_status == AC_STATUS_FOUND) 1403 return; 1404 /* 1405 * We don't want to scan for a whole year 1406 */ 1407 if (ac->ac_found > sbi->s_mb_max_to_scan && 1408 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1409 ac->ac_status = AC_STATUS_BREAK; 1410 return; 1411 } 1412 1413 /* 1414 * Haven't found good chunk so far, let's continue 1415 */ 1416 if (bex->fe_len < gex->fe_len) 1417 return; 1418 1419 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 1420 && bex->fe_group == e4b->bd_group) { 1421 /* recheck chunk's availability - we don't know 1422 * when it was found (within this lock-unlock 1423 * period or not) */ 1424 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1425 if (max >= gex->fe_len) { 1426 ext4_mb_use_best_found(ac, e4b); 1427 return; 1428 } 1429 } 1430 } 1431 1432 /* 1433 * The routine checks whether found extent is good enough. If it is, 1434 * then the extent gets marked used and flag is set to the context 1435 * to stop scanning. Otherwise, the extent is compared with the 1436 * previous found extent and if new one is better, then it's stored 1437 * in the context. Later, the best found extent will be used, if 1438 * mballoc can't find good enough extent. 1439 * 1440 * FIXME: real allocation policy is to be designed yet! 
1441 */ 1442 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 1443 struct ext4_free_extent *ex, 1444 struct ext4_buddy *e4b) 1445 { 1446 struct ext4_free_extent *bex = &ac->ac_b_ex; 1447 struct ext4_free_extent *gex = &ac->ac_g_ex; 1448 1449 BUG_ON(ex->fe_len <= 0); 1450 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1451 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1452 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1453 1454 ac->ac_found++; 1455 1456 /* 1457 * The special case - take what you catch first 1458 */ 1459 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1460 *bex = *ex; 1461 ext4_mb_use_best_found(ac, e4b); 1462 return; 1463 } 1464 1465 /* 1466 * Let's check whether the chuck is good enough 1467 */ 1468 if (ex->fe_len == gex->fe_len) { 1469 *bex = *ex; 1470 ext4_mb_use_best_found(ac, e4b); 1471 return; 1472 } 1473 1474 /* 1475 * If this is first found extent, just store it in the context 1476 */ 1477 if (bex->fe_len == 0) { 1478 *bex = *ex; 1479 return; 1480 } 1481 1482 /* 1483 * If new found extent is better, store it in the context 1484 */ 1485 if (bex->fe_len < gex->fe_len) { 1486 /* if the request isn't satisfied, any found extent 1487 * larger than previous best one is better */ 1488 if (ex->fe_len > bex->fe_len) 1489 *bex = *ex; 1490 } else if (ex->fe_len > gex->fe_len) { 1491 /* if the request is satisfied, then we try to find 1492 * an extent that still satisfy the request, but is 1493 * smaller than previous one */ 1494 if (ex->fe_len < bex->fe_len) 1495 *bex = *ex; 1496 } 1497 1498 ext4_mb_check_limits(ac, e4b, 0); 1499 } 1500 1501 static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1502 struct ext4_buddy *e4b) 1503 { 1504 struct ext4_free_extent ex = ac->ac_b_ex; 1505 ext4_group_t group = ex.fe_group; 1506 int max; 1507 int err; 1508 1509 BUG_ON(ex.fe_len <= 0); 1510 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1511 if (err) 1512 return err; 1513 1514 ext4_lock_group(ac->ac_sb, group); 1515 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1516 1517 if (max > 0) { 1518 ac->ac_b_ex = ex; 1519 ext4_mb_use_best_found(ac, e4b); 1520 } 1521 1522 ext4_unlock_group(ac->ac_sb, group); 1523 ext4_mb_release_desc(e4b); 1524 1525 return 0; 1526 } 1527 1528 static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1529 struct ext4_buddy *e4b) 1530 { 1531 ext4_group_t group = ac->ac_g_ex.fe_group; 1532 int max; 1533 int err; 1534 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1535 struct ext4_super_block *es = sbi->s_es; 1536 struct ext4_free_extent ex; 1537 1538 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1539 return 0; 1540 1541 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1542 if (err) 1543 return err; 1544 1545 ext4_lock_group(ac->ac_sb, group); 1546 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1547 ac->ac_g_ex.fe_len, &ex); 1548 1549 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1550 ext4_fsblk_t start; 1551 1552 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1553 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1554 /* use do_div to get remainder (would be 64-bit modulo) */ 1555 if (do_div(start, sbi->s_stripe) == 0) { 1556 ac->ac_found++; 1557 ac->ac_b_ex = ex; 1558 ext4_mb_use_best_found(ac, e4b); 1559 } 1560 } else if (max >= ac->ac_g_ex.fe_len) { 1561 BUG_ON(ex.fe_len <= 0); 1562 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1563 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1564 ac->ac_found++; 1565 ac->ac_b_ex = ex; 1566 
ext4_mb_use_best_found(ac, e4b); 1567 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 1568 /* Sometimes, caller may want to merge even small 1569 * number of blocks to an existing extent */ 1570 BUG_ON(ex.fe_len <= 0); 1571 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1572 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1573 ac->ac_found++; 1574 ac->ac_b_ex = ex; 1575 ext4_mb_use_best_found(ac, e4b); 1576 } 1577 ext4_unlock_group(ac->ac_sb, group); 1578 ext4_mb_release_desc(e4b); 1579 1580 return 0; 1581 } 1582 1583 /* 1584 * The routine scans buddy structures (not bitmap!) from given order 1585 * to max order and tries to find big enough chunk to satisfy the req 1586 */ 1587 static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1588 struct ext4_buddy *e4b) 1589 { 1590 struct super_block *sb = ac->ac_sb; 1591 struct ext4_group_info *grp = e4b->bd_info; 1592 void *buddy; 1593 int i; 1594 int k; 1595 int max; 1596 1597 BUG_ON(ac->ac_2order <= 0); 1598 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1599 if (grp->bb_counters[i] == 0) 1600 continue; 1601 1602 buddy = mb_find_buddy(e4b, i, &max); 1603 BUG_ON(buddy == NULL); 1604 1605 k = mb_find_next_zero_bit(buddy, max, 0); 1606 BUG_ON(k >= max); 1607 1608 ac->ac_found++; 1609 1610 ac->ac_b_ex.fe_len = 1 << i; 1611 ac->ac_b_ex.fe_start = k << i; 1612 ac->ac_b_ex.fe_group = e4b->bd_group; 1613 1614 ext4_mb_use_best_found(ac, e4b); 1615 1616 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 1617 1618 if (EXT4_SB(sb)->s_mb_stats) 1619 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 1620 1621 break; 1622 } 1623 } 1624 1625 /* 1626 * The routine scans the group and measures all found extents. 1627 * In order to optimize scanning, caller must pass number of 1628 * free blocks in the group, so the routine can know upper limit. 1629 */ 1630 static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1631 struct ext4_buddy *e4b) 1632 { 1633 struct super_block *sb = ac->ac_sb; 1634 void *bitmap = EXT4_MB_BITMAP(e4b); 1635 struct ext4_free_extent ex; 1636 int i; 1637 int free; 1638 1639 free = e4b->bd_info->bb_free; 1640 BUG_ON(free <= 0); 1641 1642 i = e4b->bd_info->bb_first_free; 1643 1644 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1645 i = mb_find_next_zero_bit(bitmap, 1646 EXT4_BLOCKS_PER_GROUP(sb), i); 1647 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1648 /* 1649 * IF we have corrupt bitmap, we won't find any 1650 * free blocks even though group info says we 1651 * we have free blocks 1652 */ 1653 ext4_grp_locked_error(sb, e4b->bd_group, 1654 __func__, "%d free blocks as per " 1655 "group info. But bitmap says 0", 1656 free); 1657 break; 1658 } 1659 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1661 BUG_ON(ex.fe_len <= 0); 1662 if (free < ex.fe_len) { 1663 ext4_grp_locked_error(sb, e4b->bd_group, 1664 __func__, "%d free blocks as per " 1665 "group info. But got %d blocks", 1666 free, ex.fe_len); 1667 /* 1668 * The number of free blocks differs. This mostly 1669 * indicate that the bitmap is corrupt. So exit 1670 * without claiming the space. 
1671 */ 1672 break; 1673 } 1674 1675 ext4_mb_measure_extent(ac, &ex, e4b); 1676 1677 i += ex.fe_len; 1678 free -= ex.fe_len; 1679 } 1680 1681 ext4_mb_check_limits(ac, e4b, 1); 1682 } 1683 1684 /* 1685 * This is a special case for storages like raid5 1686 * we try to find stripe-aligned chunks for stripe-size requests 1687 * XXX should do so at least for multiples of stripe size as well 1688 */ 1689 static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1690 struct ext4_buddy *e4b) 1691 { 1692 struct super_block *sb = ac->ac_sb; 1693 struct ext4_sb_info *sbi = EXT4_SB(sb); 1694 void *bitmap = EXT4_MB_BITMAP(e4b); 1695 struct ext4_free_extent ex; 1696 ext4_fsblk_t first_group_block; 1697 ext4_fsblk_t a; 1698 ext4_grpblk_t i; 1699 int max; 1700 1701 BUG_ON(sbi->s_stripe == 0); 1702 1703 /* find first stripe-aligned block in group */ 1704 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1705 + le32_to_cpu(sbi->s_es->s_first_data_block); 1706 a = first_group_block + sbi->s_stripe - 1; 1707 do_div(a, sbi->s_stripe); 1708 i = (a * sbi->s_stripe) - first_group_block; 1709 1710 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1711 if (!mb_test_bit(i, bitmap)) { 1712 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1713 if (max >= sbi->s_stripe) { 1714 ac->ac_found++; 1715 ac->ac_b_ex = ex; 1716 ext4_mb_use_best_found(ac, e4b); 1717 break; 1718 } 1719 } 1720 i += sbi->s_stripe; 1721 } 1722 } 1723 1724 static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1725 ext4_group_t group, int cr) 1726 { 1727 unsigned free, fragments; 1728 unsigned i, bits; 1729 struct ext4_group_desc *desc; 1730 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1731 1732 BUG_ON(cr < 0 || cr >= 4); 1733 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1734 1735 free = grp->bb_free; 1736 fragments = grp->bb_fragments; 1737 if (free == 0) 1738 return 0; 1739 if (fragments == 0) 1740 return 0; 1741 1742 switch (cr) { 1743 case 0: 1744 BUG_ON(ac->ac_2order == 0); 1745 /* If this group is uninitialized, skip it initially */ 1746 desc = ext4_get_group_desc(ac->ac_sb, group, NULL); 1747 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1748 return 0; 1749 1750 bits = ac->ac_sb->s_blocksize_bits + 1; 1751 for (i = ac->ac_2order; i <= bits; i++) 1752 if (grp->bb_counters[i] > 0) 1753 return 1; 1754 break; 1755 case 1: 1756 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1757 return 1; 1758 break; 1759 case 2: 1760 if (free >= ac->ac_g_ex.fe_len) 1761 return 1; 1762 break; 1763 case 3: 1764 return 1; 1765 default: 1766 BUG(); 1767 } 1768 1769 return 0; 1770 } 1771 1772 /* 1773 * lock the group_info alloc_sem of all the groups 1774 * belonging to the same buddy cache page. This 1775 * make sure other parallel operation on the buddy 1776 * cache doesn't happen whild holding the buddy cache 1777 * lock 1778 */ 1779 int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) 1780 { 1781 int i; 1782 int block, pnum; 1783 int blocks_per_page; 1784 int groups_per_page; 1785 ext4_group_t first_group; 1786 struct ext4_group_info *grp; 1787 1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1789 /* 1790 * the buddy cache inode stores the block bitmap 1791 * and buddy information in consecutive blocks. 1792 * So for each group we need two blocks. 
1793 */ 1794 block = group * 2; 1795 pnum = block / blocks_per_page; 1796 first_group = pnum * blocks_per_page / 2; 1797 1798 groups_per_page = blocks_per_page >> 1; 1799 if (groups_per_page == 0) 1800 groups_per_page = 1; 1801 /* read all groups the page covers into the cache */ 1802 for (i = 0; i < groups_per_page; i++) { 1803 1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) 1805 break; 1806 grp = ext4_get_group_info(sb, first_group + i); 1807 /* take all groups write allocation 1808 * semaphore. This make sure there is 1809 * no block allocation going on in any 1810 * of that groups 1811 */ 1812 down_write_nested(&grp->alloc_sem, i); 1813 } 1814 return i; 1815 } 1816 1817 void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1818 ext4_group_t group, int locked_group) 1819 { 1820 int i; 1821 int block, pnum; 1822 int blocks_per_page; 1823 ext4_group_t first_group; 1824 struct ext4_group_info *grp; 1825 1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1827 /* 1828 * the buddy cache inode stores the block bitmap 1829 * and buddy information in consecutive blocks. 1830 * So for each group we need two blocks. 1831 */ 1832 block = group * 2; 1833 pnum = block / blocks_per_page; 1834 first_group = pnum * blocks_per_page / 2; 1835 /* release locks on all the groups */ 1836 for (i = 0; i < locked_group; i++) { 1837 1838 grp = ext4_get_group_info(sb, first_group + i); 1839 /* take all groups write allocation 1840 * semaphore. This make sure there is 1841 * no block allocation going on in any 1842 * of that groups 1843 */ 1844 up_write(&grp->alloc_sem); 1845 } 1846 1847 } 1848 1849 static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1850 { 1851 1852 int ret; 1853 void *bitmap; 1854 int blocks_per_page; 1855 int block, pnum, poff; 1856 int num_grp_locked = 0; 1857 struct ext4_group_info *this_grp; 1858 struct ext4_sb_info *sbi = EXT4_SB(sb); 1859 struct inode *inode = sbi->s_buddy_cache; 1860 struct page *page = NULL, *bitmap_page = NULL; 1861 1862 mb_debug("init group %lu\n", group); 1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1864 this_grp = ext4_get_group_info(sb, group); 1865 /* 1866 * This ensures we don't add group 1867 * to this buddy cache via resize 1868 */ 1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1871 /* 1872 * somebody initialized the group 1873 * return without doing anything 1874 */ 1875 ret = 0; 1876 goto err; 1877 } 1878 /* 1879 * the buddy cache inode stores the block bitmap 1880 * and buddy information in consecutive blocks. 1881 * So for each group we need two blocks. 
1882 */ 1883 block = group * 2; 1884 pnum = block / blocks_per_page; 1885 poff = block % blocks_per_page; 1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1887 if (page) { 1888 BUG_ON(page->mapping != inode->i_mapping); 1889 ret = ext4_mb_init_cache(page, NULL); 1890 if (ret) { 1891 unlock_page(page); 1892 goto err; 1893 } 1894 unlock_page(page); 1895 } 1896 if (page == NULL || !PageUptodate(page)) { 1897 ret = -EIO; 1898 goto err; 1899 } 1900 mark_page_accessed(page); 1901 bitmap_page = page; 1902 bitmap = page_address(page) + (poff * sb->s_blocksize); 1903 1904 /* init buddy cache */ 1905 block++; 1906 pnum = block / blocks_per_page; 1907 poff = block % blocks_per_page; 1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1909 if (page == bitmap_page) { 1910 /* 1911 * If both the bitmap and buddy are in 1912 * the same page we don't need to force 1913 * init the buddy 1914 */ 1915 unlock_page(page); 1916 } else if (page) { 1917 BUG_ON(page->mapping != inode->i_mapping); 1918 ret = ext4_mb_init_cache(page, bitmap); 1919 if (ret) { 1920 unlock_page(page); 1921 goto err; 1922 } 1923 unlock_page(page); 1924 } 1925 if (page == NULL || !PageUptodate(page)) { 1926 ret = -EIO; 1927 goto err; 1928 } 1929 mark_page_accessed(page); 1930 err: 1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1932 if (bitmap_page) 1933 page_cache_release(bitmap_page); 1934 if (page) 1935 page_cache_release(page); 1936 return ret; 1937 } 1938 1939 static noinline_for_stack int 1940 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1941 { 1942 ext4_group_t group; 1943 ext4_group_t i; 1944 int cr; 1945 int err = 0; 1946 int bsbits; 1947 struct ext4_sb_info *sbi; 1948 struct super_block *sb; 1949 struct ext4_buddy e4b; 1950 loff_t size, isize; 1951 1952 sb = ac->ac_sb; 1953 sbi = EXT4_SB(sb); 1954 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1955 1956 /* first, try the goal */ 1957 err = ext4_mb_find_by_goal(ac, &e4b); 1958 if (err || ac->ac_status == AC_STATUS_FOUND) 1959 goto out; 1960 1961 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 1962 goto out; 1963 1964 /* 1965 * ac->ac2_order is set only if the fe_len is a power of 2 1966 * if ac2_order is set we also set criteria to 0 so that we 1967 * try exact allocation using buddy. 1968 */ 1969 i = fls(ac->ac_g_ex.fe_len); 1970 ac->ac_2order = 0; 1971 /* 1972 * We search using buddy data only if the order of the request 1973 * is greater than equal to the sbi_s_mb_order2_reqs 1974 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1975 */ 1976 if (i >= sbi->s_mb_order2_reqs) { 1977 /* 1978 * This should tell if fe_len is exactly power of 2 1979 */ 1980 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 1981 ac->ac_2order = i - 1; 1982 } 1983 1984 bsbits = ac->ac_sb->s_blocksize_bits; 1985 /* if stream allocation is enabled, use global goal */ 1986 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 1987 isize = i_size_read(ac->ac_inode) >> bsbits; 1988 if (size < isize) 1989 size = isize; 1990 1991 if (size < sbi->s_mb_stream_request && 1992 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 1993 /* TBD: may be hot point */ 1994 spin_lock(&sbi->s_md_lock); 1995 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 1996 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1997 spin_unlock(&sbi->s_md_lock); 1998 } 1999 /* Let's just scan groups to find more-less suitable blocks */ 2000 cr = ac->ac_2order ? 
0 : 1; 2001 /* 2002 * cr == 0 try to get exact allocation, 2003 * cr == 3 try to get anything 2004 */ 2005 repeat: 2006 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2007 ac->ac_criteria = cr; 2008 /* 2009 * searching for the right group start 2010 * from the goal value specified 2011 */ 2012 group = ac->ac_g_ex.fe_group; 2013 2014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 2015 struct ext4_group_info *grp; 2016 struct ext4_group_desc *desc; 2017 2018 if (group == EXT4_SB(sb)->s_groups_count) 2019 group = 0; 2020 2021 /* quick check to skip empty groups */ 2022 grp = ext4_get_group_info(sb, group); 2023 if (grp->bb_free == 0) 2024 continue; 2025 2026 /* 2027 * if the group is already init we check whether it is 2028 * a good group and if not we don't load the buddy 2029 */ 2030 if (EXT4_MB_GRP_NEED_INIT(grp)) { 2031 /* 2032 * we need full data about the group 2033 * to make a good selection 2034 */ 2035 err = ext4_mb_init_group(sb, group); 2036 if (err) 2037 goto out; 2038 } 2039 2040 /* 2041 * If the particular group doesn't satisfy our 2042 * criteria we continue with the next group 2043 */ 2044 if (!ext4_mb_good_group(ac, group, cr)) 2045 continue; 2046 2047 err = ext4_mb_load_buddy(sb, group, &e4b); 2048 if (err) 2049 goto out; 2050 2051 ext4_lock_group(sb, group); 2052 if (!ext4_mb_good_group(ac, group, cr)) { 2053 /* someone did allocation from this group */ 2054 ext4_unlock_group(sb, group); 2055 ext4_mb_release_desc(&e4b); 2056 continue; 2057 } 2058 2059 ac->ac_groups_scanned++; 2060 desc = ext4_get_group_desc(sb, group, NULL); 2061 if (cr == 0 || (desc->bg_flags & 2062 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && 2063 ac->ac_2order != 0)) 2064 ext4_mb_simple_scan_group(ac, &e4b); 2065 else if (cr == 1 && 2066 ac->ac_g_ex.fe_len == sbi->s_stripe) 2067 ext4_mb_scan_aligned(ac, &e4b); 2068 else 2069 ext4_mb_complex_scan_group(ac, &e4b); 2070 2071 ext4_unlock_group(sb, group); 2072 ext4_mb_release_desc(&e4b); 2073 2074 if (ac->ac_status != AC_STATUS_CONTINUE) 2075 break; 2076 } 2077 } 2078 2079 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2080 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2081 /* 2082 * We've been searching too long. Let's try to allocate 2083 * the best chunk we've found so far 2084 */ 2085 2086 ext4_mb_try_best_found(ac, &e4b); 2087 if (ac->ac_status != AC_STATUS_FOUND) { 2088 /* 2089 * Someone more lucky has already allocated it. 
2090 * The only thing we can do is just take first 2091 * found block(s) 2092 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 2093 */ 2094 ac->ac_b_ex.fe_group = 0; 2095 ac->ac_b_ex.fe_start = 0; 2096 ac->ac_b_ex.fe_len = 0; 2097 ac->ac_status = AC_STATUS_CONTINUE; 2098 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2099 cr = 3; 2100 atomic_inc(&sbi->s_mb_lost_chunks); 2101 goto repeat; 2102 } 2103 } 2104 out: 2105 return err; 2106 } 2107 2108 #ifdef EXT4_MB_HISTORY 2109 struct ext4_mb_proc_session { 2110 struct ext4_mb_history *history; 2111 struct super_block *sb; 2112 int start; 2113 int max; 2114 }; 2115 2116 static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, 2117 struct ext4_mb_history *hs, 2118 int first) 2119 { 2120 if (hs == s->history + s->max) 2121 hs = s->history; 2122 if (!first && hs == s->history + s->start) 2123 return NULL; 2124 while (hs->orig.fe_len == 0) { 2125 hs++; 2126 if (hs == s->history + s->max) 2127 hs = s->history; 2128 if (hs == s->history + s->start) 2129 return NULL; 2130 } 2131 return hs; 2132 } 2133 2134 static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) 2135 { 2136 struct ext4_mb_proc_session *s = seq->private; 2137 struct ext4_mb_history *hs; 2138 int l = *pos; 2139 2140 if (l == 0) 2141 return SEQ_START_TOKEN; 2142 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); 2143 if (!hs) 2144 return NULL; 2145 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); 2146 return hs; 2147 } 2148 2149 static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, 2150 loff_t *pos) 2151 { 2152 struct ext4_mb_proc_session *s = seq->private; 2153 struct ext4_mb_history *hs = v; 2154 2155 ++*pos; 2156 if (v == SEQ_START_TOKEN) 2157 return ext4_mb_history_skip_empty(s, s->history + s->start, 1); 2158 else 2159 return ext4_mb_history_skip_empty(s, ++hs, 0); 2160 } 2161 2162 static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) 2163 { 2164 char buf[25], buf2[25], buf3[25], *fmt; 2165 struct ext4_mb_history *hs = v; 2166 2167 if (v == SEQ_START_TOKEN) { 2168 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2169 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2170 "pid", "inode", "original", "goal", "result", "found", 2171 "grps", "cr", "flags", "merge", "tail", "broken"); 2172 return 0; 2173 } 2174 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2177 "%-5u %-5s %-5u %-6u\n"; 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2179 hs->result.fe_start, hs->result.fe_len, 2180 hs->result.fe_logical); 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, 2182 hs->orig.fe_start, hs->orig.fe_len, 2183 hs->orig.fe_logical); 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, 2185 hs->goal.fe_start, hs->goal.fe_len, 2186 hs->goal.fe_logical); 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2188 hs->found, hs->groups, hs->cr, hs->flags, 2189 hs->merged ? "M" : "", hs->tail, 2190 hs->buddy ? 
1 << hs->buddy : 0); 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2194 hs->result.fe_start, hs->result.fe_len, 2195 hs->result.fe_logical); 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, 2197 hs->orig.fe_start, hs->orig.fe_len, 2198 hs->orig.fe_logical); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group, 2202 hs->result.fe_start, hs->result.fe_len); 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2204 hs->pid, hs->ino, buf2); 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group, 2207 hs->result.fe_start, hs->result.fe_len); 2208 seq_printf(seq, "%-5u %-8u %-23s free\n", 2209 hs->pid, hs->ino, buf2); 2210 } 2211 return 0; 2212 } 2213 2214 static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) 2215 { 2216 } 2217 2218 static struct seq_operations ext4_mb_seq_history_ops = { 2219 .start = ext4_mb_seq_history_start, 2220 .next = ext4_mb_seq_history_next, 2221 .stop = ext4_mb_seq_history_stop, 2222 .show = ext4_mb_seq_history_show, 2223 }; 2224 2225 static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) 2226 { 2227 struct super_block *sb = PDE(inode)->data; 2228 struct ext4_sb_info *sbi = EXT4_SB(sb); 2229 struct ext4_mb_proc_session *s; 2230 int rc; 2231 int size; 2232 2233 if (unlikely(sbi->s_mb_history == NULL)) 2234 return -ENOMEM; 2235 s = kmalloc(sizeof(*s), GFP_KERNEL); 2236 if (s == NULL) 2237 return -ENOMEM; 2238 s->sb = sb; 2239 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; 2240 s->history = kmalloc(size, GFP_KERNEL); 2241 if (s->history == NULL) { 2242 kfree(s); 2243 return -ENOMEM; 2244 } 2245 2246 spin_lock(&sbi->s_mb_history_lock); 2247 memcpy(s->history, sbi->s_mb_history, size); 2248 s->max = sbi->s_mb_history_max; 2249 s->start = sbi->s_mb_history_cur % s->max; 2250 spin_unlock(&sbi->s_mb_history_lock); 2251 2252 rc = seq_open(file, &ext4_mb_seq_history_ops); 2253 if (rc == 0) { 2254 struct seq_file *m = (struct seq_file *)file->private_data; 2255 m->private = s; 2256 } else { 2257 kfree(s->history); 2258 kfree(s); 2259 } 2260 return rc; 2261 2262 } 2263 2264 static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) 2265 { 2266 struct seq_file *seq = (struct seq_file *)file->private_data; 2267 struct ext4_mb_proc_session *s = seq->private; 2268 kfree(s->history); 2269 kfree(s); 2270 return seq_release(inode, file); 2271 } 2272 2273 static ssize_t ext4_mb_seq_history_write(struct file *file, 2274 const char __user *buffer, 2275 size_t count, loff_t *ppos) 2276 { 2277 struct seq_file *seq = (struct seq_file *)file->private_data; 2278 struct ext4_mb_proc_session *s = seq->private; 2279 struct super_block *sb = s->sb; 2280 char str[32]; 2281 int value; 2282 2283 if (count >= sizeof(str)) { 2284 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", 2285 "mb_history", (int)sizeof(str)); 2286 return -EOVERFLOW; 2287 } 2288 2289 if (copy_from_user(str, buffer, count)) 2290 return -EFAULT; 2291 2292 value = simple_strtol(str, NULL, 0); 2293 if (value < 0) 2294 return -ERANGE; 2295 EXT4_SB(sb)->s_mb_history_filter = value; 2296 2297 return count; 2298 } 2299 2300 static struct file_operations ext4_mb_seq_history_fops = { 2301 .owner = THIS_MODULE, 2302 .open = ext4_mb_seq_history_open, 2303 .read = seq_read, 2304 .write = 
ext4_mb_seq_history_write, 2305 .llseek = seq_lseek, 2306 .release = ext4_mb_seq_history_release, 2307 }; 2308 2309 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2310 { 2311 struct super_block *sb = seq->private; 2312 struct ext4_sb_info *sbi = EXT4_SB(sb); 2313 ext4_group_t group; 2314 2315 if (*pos < 0 || *pos >= sbi->s_groups_count) 2316 return NULL; 2317 2318 group = *pos + 1; 2319 return (void *) ((unsigned long) group); 2320 } 2321 2322 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2323 { 2324 struct super_block *sb = seq->private; 2325 struct ext4_sb_info *sbi = EXT4_SB(sb); 2326 ext4_group_t group; 2327 2328 ++*pos; 2329 if (*pos < 0 || *pos >= sbi->s_groups_count) 2330 return NULL; 2331 group = *pos + 1; 2332 return (void *) ((unsigned long) group); 2333 } 2334 2335 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2336 { 2337 struct super_block *sb = seq->private; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2339 int i; 2340 int err; 2341 struct ext4_buddy e4b; 2342 struct sg { 2343 struct ext4_group_info info; 2344 unsigned short counters[16]; 2345 } sg; 2346 2347 group--; 2348 if (group == 0) 2349 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2350 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2351 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2352 "group", "free", "frags", "first", 2353 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2354 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2355 2356 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2357 sizeof(struct ext4_group_info); 2358 err = ext4_mb_load_buddy(sb, group, &e4b); 2359 if (err) { 2360 seq_printf(seq, "#%-5u: I/O error\n", group); 2361 return 0; 2362 } 2363 ext4_lock_group(sb, group); 2364 memcpy(&sg, ext4_get_group_info(sb, group), i); 2365 ext4_unlock_group(sb, group); 2366 ext4_mb_release_desc(&e4b); 2367 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2369 sg.info.bb_fragments, sg.info.bb_first_free); 2370 for (i = 0; i <= 13; i++) 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
2372 sg.info.bb_counters[i] : 0); 2373 seq_printf(seq, " ]\n"); 2374 2375 return 0; 2376 } 2377 2378 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2379 { 2380 } 2381 2382 static struct seq_operations ext4_mb_seq_groups_ops = { 2383 .start = ext4_mb_seq_groups_start, 2384 .next = ext4_mb_seq_groups_next, 2385 .stop = ext4_mb_seq_groups_stop, 2386 .show = ext4_mb_seq_groups_show, 2387 }; 2388 2389 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2390 { 2391 struct super_block *sb = PDE(inode)->data; 2392 int rc; 2393 2394 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2395 if (rc == 0) { 2396 struct seq_file *m = (struct seq_file *)file->private_data; 2397 m->private = sb; 2398 } 2399 return rc; 2400 2401 } 2402 2403 static struct file_operations ext4_mb_seq_groups_fops = { 2404 .owner = THIS_MODULE, 2405 .open = ext4_mb_seq_groups_open, 2406 .read = seq_read, 2407 .llseek = seq_lseek, 2408 .release = seq_release, 2409 }; 2410 2411 static void ext4_mb_history_release(struct super_block *sb) 2412 { 2413 struct ext4_sb_info *sbi = EXT4_SB(sb); 2414 2415 if (sbi->s_proc != NULL) { 2416 remove_proc_entry("mb_groups", sbi->s_proc); 2417 remove_proc_entry("mb_history", sbi->s_proc); 2418 } 2419 kfree(sbi->s_mb_history); 2420 } 2421 2422 static void ext4_mb_history_init(struct super_block *sb) 2423 { 2424 struct ext4_sb_info *sbi = EXT4_SB(sb); 2425 int i; 2426 2427 if (sbi->s_proc != NULL) { 2428 proc_create_data("mb_history", S_IRUGO, sbi->s_proc, 2429 &ext4_mb_seq_history_fops, sb); 2430 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2431 &ext4_mb_seq_groups_fops, sb); 2432 } 2433 2434 sbi->s_mb_history_max = 1000; 2435 sbi->s_mb_history_cur = 0; 2436 spin_lock_init(&sbi->s_mb_history_lock); 2437 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2438 sbi->s_mb_history = kzalloc(i, GFP_KERNEL); 2439 /* if we can't allocate history, then we simple won't use it */ 2440 } 2441 2442 static noinline_for_stack void 2443 ext4_mb_store_history(struct ext4_allocation_context *ac) 2444 { 2445 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2446 struct ext4_mb_history h; 2447 2448 if (unlikely(sbi->s_mb_history == NULL)) 2449 return; 2450 2451 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2452 return; 2453 2454 h.op = ac->ac_op; 2455 h.pid = current->pid; 2456 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; 2457 h.orig = ac->ac_o_ex; 2458 h.result = ac->ac_b_ex; 2459 h.flags = ac->ac_flags; 2460 h.found = ac->ac_found; 2461 h.groups = ac->ac_groups_scanned; 2462 h.cr = ac->ac_criteria; 2463 h.tail = ac->ac_tail; 2464 h.buddy = ac->ac_buddy; 2465 h.merged = 0; 2466 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { 2467 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 2468 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 2469 h.merged = 1; 2470 h.goal = ac->ac_g_ex; 2471 h.result = ac->ac_f_ex; 2472 } 2473 2474 spin_lock(&sbi->s_mb_history_lock); 2475 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); 2476 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) 2477 sbi->s_mb_history_cur = 0; 2478 spin_unlock(&sbi->s_mb_history_lock); 2479 } 2480 2481 #else 2482 #define ext4_mb_history_release(sb) 2483 #define ext4_mb_history_init(sb) 2484 #endif 2485 2486 2487 /* Create and initialize ext4_group_info data for the given group. 
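 * The group_info pointers are kept in a two-level table indexed as
 * sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]
 *                  [group & (EXT4_DESC_PER_BLOCK(sb) - 1)];
 * for example, assuming 4KB blocks and 32-byte descriptors
 * (EXT4_DESC_PER_BLOCK == 128), group 200 lives in slot [1][72].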
*/ 2488 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2489 struct ext4_group_desc *desc) 2490 { 2491 int i, len; 2492 int metalen = 0; 2493 struct ext4_sb_info *sbi = EXT4_SB(sb); 2494 struct ext4_group_info **meta_group_info; 2495 2496 /* 2497 * First check if this group is the first of a reserved block. 2498 * If it's true, we have to allocate a new table of pointers 2499 * to ext4_group_info structures 2500 */ 2501 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2502 metalen = sizeof(*meta_group_info) << 2503 EXT4_DESC_PER_BLOCK_BITS(sb); 2504 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2505 if (meta_group_info == NULL) { 2506 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2507 "buddy group\n"); 2508 goto exit_meta_group_info; 2509 } 2510 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2511 meta_group_info; 2512 } 2513 2514 /* 2515 * calculate needed size. if change bb_counters size, 2516 * don't forget about ext4_mb_generate_buddy() 2517 */ 2518 len = offsetof(typeof(**meta_group_info), 2519 bb_counters[sb->s_blocksize_bits + 2]); 2520 2521 meta_group_info = 2522 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2523 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2524 2525 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2526 if (meta_group_info[i] == NULL) { 2527 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2528 goto exit_group_info; 2529 } 2530 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2531 &(meta_group_info[i]->bb_state)); 2532 2533 /* 2534 * initialize bb_free to be able to skip 2535 * empty groups without initialization 2536 */ 2537 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2538 meta_group_info[i]->bb_free = 2539 ext4_free_blocks_after_init(sb, group, desc); 2540 } else { 2541 meta_group_info[i]->bb_free = 2542 ext4_free_blks_count(sb, desc); 2543 } 2544 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2546 init_rwsem(&meta_group_info[i]->alloc_sem); 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2548 2549 #ifdef DOUBLE_CHECK 2550 { 2551 struct buffer_head *bh; 2552 meta_group_info[i]->bb_bitmap = 2553 kmalloc(sb->s_blocksize, GFP_KERNEL); 2554 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2555 bh = ext4_read_block_bitmap(sb, group); 2556 BUG_ON(bh == NULL); 2557 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2558 sb->s_blocksize); 2559 put_bh(bh); 2560 } 2561 #endif 2562 2563 return 0; 2564 2565 exit_group_info: 2566 /* If a meta_group_info table has been allocated, release it now */ 2567 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2568 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2569 exit_meta_group_info: 2570 return -ENOMEM; 2571 } /* ext4_mb_add_groupinfo */ 2572 2573 /* 2574 * Update an existing group. 
2575  * This function is used for online resize
2576  */
2577 void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2578 {
2579 	grp->bb_free += add;
2580 }
2581
2582 static int ext4_mb_init_backend(struct super_block *sb)
2583 {
2584 	ext4_group_t i;
2585 	int metalen;
2586 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2587 	struct ext4_super_block *es = sbi->s_es;
2588 	int num_meta_group_infos;
2589 	int num_meta_group_infos_max;
2590 	int array_size;
2591 	struct ext4_group_info **meta_group_info;
2592 	struct ext4_group_desc *desc;
2593
2594 	/* This is the number of blocks used by GDT */
2595 	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
2596 				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2597
2598 	/*
2599 	 * This is the total number of blocks used by GDT including
2600 	 * the number of reserved blocks for GDT.
2601 	 * The s_group_info array is allocated with this value
2602 	 * to allow a clean online resize without a complex
2603 	 * manipulation of pointers.
2604 	 * The drawback is that some memory stays unused when no
2605 	 * resize occurs, but it is very low in terms of pages
2606 	 * (see the comments below)
2607 	 * Need to handle this properly when META_BG resizing is allowed
2608 	 */
2609 	num_meta_group_infos_max = num_meta_group_infos +
2610 		le16_to_cpu(es->s_reserved_gdt_blocks);
2611
2612 	/*
2613 	 * array_size is the size of the s_group_info array. We round it up
2614 	 * to the next power of two because kmalloc rounds internally
2615 	 * anyway, so the extra memory comes for free here (e.g. it may
2616 	 * be used for a META_BG resize).
2617 	 */
2618 	array_size = 1;
2619 	while (array_size < sizeof(*sbi->s_group_info) *
2620 	       num_meta_group_infos_max)
2621 		array_size = array_size << 1;
2622 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2623 	 * kmalloc. A 128KB kmalloc should suffice for a 256TB filesystem.
2624 	 * So a two level scheme suffices for now.
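	 * (Rough arithmetic, assuming 4KB blocks, 32768 blocks per group and
	 * 128 descriptors per block: 8TB gives 2^16 groups, i.e. 512 table
	 * pointers * 8 bytes = 4KB; 256TB gives 2^21 groups, i.e. 16384
	 * pointers * 8 bytes = 128KB.)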
*/ 2625 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2626 if (sbi->s_group_info == NULL) { 2627 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2628 return -ENOMEM; 2629 } 2630 sbi->s_buddy_cache = new_inode(sb); 2631 if (sbi->s_buddy_cache == NULL) { 2632 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2633 goto err_freesgi; 2634 } 2635 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2636 2637 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); 2638 for (i = 0; i < num_meta_group_infos; i++) { 2639 if ((i + 1) == num_meta_group_infos) 2640 metalen = sizeof(*meta_group_info) * 2641 (sbi->s_groups_count - 2642 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2643 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2644 if (meta_group_info == NULL) { 2645 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2646 "buddy group\n"); 2647 goto err_freemeta; 2648 } 2649 sbi->s_group_info[i] = meta_group_info; 2650 } 2651 2652 for (i = 0; i < sbi->s_groups_count; i++) { 2653 desc = ext4_get_group_desc(sb, i, NULL); 2654 if (desc == NULL) { 2655 printk(KERN_ERR 2656 "EXT4-fs: can't read descriptor %u\n", i); 2657 goto err_freebuddy; 2658 } 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2660 goto err_freebuddy; 2661 } 2662 2663 return 0; 2664 2665 err_freebuddy: 2666 while (i-- > 0) 2667 kfree(ext4_get_group_info(sb, i)); 2668 i = num_meta_group_infos; 2669 err_freemeta: 2670 while (i-- > 0) 2671 kfree(sbi->s_group_info[i]); 2672 iput(sbi->s_buddy_cache); 2673 err_freesgi: 2674 kfree(sbi->s_group_info); 2675 return -ENOMEM; 2676 } 2677 2678 int ext4_mb_init(struct super_block *sb, int needs_recovery) 2679 { 2680 struct ext4_sb_info *sbi = EXT4_SB(sb); 2681 unsigned i, j; 2682 unsigned offset; 2683 unsigned max; 2684 int ret; 2685 2686 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2687 2688 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2689 if (sbi->s_mb_offsets == NULL) { 2690 return -ENOMEM; 2691 } 2692 2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2695 if (sbi->s_mb_maxs == NULL) { 2696 kfree(sbi->s_mb_maxs); 2697 return -ENOMEM; 2698 } 2699 2700 /* order 0 is regular bitmap */ 2701 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2702 sbi->s_mb_offsets[0] = 0; 2703 2704 i = 1; 2705 offset = 0; 2706 max = sb->s_blocksize << 2; 2707 do { 2708 sbi->s_mb_offsets[i] = offset; 2709 sbi->s_mb_maxs[i] = max; 2710 offset += 1 << (sb->s_blocksize_bits - i); 2711 max = max >> 1; 2712 i++; 2713 } while (i <= sb->s_blocksize_bits + 1); 2714 2715 /* init file for buddy data */ 2716 ret = ext4_mb_init_backend(sb); 2717 if (ret != 0) { 2718 kfree(sbi->s_mb_offsets); 2719 kfree(sbi->s_mb_maxs); 2720 return ret; 2721 } 2722 2723 spin_lock_init(&sbi->s_md_lock); 2724 spin_lock_init(&sbi->s_bal_lock); 2725 2726 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2727 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2728 sbi->s_mb_stats = MB_DEFAULT_STATS; 2729 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2730 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2731 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2732 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2733 2734 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2735 if (sbi->s_locality_groups == NULL) { 2736 kfree(sbi->s_mb_offsets); 2737 kfree(sbi->s_mb_maxs); 2738 return -ENOMEM; 2739 } 2740 for_each_possible_cpu(i) { 2741 struct ext4_locality_group *lg; 2742 lg = per_cpu_ptr(sbi->s_locality_groups, i); 2743 mutex_init(&lg->lg_mutex); 
2744 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2745 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2746 spin_lock_init(&lg->lg_prealloc_lock); 2747 } 2748 2749 ext4_mb_init_per_dev_proc(sb); 2750 ext4_mb_history_init(sb); 2751 2752 if (sbi->s_journal) 2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2754 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2756 return 0; 2757 } 2758 2759 /* need to called with ext4 group lock (ext4_lock_group) */ 2760 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2761 { 2762 struct ext4_prealloc_space *pa; 2763 struct list_head *cur, *tmp; 2764 int count = 0; 2765 2766 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2767 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2768 list_del(&pa->pa_group_list); 2769 count++; 2770 kmem_cache_free(ext4_pspace_cachep, pa); 2771 } 2772 if (count) 2773 mb_debug("mballoc: %u PAs left\n", count); 2774 2775 } 2776 2777 int ext4_mb_release(struct super_block *sb) 2778 { 2779 ext4_group_t i; 2780 int num_meta_group_infos; 2781 struct ext4_group_info *grinfo; 2782 struct ext4_sb_info *sbi = EXT4_SB(sb); 2783 2784 if (sbi->s_group_info) { 2785 for (i = 0; i < sbi->s_groups_count; i++) { 2786 grinfo = ext4_get_group_info(sb, i); 2787 #ifdef DOUBLE_CHECK 2788 kfree(grinfo->bb_bitmap); 2789 #endif 2790 ext4_lock_group(sb, i); 2791 ext4_mb_cleanup_pa(grinfo); 2792 ext4_unlock_group(sb, i); 2793 kfree(grinfo); 2794 } 2795 num_meta_group_infos = (sbi->s_groups_count + 2796 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2797 EXT4_DESC_PER_BLOCK_BITS(sb); 2798 for (i = 0; i < num_meta_group_infos; i++) 2799 kfree(sbi->s_group_info[i]); 2800 kfree(sbi->s_group_info); 2801 } 2802 kfree(sbi->s_mb_offsets); 2803 kfree(sbi->s_mb_maxs); 2804 if (sbi->s_buddy_cache) 2805 iput(sbi->s_buddy_cache); 2806 if (sbi->s_mb_stats) { 2807 printk(KERN_INFO 2808 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2809 atomic_read(&sbi->s_bal_allocated), 2810 atomic_read(&sbi->s_bal_reqs), 2811 atomic_read(&sbi->s_bal_success)); 2812 printk(KERN_INFO 2813 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2814 "%u 2^N hits, %u breaks, %u lost\n", 2815 atomic_read(&sbi->s_bal_ex_scanned), 2816 atomic_read(&sbi->s_bal_goals), 2817 atomic_read(&sbi->s_bal_2orders), 2818 atomic_read(&sbi->s_bal_breaks), 2819 atomic_read(&sbi->s_mb_lost_chunks)); 2820 printk(KERN_INFO 2821 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2822 sbi->s_mb_buddies_generated++, 2823 sbi->s_mb_generation_time); 2824 printk(KERN_INFO 2825 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2826 atomic_read(&sbi->s_mb_preallocated), 2827 atomic_read(&sbi->s_mb_discarded)); 2828 } 2829 2830 free_percpu(sbi->s_locality_groups); 2831 ext4_mb_history_release(sb); 2832 ext4_mb_destroy_per_dev_proc(sb); 2833 2834 return 0; 2835 } 2836 2837 /* 2838 * This function is called by the jbd2 layer once the commit has finished, 2839 * so we know we can free the blocks that were released with that commit. 
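 * The entries walked below were queued on the transaction's
 * t_private_list when the blocks were freed; each entry is removed from
 * the per-group rb tree and its blocks are returned to the buddy.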
2840 */ 2841 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2842 { 2843 struct super_block *sb = journal->j_private; 2844 struct ext4_buddy e4b; 2845 struct ext4_group_info *db; 2846 int err, count = 0, count2 = 0; 2847 struct ext4_free_data *entry; 2848 ext4_fsblk_t discard_block; 2849 struct list_head *l, *ltmp; 2850 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2852 entry = list_entry(l, struct ext4_free_data, list); 2853 2854 mb_debug("gonna free %u blocks in group %u (0x%p):", 2855 entry->count, entry->group, entry); 2856 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2858 /* we expect to find existing buddy because it's pinned */ 2859 BUG_ON(err != 0); 2860 2861 db = e4b.bd_info; 2862 /* there are blocks to put in buddy to make them really free */ 2863 count += entry->count; 2864 count2++; 2865 ext4_lock_group(sb, entry->group); 2866 /* Take it out of per group rb tree */ 2867 rb_erase(&entry->node, &(db->bb_free_root)); 2868 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2869 2870 if (!db->bb_free_root.rb_node) { 2871 /* No more items in the per group rb tree 2872 * balance refcounts from ext4_mb_free_metadata() 2873 */ 2874 page_cache_release(e4b.bd_buddy_page); 2875 page_cache_release(e4b.bd_bitmap_page); 2876 } 2877 ext4_unlock_group(sb, entry->group); 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2879 + entry->start_blk 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", 2882 sb->s_id, (unsigned long long) discard_block, 2883 entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count); 2885 2886 kmem_cache_free(ext4_free_ext_cachep, entry); 2887 ext4_mb_release_desc(&e4b); 2888 } 2889 2890 mb_debug("freed %u blocks in %u structures\n", count, count2); 2891 } 2892 2893 #define EXT4_MB_STATS_NAME "stats" 2894 #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" 2895 #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" 2896 #define EXT4_MB_ORDER2_REQ "order2_req" 2897 #define EXT4_MB_STREAM_REQ "stream_req" 2898 #define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2899 2900 static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2901 { 2902 #ifdef CONFIG_PROC_FS 2903 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2904 struct ext4_sb_info *sbi = EXT4_SB(sb); 2905 struct proc_dir_entry *proc; 2906 2907 if (sbi->s_proc == NULL) 2908 return -EINVAL; 2909 2910 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); 2911 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); 2912 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); 2913 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); 2914 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); 2915 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); 2916 return 0; 2917 2918 err_out: 2919 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); 2920 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); 2921 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); 2922 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); 2923 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2924 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2925 return -ENOMEM; 2926 #else 2927 return 0; 2928 #endif 2929 } 2930 2931 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2932 { 2933 #ifdef CONFIG_PROC_FS 2934 struct ext4_sb_info *sbi = EXT4_SB(sb); 2935 2936 if (sbi->s_proc == NULL) 2937 return -EINVAL; 2938 2939 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, 
sbi->s_proc); 2940 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); 2941 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); 2942 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); 2943 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); 2944 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); 2945 #endif 2946 return 0; 2947 } 2948 2949 int __init init_ext4_mballoc(void) 2950 { 2951 ext4_pspace_cachep = 2952 kmem_cache_create("ext4_prealloc_space", 2953 sizeof(struct ext4_prealloc_space), 2954 0, SLAB_RECLAIM_ACCOUNT, NULL); 2955 if (ext4_pspace_cachep == NULL) 2956 return -ENOMEM; 2957 2958 ext4_ac_cachep = 2959 kmem_cache_create("ext4_alloc_context", 2960 sizeof(struct ext4_allocation_context), 2961 0, SLAB_RECLAIM_ACCOUNT, NULL); 2962 if (ext4_ac_cachep == NULL) { 2963 kmem_cache_destroy(ext4_pspace_cachep); 2964 return -ENOMEM; 2965 } 2966 2967 ext4_free_ext_cachep = 2968 kmem_cache_create("ext4_free_block_extents", 2969 sizeof(struct ext4_free_data), 2970 0, SLAB_RECLAIM_ACCOUNT, NULL); 2971 if (ext4_free_ext_cachep == NULL) { 2972 kmem_cache_destroy(ext4_pspace_cachep); 2973 kmem_cache_destroy(ext4_ac_cachep); 2974 return -ENOMEM; 2975 } 2976 return 0; 2977 } 2978 2979 void exit_ext4_mballoc(void) 2980 { 2981 /* XXX: synchronize_rcu(); */ 2982 kmem_cache_destroy(ext4_pspace_cachep); 2983 kmem_cache_destroy(ext4_ac_cachep); 2984 kmem_cache_destroy(ext4_free_ext_cachep); 2985 } 2986 2987 2988 /* 2989 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps 2990 * Returns 0 if success or error code 2991 */ 2992 static noinline_for_stack int 2993 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2994 handle_t *handle, unsigned int reserv_blks) 2995 { 2996 struct buffer_head *bitmap_bh = NULL; 2997 struct ext4_super_block *es; 2998 struct ext4_group_desc *gdp; 2999 struct buffer_head *gdp_bh; 3000 struct ext4_sb_info *sbi; 3001 struct super_block *sb; 3002 ext4_fsblk_t block; 3003 int err, len; 3004 3005 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3006 BUG_ON(ac->ac_b_ex.fe_len <= 0); 3007 3008 sb = ac->ac_sb; 3009 sbi = EXT4_SB(sb); 3010 es = sbi->s_es; 3011 3012 3013 err = -EIO; 3014 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 3015 if (!bitmap_bh) 3016 goto out_err; 3017 3018 err = ext4_journal_get_write_access(handle, bitmap_bh); 3019 if (err) 3020 goto out_err; 3021 3022 err = -EIO; 3023 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 3024 if (!gdp) 3025 goto out_err; 3026 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 3028 ext4_free_blks_count(sb, gdp)); 3029 3030 err = ext4_journal_get_write_access(handle, gdp_bh); 3031 if (err) 3032 goto out_err; 3033 3034 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 3035 + ac->ac_b_ex.fe_start 3036 + le32_to_cpu(es->s_first_data_block); 3037 3038 len = ac->ac_b_ex.fe_len; 3039 if (in_range(ext4_block_bitmap(sb, gdp), block, len) || 3040 in_range(ext4_inode_bitmap(sb, gdp), block, len) || 3041 in_range(block, ext4_inode_table(sb, gdp), 3042 EXT4_SB(sb)->s_itb_per_group) || 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3044 EXT4_SB(sb)->s_itb_per_group)) { 3045 ext4_error(sb, __func__, 3046 "Allocating block %llu in system zone of %d group\n", 3047 block, ac->ac_b_ex.fe_group); 3048 /* File system mounted not to panic on error 3049 * Fix the bitmap and repeat the block allocation 3050 * We leak some of the blocks here. 
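	 * The found range is simply marked in-use in the on-disk bitmap
	 * and -EAGAIN is returned so that the allocation is retried
	 * elsewhere.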
3051 	 */
3052 		mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
3053 				bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3054 				ac->ac_b_ex.fe_len);
3055 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3056 		if (!err)
3057 			err = -EAGAIN;
3058 		goto out_err;
3059 	}
3060 #ifdef AGGRESSIVE_CHECK
3061 	{
3062 		int i;
3063 		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3064 			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3065 						bitmap_bh->b_data));
3066 		}
3067 	}
3068 #endif
3069 	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 	mb_set_bits(NULL, bitmap_bh->b_data,
3071 			ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3072 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3073 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3074 		ext4_free_blks_set(sb, gdp,
3075 					ext4_free_blocks_after_init(sb,
3076 					ac->ac_b_ex.fe_group, gdp));
3077 	}
3078 	len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 	ext4_free_blks_set(sb, gdp, len);
3080 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3081 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3082 	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3083 	/*
3084 	 * Now reduce the dirty block count as well; it should not go negative
3085 	 */
3086 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3087 		/* release all the reserved blocks if not delalloc */
3088 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
3089 	else
3090 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
3091 						ac->ac_b_ex.fe_len);
3092
3093 	if (sbi->s_log_groups_per_flex) {
3094 		ext4_group_t flex_group = ext4_flex_group(sbi,
3095 							  ac->ac_b_ex.fe_group);
3096 		spin_lock(sb_bgl_lock(sbi, flex_group));
3097 		sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
3098 		spin_unlock(sb_bgl_lock(sbi, flex_group));
3099 	}
3100
3101 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3102 	if (err)
3103 		goto out_err;
3104 	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3105
3106 out_err:
3107 	sb->s_dirt = 1;
3108 	brelse(bitmap_bh);
3109 	return err;
3110 }
3111
3112 /*
3113  * here we normalize the request for a locality group.
3114  * Group requests are normalized to the s_stripe size if it was set via
3115  * the mount option. If not, we set it to s_mb_group_prealloc, which can
3116  * be configured via /proc/fs/ext4/<partition>/group_prealloc
3117  *
3118  * XXX: should we try to preallocate more than the group has now?
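 * e.g. mounting with -o stripe=64 turns every locality group request
 * into a 64-block goal; without a stripe the goal becomes
 * s_mb_group_prealloc.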
3119 */ 3120 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 3121 { 3122 struct super_block *sb = ac->ac_sb; 3123 struct ext4_locality_group *lg = ac->ac_lg; 3124 3125 BUG_ON(lg == NULL); 3126 if (EXT4_SB(sb)->s_stripe) 3127 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3128 else 3129 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3130 mb_debug("#%u: goal %u blocks for locality group\n", 3131 current->pid, ac->ac_g_ex.fe_len); 3132 } 3133 3134 /* 3135 * Normalization means making request better in terms of 3136 * size and alignment 3137 */ 3138 static noinline_for_stack void 3139 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 3140 struct ext4_allocation_request *ar) 3141 { 3142 int bsbits, max; 3143 ext4_lblk_t end; 3144 loff_t size, orig_size, start_off; 3145 ext4_lblk_t start, orig_start; 3146 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3147 struct ext4_prealloc_space *pa; 3148 3149 /* do normalize only data requests, metadata requests 3150 do not need preallocation */ 3151 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3152 return; 3153 3154 /* sometime caller may want exact blocks */ 3155 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3156 return; 3157 3158 /* caller may indicate that preallocation isn't 3159 * required (it's a tail, for example) */ 3160 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 3161 return; 3162 3163 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 3164 ext4_mb_normalize_group_request(ac); 3165 return ; 3166 } 3167 3168 bsbits = ac->ac_sb->s_blocksize_bits; 3169 3170 /* first, let's learn actual file size 3171 * given current request is allocated */ 3172 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3173 size = size << bsbits; 3174 if (size < i_size_read(ac->ac_inode)) 3175 size = i_size_read(ac->ac_inode); 3176 3177 /* max size of free chunks */ 3178 max = 2 << bsbits; 3179 3180 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 3181 (req <= (size) || max <= (chunk_size)) 3182 3183 /* first, try to predict filesize */ 3184 /* XXX: should this table be tunable? 
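	 * e.g. a write that would leave the file at 100KB gets a 128KB goal
	 * below, while requests that fall through to the final else simply
	 * keep the original request length.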
*/ 3185 start_off = 0; 3186 if (size <= 16 * 1024) { 3187 size = 16 * 1024; 3188 } else if (size <= 32 * 1024) { 3189 size = 32 * 1024; 3190 } else if (size <= 64 * 1024) { 3191 size = 64 * 1024; 3192 } else if (size <= 128 * 1024) { 3193 size = 128 * 1024; 3194 } else if (size <= 256 * 1024) { 3195 size = 256 * 1024; 3196 } else if (size <= 512 * 1024) { 3197 size = 512 * 1024; 3198 } else if (size <= 1024 * 1024) { 3199 size = 1024 * 1024; 3200 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 3201 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3202 (21 - bsbits)) << 21; 3203 size = 2 * 1024 * 1024; 3204 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 3205 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3206 (22 - bsbits)) << 22; 3207 size = 4 * 1024 * 1024; 3208 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 3209 (8<<20)>>bsbits, max, 8 * 1024)) { 3210 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3211 (23 - bsbits)) << 23; 3212 size = 8 * 1024 * 1024; 3213 } else { 3214 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3215 size = ac->ac_o_ex.fe_len << bsbits; 3216 } 3217 orig_size = size = size >> bsbits; 3218 orig_start = start = start_off >> bsbits; 3219 3220 /* don't cover already allocated blocks in selected range */ 3221 if (ar->pleft && start <= ar->lleft) { 3222 size -= ar->lleft + 1 - start; 3223 start = ar->lleft + 1; 3224 } 3225 if (ar->pright && start + size - 1 >= ar->lright) 3226 size -= start + size - ar->lright; 3227 3228 end = start + size; 3229 3230 /* check we don't cross already preallocated blocks */ 3231 rcu_read_lock(); 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3233 ext4_lblk_t pa_end; 3234 3235 if (pa->pa_deleted) 3236 continue; 3237 spin_lock(&pa->pa_lock); 3238 if (pa->pa_deleted) { 3239 spin_unlock(&pa->pa_lock); 3240 continue; 3241 } 3242 3243 pa_end = pa->pa_lstart + pa->pa_len; 3244 3245 /* PA must not overlap original request */ 3246 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3247 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3248 3249 /* skip PA normalized request doesn't overlap with */ 3250 if (pa->pa_lstart >= end) { 3251 spin_unlock(&pa->pa_lock); 3252 continue; 3253 } 3254 if (pa_end <= start) { 3255 spin_unlock(&pa->pa_lock); 3256 continue; 3257 } 3258 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3259 3260 if (pa_end <= ac->ac_o_ex.fe_logical) { 3261 BUG_ON(pa_end < start); 3262 start = pa_end; 3263 } 3264 3265 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3266 BUG_ON(pa->pa_lstart > end); 3267 end = pa->pa_lstart; 3268 } 3269 spin_unlock(&pa->pa_lock); 3270 } 3271 rcu_read_unlock(); 3272 size = end - start; 3273 3274 /* XXX: extra loop to check we really don't overlap preallocations */ 3275 rcu_read_lock(); 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3277 ext4_lblk_t pa_end; 3278 spin_lock(&pa->pa_lock); 3279 if (pa->pa_deleted == 0) { 3280 pa_end = pa->pa_lstart + pa->pa_len; 3281 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3282 } 3283 spin_unlock(&pa->pa_lock); 3284 } 3285 rcu_read_unlock(); 3286 3287 if (start + size <= ac->ac_o_ex.fe_logical && 3288 start > ac->ac_o_ex.fe_logical) { 3289 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3290 (unsigned long) start, (unsigned long) size, 3291 (unsigned long) ac->ac_o_ex.fe_logical); 3292 } 3293 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3294 start > ac->ac_o_ex.fe_logical); 3295 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3296 3297 /* now prepare goal request 
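	 * (the start/size computed above become ac_g_ex; the pleft/pright
	 * hints below try to place the goal so it can merge with the
	 * neighbouring on-disk extents)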
*/ 3298 3299 /* XXX: is it better to align blocks WRT to logical 3300 * placement or satisfy big request as is */ 3301 ac->ac_g_ex.fe_logical = start; 3302 ac->ac_g_ex.fe_len = size; 3303 3304 /* define goal start in order to merge */ 3305 if (ar->pright && (ar->lright == (start + size))) { 3306 /* merge to the right */ 3307 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3308 &ac->ac_f_ex.fe_group, 3309 &ac->ac_f_ex.fe_start); 3310 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3311 } 3312 if (ar->pleft && (ar->lleft + 1 == start)) { 3313 /* merge to the left */ 3314 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3315 &ac->ac_f_ex.fe_group, 3316 &ac->ac_f_ex.fe_start); 3317 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3318 } 3319 3320 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3321 (unsigned) orig_size, (unsigned) start); 3322 } 3323 3324 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3325 { 3326 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3327 3328 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3329 atomic_inc(&sbi->s_bal_reqs); 3330 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3331 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3332 atomic_inc(&sbi->s_bal_success); 3333 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3334 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3335 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3336 atomic_inc(&sbi->s_bal_goals); 3337 if (ac->ac_found > sbi->s_mb_max_to_scan) 3338 atomic_inc(&sbi->s_bal_breaks); 3339 } 3340 3341 ext4_mb_store_history(ac); 3342 } 3343 3344 /* 3345 * use blocks preallocated to inode 3346 */ 3347 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3348 struct ext4_prealloc_space *pa) 3349 { 3350 ext4_fsblk_t start; 3351 ext4_fsblk_t end; 3352 int len; 3353 3354 /* found preallocated blocks, use them */ 3355 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3356 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3357 len = end - start; 3358 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3359 &ac->ac_b_ex.fe_start); 3360 ac->ac_b_ex.fe_len = len; 3361 ac->ac_status = AC_STATUS_FOUND; 3362 ac->ac_pa = pa; 3363 3364 BUG_ON(start < pa->pa_pstart); 3365 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3366 BUG_ON(pa->pa_free < len); 3367 pa->pa_free -= len; 3368 3369 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3370 } 3371 3372 /* 3373 * use blocks preallocated to locality group 3374 */ 3375 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3376 struct ext4_prealloc_space *pa) 3377 { 3378 unsigned int len = ac->ac_o_ex.fe_len; 3379 3380 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3381 &ac->ac_b_ex.fe_group, 3382 &ac->ac_b_ex.fe_start); 3383 ac->ac_b_ex.fe_len = len; 3384 ac->ac_status = AC_STATUS_FOUND; 3385 ac->ac_pa = pa; 3386 3387 /* we don't correct pa_pstart or pa_plen here to avoid 3388 * possible race when the group is being loaded concurrently 3389 * instead we correct pa later, after blocks are marked 3390 * in on-disk bitmap -- see ext4_mb_release_context() 3391 * Other CPUs are prevented from allocating from this pa by lg_mutex 3392 */ 3393 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3394 } 3395 3396 /* 3397 * Return the prealloc space that have minimal distance 3398 * from the goal block. @cpa is the prealloc 3399 * space that is having currently known minimal distance 3400 * from the goal block. 
3401 */ 3402 static struct ext4_prealloc_space * 3403 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 3404 struct ext4_prealloc_space *pa, 3405 struct ext4_prealloc_space *cpa) 3406 { 3407 ext4_fsblk_t cur_distance, new_distance; 3408 3409 if (cpa == NULL) { 3410 atomic_inc(&pa->pa_count); 3411 return pa; 3412 } 3413 cur_distance = abs(goal_block - cpa->pa_pstart); 3414 new_distance = abs(goal_block - pa->pa_pstart); 3415 3416 if (cur_distance < new_distance) 3417 return cpa; 3418 3419 /* drop the previous reference */ 3420 atomic_dec(&cpa->pa_count); 3421 atomic_inc(&pa->pa_count); 3422 return pa; 3423 } 3424 3425 /* 3426 * search goal blocks in preallocated space 3427 */ 3428 static noinline_for_stack int 3429 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3430 { 3431 int order, i; 3432 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3433 struct ext4_locality_group *lg; 3434 struct ext4_prealloc_space *pa, *cpa = NULL; 3435 ext4_fsblk_t goal_block; 3436 3437 /* only data can be preallocated */ 3438 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3439 return 0; 3440 3441 /* first, try per-file preallocation */ 3442 rcu_read_lock(); 3443 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3444 3445 /* all fields in this condition don't change, 3446 * so we can skip locking for them */ 3447 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3448 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3449 continue; 3450 3451 /* found preallocated blocks, use them */ 3452 spin_lock(&pa->pa_lock); 3453 if (pa->pa_deleted == 0 && pa->pa_free) { 3454 atomic_inc(&pa->pa_count); 3455 ext4_mb_use_inode_pa(ac, pa); 3456 spin_unlock(&pa->pa_lock); 3457 ac->ac_criteria = 10; 3458 rcu_read_unlock(); 3459 return 1; 3460 } 3461 spin_unlock(&pa->pa_lock); 3462 } 3463 rcu_read_unlock(); 3464 3465 /* can we use group allocation? */ 3466 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3467 return 0; 3468 3469 /* inode may have no locality group for some reason */ 3470 lg = ac->ac_lg; 3471 if (lg == NULL) 3472 return 0; 3473 order = fls(ac->ac_o_ex.fe_len) - 1; 3474 if (order > PREALLOC_TB_SIZE - 1) 3475 /* The max size of hash table is PREALLOC_TB_SIZE */ 3476 order = PREALLOC_TB_SIZE - 1; 3477 3478 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3479 ac->ac_g_ex.fe_start + 3480 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); 3481 /* 3482 * search for the prealloc space that is having 3483 * minimal distance from the goal block. 3484 */ 3485 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3486 rcu_read_lock(); 3487 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3488 pa_inode_list) { 3489 spin_lock(&pa->pa_lock); 3490 if (pa->pa_deleted == 0 && 3491 pa->pa_free >= ac->ac_o_ex.fe_len) { 3492 3493 cpa = ext4_mb_check_group_pa(goal_block, 3494 pa, cpa); 3495 } 3496 spin_unlock(&pa->pa_lock); 3497 } 3498 rcu_read_unlock(); 3499 } 3500 if (cpa) { 3501 ext4_mb_use_group_pa(ac, cpa); 3502 ac->ac_criteria = 20; 3503 return 1; 3504 } 3505 return 0; 3506 } 3507 3508 /* 3509 * the function goes through all block freed in the group 3510 * but not yet committed and marks them used in in-core bitmap. 
3511 * buddy must be generated from this bitmap 3512 * Need to be called with ext4 group lock (ext4_lock_group) 3513 */ 3514 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3515 ext4_group_t group) 3516 { 3517 struct rb_node *n; 3518 struct ext4_group_info *grp; 3519 struct ext4_free_data *entry; 3520 3521 grp = ext4_get_group_info(sb, group); 3522 n = rb_first(&(grp->bb_free_root)); 3523 3524 while (n) { 3525 entry = rb_entry(n, struct ext4_free_data, node); 3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3527 bitmap, entry->start_blk, 3528 entry->count); 3529 n = rb_next(n); 3530 } 3531 return; 3532 } 3533 3534 /* 3535 * the function goes through all preallocation in this group and marks them 3536 * used in in-core bitmap. buddy must be generated from this bitmap 3537 * Need to be called with ext4 group lock (ext4_lock_group) 3538 */ 3539 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3540 ext4_group_t group) 3541 { 3542 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3543 struct ext4_prealloc_space *pa; 3544 struct list_head *cur; 3545 ext4_group_t groupnr; 3546 ext4_grpblk_t start; 3547 int preallocated = 0; 3548 int count = 0; 3549 int len; 3550 3551 /* all form of preallocation discards first load group, 3552 * so the only competing code is preallocation use. 3553 * we don't need any locking here 3554 * notice we do NOT ignore preallocations with pa_deleted 3555 * otherwise we could leave used blocks available for 3556 * allocation in buddy when concurrent ext4_mb_put_pa() 3557 * is dropping preallocation 3558 */ 3559 list_for_each(cur, &grp->bb_prealloc_list) { 3560 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3561 spin_lock(&pa->pa_lock); 3562 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3563 &groupnr, &start); 3564 len = pa->pa_len; 3565 spin_unlock(&pa->pa_lock); 3566 if (unlikely(len == 0)) 3567 continue; 3568 BUG_ON(groupnr != group); 3569 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3570 bitmap, start, len); 3571 preallocated += len; 3572 count++; 3573 } 3574 mb_debug("prellocated %u for group %u\n", preallocated, group); 3575 } 3576 3577 static void ext4_mb_pa_callback(struct rcu_head *head) 3578 { 3579 struct ext4_prealloc_space *pa; 3580 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3581 kmem_cache_free(ext4_pspace_cachep, pa); 3582 } 3583 3584 /* 3585 * drops a reference to preallocated space descriptor 3586 * if this was the last reference and the space is consumed 3587 */ 3588 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3589 struct super_block *sb, struct ext4_prealloc_space *pa) 3590 { 3591 ext4_group_t grp; 3592 ext4_fsblk_t grp_blk; 3593 3594 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3595 return; 3596 3597 /* in this short window concurrent discard can set pa_deleted */ 3598 spin_lock(&pa->pa_lock); 3599 if (pa->pa_deleted == 1) { 3600 spin_unlock(&pa->pa_lock); 3601 return; 3602 } 3603 3604 pa->pa_deleted = 1; 3605 spin_unlock(&pa->pa_lock); 3606 3607 grp_blk = pa->pa_pstart; 3608 /* If linear, pa_pstart may be in the next group when pa is used up */ 3609 if (pa->pa_linear) 3610 grp_blk--; 3611 3612 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3613 3614 /* 3615 * possible race: 3616 * 3617 * P1 (buddy init) P2 (regular allocation) 3618 * find block B in PA 3619 * copy on-disk bitmap to buddy 3620 * mark B in on-disk bitmap 3621 * drop PA from group 3622 * mark all PAs in buddy 3623 * 3624 * thus, P1 
initializes buddy with B available. to prevent this 3625 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3626 * against that pair 3627 */ 3628 ext4_lock_group(sb, grp); 3629 list_del(&pa->pa_group_list); 3630 ext4_unlock_group(sb, grp); 3631 3632 spin_lock(pa->pa_obj_lock); 3633 list_del_rcu(&pa->pa_inode_list); 3634 spin_unlock(pa->pa_obj_lock); 3635 3636 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3637 } 3638 3639 /* 3640 * creates new preallocated space for given inode 3641 */ 3642 static noinline_for_stack int 3643 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3644 { 3645 struct super_block *sb = ac->ac_sb; 3646 struct ext4_prealloc_space *pa; 3647 struct ext4_group_info *grp; 3648 struct ext4_inode_info *ei; 3649 3650 /* preallocate only when found space is larger then requested */ 3651 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3652 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3653 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3654 3655 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3656 if (pa == NULL) 3657 return -ENOMEM; 3658 3659 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3660 int winl; 3661 int wins; 3662 int win; 3663 int offs; 3664 3665 /* we can't allocate as much as normalizer wants. 3666 * so, found space must get proper lstart 3667 * to cover original request */ 3668 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3669 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3670 3671 /* we're limited by original request in that 3672 * logical block must be covered any way 3673 * winl is window we can move our chunk within */ 3674 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3675 3676 /* also, we should cover whole original request */ 3677 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3678 3679 /* the smallest one defines real window */ 3680 win = min(winl, wins); 3681 3682 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3683 if (offs && offs < win) 3684 win = offs; 3685 3686 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3687 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3688 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3689 } 3690 3691 /* preallocation can change ac_b_ex, thus we store actually 3692 * allocated blocks for history */ 3693 ac->ac_f_ex = ac->ac_b_ex; 3694 3695 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3696 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3697 pa->pa_len = ac->ac_b_ex.fe_len; 3698 pa->pa_free = pa->pa_len; 3699 atomic_set(&pa->pa_count, 1); 3700 spin_lock_init(&pa->pa_lock); 3701 INIT_LIST_HEAD(&pa->pa_inode_list); 3702 INIT_LIST_HEAD(&pa->pa_group_list); 3703 pa->pa_deleted = 0; 3704 pa->pa_linear = 0; 3705 3706 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3707 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3708 trace_mark(ext4_mb_new_inode_pa, 3709 "dev %s ino %lu pstart %llu len %u lstart %u", 3710 sb->s_id, ac->ac_inode->i_ino, 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3712 3713 ext4_mb_use_inode_pa(ac, pa); 3714 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3715 3716 ei = EXT4_I(ac->ac_inode); 3717 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3718 3719 pa->pa_obj_lock = &ei->i_prealloc_lock; 3720 pa->pa_inode = ac->ac_inode; 3721 3722 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3723 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3724 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3725 3726 spin_lock(pa->pa_obj_lock); 3727 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3728 spin_unlock(pa->pa_obj_lock); 3729 
3730 return 0; 3731 } 3732 3733 /* 3734 * creates new preallocated space for locality group inodes belongs to 3735 */ 3736 static noinline_for_stack int 3737 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3738 { 3739 struct super_block *sb = ac->ac_sb; 3740 struct ext4_locality_group *lg; 3741 struct ext4_prealloc_space *pa; 3742 struct ext4_group_info *grp; 3743 3744 /* preallocate only when found space is larger then requested */ 3745 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3746 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3747 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3748 3749 BUG_ON(ext4_pspace_cachep == NULL); 3750 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3751 if (pa == NULL) 3752 return -ENOMEM; 3753 3754 /* preallocation can change ac_b_ex, thus we store actually 3755 * allocated blocks for history */ 3756 ac->ac_f_ex = ac->ac_b_ex; 3757 3758 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3759 pa->pa_lstart = pa->pa_pstart; 3760 pa->pa_len = ac->ac_b_ex.fe_len; 3761 pa->pa_free = pa->pa_len; 3762 atomic_set(&pa->pa_count, 1); 3763 spin_lock_init(&pa->pa_lock); 3764 INIT_LIST_HEAD(&pa->pa_inode_list); 3765 INIT_LIST_HEAD(&pa->pa_group_list); 3766 pa->pa_deleted = 0; 3767 pa->pa_linear = 1; 3768 3769 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3770 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3771 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", 3772 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3773 3774 ext4_mb_use_group_pa(ac, pa); 3775 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3776 3777 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3778 lg = ac->ac_lg; 3779 BUG_ON(lg == NULL); 3780 3781 pa->pa_obj_lock = &lg->lg_prealloc_lock; 3782 pa->pa_inode = NULL; 3783 3784 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3785 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3786 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3787 3788 /* 3789 * We will later add the new pa to the right bucket 3790 * after updating the pa_free in ext4_mb_release_context 3791 */ 3792 return 0; 3793 } 3794 3795 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3796 { 3797 int err; 3798 3799 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3800 err = ext4_mb_new_group_pa(ac); 3801 else 3802 err = ext4_mb_new_inode_pa(ac); 3803 return err; 3804 } 3805 3806 /* 3807 * finds all unused blocks in on-disk bitmap, frees them in 3808 * in-core bitmap and buddy. 3809 * @pa must be unlinked from inode and group lists, so that 3810 * nobody else can find/use it. 3811 * the caller MUST hold group/inode locks. 
3812 * TODO: optimize the case when there are no in-core structures yet 3813 */ 3814 static noinline_for_stack int 3815 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3816 struct ext4_prealloc_space *pa, 3817 struct ext4_allocation_context *ac) 3818 { 3819 struct super_block *sb = e4b->bd_sb; 3820 struct ext4_sb_info *sbi = EXT4_SB(sb); 3821 unsigned int end; 3822 unsigned int next; 3823 ext4_group_t group; 3824 ext4_grpblk_t bit; 3825 unsigned long long grp_blk_start; 3826 sector_t start; 3827 int err = 0; 3828 int free = 0; 3829 3830 BUG_ON(pa->pa_deleted == 0); 3831 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3832 grp_blk_start = pa->pa_pstart - bit; 3833 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3834 end = bit + pa->pa_len; 3835 3836 if (ac) { 3837 ac->ac_sb = sb; 3838 ac->ac_inode = pa->pa_inode; 3839 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3840 } 3841 3842 while (bit < end) { 3843 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3844 if (bit >= end) 3845 break; 3846 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3847 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3848 le32_to_cpu(sbi->s_es->s_first_data_block); 3849 mb_debug(" free preallocated %u/%u in group %u\n", 3850 (unsigned) start, (unsigned) next - bit, 3851 (unsigned) group); 3852 free += next - bit; 3853 3854 if (ac) { 3855 ac->ac_b_ex.fe_group = group; 3856 ac->ac_b_ex.fe_start = bit; 3857 ac->ac_b_ex.fe_len = next - bit; 3858 ac->ac_b_ex.fe_logical = 0; 3859 ext4_mb_store_history(ac); 3860 } 3861 3862 trace_mark(ext4_mb_release_inode_pa, 3863 "dev %s ino %lu block %llu count %u", 3864 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, 3865 next - bit); 3866 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3867 bit = next + 1; 3868 } 3869 if (free != pa->pa_free) { 3870 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3871 pa, (unsigned long) pa->pa_lstart, 3872 (unsigned long) pa->pa_pstart, 3873 (unsigned long) pa->pa_len); 3874 ext4_grp_locked_error(sb, group, 3875 __func__, "free %u, pa_free %u", 3876 free, pa->pa_free); 3877 /* 3878 * pa is already deleted so we use the value obtained 3879 * from the bitmap and continue. 
3880 */ 3881 } 3882 atomic_add(free, &sbi->s_mb_discarded); 3883 3884 return err; 3885 } 3886 3887 static noinline_for_stack int 3888 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3889 struct ext4_prealloc_space *pa, 3890 struct ext4_allocation_context *ac) 3891 { 3892 struct super_block *sb = e4b->bd_sb; 3893 ext4_group_t group; 3894 ext4_grpblk_t bit; 3895 3896 if (ac) 3897 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3898 3899 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", 3900 sb->s_id, pa->pa_pstart, pa->pa_len); 3901 BUG_ON(pa->pa_deleted == 0); 3902 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3903 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3904 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3905 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3906 3907 if (ac) { 3908 ac->ac_sb = sb; 3909 ac->ac_inode = NULL; 3910 ac->ac_b_ex.fe_group = group; 3911 ac->ac_b_ex.fe_start = bit; 3912 ac->ac_b_ex.fe_len = pa->pa_len; 3913 ac->ac_b_ex.fe_logical = 0; 3914 ext4_mb_store_history(ac); 3915 } 3916 3917 return 0; 3918 } 3919 3920 /* 3921 * releases all preallocations in given group 3922 * 3923 * first, we need to decide discard policy: 3924 * - when do we discard 3925 * 1) ENOSPC 3926 * - how many do we discard 3927 * 1) how many requested 3928 */ 3929 static noinline_for_stack int 3930 ext4_mb_discard_group_preallocations(struct super_block *sb, 3931 ext4_group_t group, int needed) 3932 { 3933 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3934 struct buffer_head *bitmap_bh = NULL; 3935 struct ext4_prealloc_space *pa, *tmp; 3936 struct ext4_allocation_context *ac; 3937 struct list_head list; 3938 struct ext4_buddy e4b; 3939 int err; 3940 int busy = 0; 3941 int free = 0; 3942 3943 mb_debug("discard preallocation for group %u\n", group); 3944 3945 if (list_empty(&grp->bb_prealloc_list)) 3946 return 0; 3947 3948 bitmap_bh = ext4_read_block_bitmap(sb, group); 3949 if (bitmap_bh == NULL) { 3950 ext4_error(sb, __func__, "Error in reading block " 3951 "bitmap for %u", group); 3952 return 0; 3953 } 3954 3955 err = ext4_mb_load_buddy(sb, group, &e4b); 3956 if (err) { 3957 ext4_error(sb, __func__, "Error in loading buddy " 3958 "information for %u", group); 3959 put_bh(bitmap_bh); 3960 return 0; 3961 } 3962 3963 if (needed == 0) 3964 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3965 3966 INIT_LIST_HEAD(&list); 3967 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3968 repeat: 3969 ext4_lock_group(sb, group); 3970 list_for_each_entry_safe(pa, tmp, 3971 &grp->bb_prealloc_list, pa_group_list) { 3972 spin_lock(&pa->pa_lock); 3973 if (atomic_read(&pa->pa_count)) { 3974 spin_unlock(&pa->pa_lock); 3975 busy = 1; 3976 continue; 3977 } 3978 if (pa->pa_deleted) { 3979 spin_unlock(&pa->pa_lock); 3980 continue; 3981 } 3982 3983 /* seems this one can be freed ... */ 3984 pa->pa_deleted = 1; 3985 3986 /* we can trust pa_free ... */ 3987 free += pa->pa_free; 3988 3989 spin_unlock(&pa->pa_lock); 3990 3991 list_del(&pa->pa_group_list); 3992 list_add(&pa->u.pa_tmp_list, &list); 3993 } 3994 3995 /* if we still need more blocks and some PAs were used, try again */ 3996 if (free < needed && busy) { 3997 busy = 0; 3998 ext4_unlock_group(sb, group); 3999 /* 4000 * Yield the CPU here so that we don't get soft lockup 4001 * in non preempt case. 4002 */ 4003 yield(); 4004 goto repeat; 4005 } 4006 4007 /* found anything to free? 
*/ 4008 if (list_empty(&list)) { 4009 BUG_ON(free != 0); 4010 goto out; 4011 } 4012 4013 /* now free all selected PAs */ 4014 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4015 4016 /* remove from object (inode or locality group) */ 4017 spin_lock(pa->pa_obj_lock); 4018 list_del_rcu(&pa->pa_inode_list); 4019 spin_unlock(pa->pa_obj_lock); 4020 4021 if (pa->pa_linear) 4022 ext4_mb_release_group_pa(&e4b, pa, ac); 4023 else 4024 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 4025 4026 list_del(&pa->u.pa_tmp_list); 4027 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4028 } 4029 4030 out: 4031 ext4_unlock_group(sb, group); 4032 if (ac) 4033 kmem_cache_free(ext4_ac_cachep, ac); 4034 ext4_mb_release_desc(&e4b); 4035 put_bh(bitmap_bh); 4036 return free; 4037 } 4038 4039 /* 4040 * releases all non-used preallocated blocks for given inode 4041 * 4042 * It's important to discard preallocations under i_data_sem 4043 * We don't want another block to be served from the prealloc 4044 * space when we are discarding the inode prealloc space. 4045 * 4046 * FIXME!! Make sure it is valid at all the call sites 4047 */ 4048 void ext4_discard_preallocations(struct inode *inode) 4049 { 4050 struct ext4_inode_info *ei = EXT4_I(inode); 4051 struct super_block *sb = inode->i_sb; 4052 struct buffer_head *bitmap_bh = NULL; 4053 struct ext4_prealloc_space *pa, *tmp; 4054 struct ext4_allocation_context *ac; 4055 ext4_group_t group = 0; 4056 struct list_head list; 4057 struct ext4_buddy e4b; 4058 int err; 4059 4060 if (!S_ISREG(inode->i_mode)) { 4061 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 4062 return; 4063 } 4064 4065 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4066 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, 4067 inode->i_ino); 4068 4069 INIT_LIST_HEAD(&list); 4070 4071 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4072 repeat: 4073 /* first, collect all pa's in the inode */ 4074 spin_lock(&ei->i_prealloc_lock); 4075 while (!list_empty(&ei->i_prealloc_list)) { 4076 pa = list_entry(ei->i_prealloc_list.next, 4077 struct ext4_prealloc_space, pa_inode_list); 4078 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 4079 spin_lock(&pa->pa_lock); 4080 if (atomic_read(&pa->pa_count)) { 4081 /* this shouldn't happen often - nobody should 4082 * use preallocation while we're discarding it */ 4083 spin_unlock(&pa->pa_lock); 4084 spin_unlock(&ei->i_prealloc_lock); 4085 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 4086 WARN_ON(1); 4087 schedule_timeout_uninterruptible(HZ); 4088 goto repeat; 4089 4090 } 4091 if (pa->pa_deleted == 0) { 4092 pa->pa_deleted = 1; 4093 spin_unlock(&pa->pa_lock); 4094 list_del_rcu(&pa->pa_inode_list); 4095 list_add(&pa->u.pa_tmp_list, &list); 4096 continue; 4097 } 4098 4099 /* someone is deleting pa right now */ 4100 spin_unlock(&pa->pa_lock); 4101 spin_unlock(&ei->i_prealloc_lock); 4102 4103 /* we have to wait here because pa_deleted 4104 * doesn't mean pa is already unlinked from 4105 * the list. 
as we might be called from 4106 * ->clear_inode() the inode will get freed 4107 * and concurrent thread which is unlinking 4108 * pa from inode's list may access already 4109 * freed memory, bad-bad-bad */ 4110 4111 /* XXX: if this happens too often, we can 4112 * add a flag to force wait only in case 4113 * of ->clear_inode(), but not in case of 4114 * regular truncate */ 4115 schedule_timeout_uninterruptible(HZ); 4116 goto repeat; 4117 } 4118 spin_unlock(&ei->i_prealloc_lock); 4119 4120 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 4121 BUG_ON(pa->pa_linear != 0); 4122 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4123 4124 err = ext4_mb_load_buddy(sb, group, &e4b); 4125 if (err) { 4126 ext4_error(sb, __func__, "Error in loading buddy " 4127 "information for %u", group); 4128 continue; 4129 } 4130 4131 bitmap_bh = ext4_read_block_bitmap(sb, group); 4132 if (bitmap_bh == NULL) { 4133 ext4_error(sb, __func__, "Error in reading block " 4134 "bitmap for %u", group); 4135 ext4_mb_release_desc(&e4b); 4136 continue; 4137 } 4138 4139 ext4_lock_group(sb, group); 4140 list_del(&pa->pa_group_list); 4141 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 4142 ext4_unlock_group(sb, group); 4143 4144 ext4_mb_release_desc(&e4b); 4145 put_bh(bitmap_bh); 4146 4147 list_del(&pa->u.pa_tmp_list); 4148 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4149 } 4150 if (ac) 4151 kmem_cache_free(ext4_ac_cachep, ac); 4152 } 4153 4154 /* 4155 * finds all preallocated spaces and return blocks being freed to them 4156 * if preallocated space becomes full (no block is used from the space) 4157 * then the function frees space in buddy 4158 * XXX: at the moment, truncate (which is the only way to free blocks) 4159 * discards all preallocations 4160 */ 4161 static void ext4_mb_return_to_preallocation(struct inode *inode, 4162 struct ext4_buddy *e4b, 4163 sector_t block, int count) 4164 { 4165 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4166 } 4167 #ifdef MB_DEBUG 4168 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4169 { 4170 struct super_block *sb = ac->ac_sb; 4171 ext4_group_t i; 4172 4173 printk(KERN_ERR "EXT4-fs: Can't allocate:" 4174 " Allocation context details:\n"); 4175 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 4176 ac->ac_status, ac->ac_flags); 4177 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 4178 "best %lu/%lu/%lu@%lu cr %d\n", 4179 (unsigned long)ac->ac_o_ex.fe_group, 4180 (unsigned long)ac->ac_o_ex.fe_start, 4181 (unsigned long)ac->ac_o_ex.fe_len, 4182 (unsigned long)ac->ac_o_ex.fe_logical, 4183 (unsigned long)ac->ac_g_ex.fe_group, 4184 (unsigned long)ac->ac_g_ex.fe_start, 4185 (unsigned long)ac->ac_g_ex.fe_len, 4186 (unsigned long)ac->ac_g_ex.fe_logical, 4187 (unsigned long)ac->ac_b_ex.fe_group, 4188 (unsigned long)ac->ac_b_ex.fe_start, 4189 (unsigned long)ac->ac_b_ex.fe_len, 4190 (unsigned long)ac->ac_b_ex.fe_logical, 4191 (int)ac->ac_criteria); 4192 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 4193 ac->ac_found); 4194 printk(KERN_ERR "EXT4-fs: groups: \n"); 4195 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 4196 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 4197 struct ext4_prealloc_space *pa; 4198 ext4_grpblk_t start; 4199 struct list_head *cur; 4200 ext4_lock_group(sb, i); 4201 list_for_each(cur, &grp->bb_prealloc_list) { 4202 pa = list_entry(cur, struct ext4_prealloc_space, 4203 pa_group_list); 4204 spin_lock(&pa->pa_lock); 4205 ext4_get_group_no_and_offset(sb, 
pa->pa_pstart, 4206 NULL, &start); 4207 spin_unlock(&pa->pa_lock); 4208 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4209 start, pa->pa_len); 4210 } 4211 ext4_unlock_group(sb, i); 4212 4213 if (grp->bb_free == 0) 4214 continue; 4215 printk(KERN_ERR "%lu: %d/%d \n", 4216 i, grp->bb_free, grp->bb_fragments); 4217 } 4218 printk(KERN_ERR "\n"); 4219 } 4220 #else 4221 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4222 { 4223 return; 4224 } 4225 #endif 4226 4227 /* 4228 * We use locality group preallocation for small size file. The size of the 4229 * file is determined by the current size or the resulting size after 4230 * allocation which ever is larger 4231 * 4232 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4233 */ 4234 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4235 { 4236 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4237 int bsbits = ac->ac_sb->s_blocksize_bits; 4238 loff_t size, isize; 4239 4240 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4241 return; 4242 4243 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4244 isize = i_size_read(ac->ac_inode) >> bsbits; 4245 size = max(size, isize); 4246 4247 /* don't use group allocation for large files */ 4248 if (size >= sbi->s_mb_stream_request) 4249 return; 4250 4251 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4252 return; 4253 4254 BUG_ON(ac->ac_lg != NULL); 4255 /* 4256 * locality group prealloc space are per cpu. The reason for having 4257 * per cpu locality group is to reduce the contention between block 4258 * request from multiple CPUs. 4259 */ 4260 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); 4261 4262 /* we're going to use group allocation */ 4263 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4264 4265 /* serialize all allocations in the group */ 4266 mutex_lock(&ac->ac_lg->lg_mutex); 4267 } 4268 4269 static noinline_for_stack int 4270 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 4271 struct ext4_allocation_request *ar) 4272 { 4273 struct super_block *sb = ar->inode->i_sb; 4274 struct ext4_sb_info *sbi = EXT4_SB(sb); 4275 struct ext4_super_block *es = sbi->s_es; 4276 ext4_group_t group; 4277 unsigned int len; 4278 ext4_fsblk_t goal; 4279 ext4_grpblk_t block; 4280 4281 /* we can't allocate > group size */ 4282 len = ar->len; 4283 4284 /* just a dirty hack to filter too big requests */ 4285 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4286 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4287 4288 /* start searching from the goal */ 4289 goal = ar->goal; 4290 if (goal < le32_to_cpu(es->s_first_data_block) || 4291 goal >= ext4_blocks_count(es)) 4292 goal = le32_to_cpu(es->s_first_data_block); 4293 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4294 4295 /* set up allocation goals */ 4296 ac->ac_b_ex.fe_logical = ar->logical; 4297 ac->ac_b_ex.fe_group = 0; 4298 ac->ac_b_ex.fe_start = 0; 4299 ac->ac_b_ex.fe_len = 0; 4300 ac->ac_status = AC_STATUS_CONTINUE; 4301 ac->ac_groups_scanned = 0; 4302 ac->ac_ex_scanned = 0; 4303 ac->ac_found = 0; 4304 ac->ac_sb = sb; 4305 ac->ac_inode = ar->inode; 4306 ac->ac_o_ex.fe_logical = ar->logical; 4307 ac->ac_o_ex.fe_group = group; 4308 ac->ac_o_ex.fe_start = block; 4309 ac->ac_o_ex.fe_len = len; 4310 ac->ac_g_ex.fe_logical = ar->logical; 4311 ac->ac_g_ex.fe_group = group; 4312 ac->ac_g_ex.fe_start = block; 4313 ac->ac_g_ex.fe_len = len; 4314 ac->ac_f_ex.fe_len = 0; 4315 ac->ac_flags = ar->flags; 4316 ac->ac_2order = 0; 4317 ac->ac_criteria = 0; 4318 ac->ac_pa = NULL; 4319 ac->ac_bitmap_page = 
NULL; 4320 ac->ac_buddy_page = NULL; 4321 ac->alloc_semp = NULL; 4322 ac->ac_lg = NULL; 4323 4324 /* we have to define context: we'll we work with a file or 4325 * locality group. this is a policy, actually */ 4326 ext4_mb_group_or_file(ac); 4327 4328 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4329 "left: %u/%u, right %u/%u to %swritable\n", 4330 (unsigned) ar->len, (unsigned) ar->logical, 4331 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4332 (unsigned) ar->lleft, (unsigned) ar->pleft, 4333 (unsigned) ar->lright, (unsigned) ar->pright, 4334 atomic_read(&ar->inode->i_writecount) ? "" : "non-"); 4335 return 0; 4336 4337 } 4338 4339 static noinline_for_stack void 4340 ext4_mb_discard_lg_preallocations(struct super_block *sb, 4341 struct ext4_locality_group *lg, 4342 int order, int total_entries) 4343 { 4344 ext4_group_t group = 0; 4345 struct ext4_buddy e4b; 4346 struct list_head discard_list; 4347 struct ext4_prealloc_space *pa, *tmp; 4348 struct ext4_allocation_context *ac; 4349 4350 mb_debug("discard locality group preallocation\n"); 4351 4352 INIT_LIST_HEAD(&discard_list); 4353 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4354 4355 spin_lock(&lg->lg_prealloc_lock); 4356 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4357 pa_inode_list) { 4358 spin_lock(&pa->pa_lock); 4359 if (atomic_read(&pa->pa_count)) { 4360 /* 4361 * This is the pa that we just used 4362 * for block allocation. So don't 4363 * free that 4364 */ 4365 spin_unlock(&pa->pa_lock); 4366 continue; 4367 } 4368 if (pa->pa_deleted) { 4369 spin_unlock(&pa->pa_lock); 4370 continue; 4371 } 4372 /* only lg prealloc space */ 4373 BUG_ON(!pa->pa_linear); 4374 4375 /* seems this one can be freed ... */ 4376 pa->pa_deleted = 1; 4377 spin_unlock(&pa->pa_lock); 4378 4379 list_del_rcu(&pa->pa_inode_list); 4380 list_add(&pa->u.pa_tmp_list, &discard_list); 4381 4382 total_entries--; 4383 if (total_entries <= 5) { 4384 /* 4385 * we want to keep only 5 entries 4386 * allowing it to grow to 8. This 4387 * mak sure we don't call discard 4388 * soon for this list. 4389 */ 4390 break; 4391 } 4392 } 4393 spin_unlock(&lg->lg_prealloc_lock); 4394 4395 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4396 4397 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4398 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4399 ext4_error(sb, __func__, "Error in loading buddy " 4400 "information for %u", group); 4401 continue; 4402 } 4403 ext4_lock_group(sb, group); 4404 list_del(&pa->pa_group_list); 4405 ext4_mb_release_group_pa(&e4b, pa, ac); 4406 ext4_unlock_group(sb, group); 4407 4408 ext4_mb_release_desc(&e4b); 4409 list_del(&pa->u.pa_tmp_list); 4410 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4411 } 4412 if (ac) 4413 kmem_cache_free(ext4_ac_cachep, ac); 4414 } 4415 4416 /* 4417 * We have incremented pa_count. So it cannot be freed at this 4418 * point. Also we hold lg_mutex. So no parallel allocation is 4419 * possible from this lg. That means pa_free cannot be updated. 4420 * 4421 * A parallel ext4_mb_discard_group_preallocations is possible. 4422 * which can cause the lg_prealloc_list to be updated. 
4423 */ 4424 4425 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 4426 { 4427 int order, added = 0, lg_prealloc_count = 1; 4428 struct super_block *sb = ac->ac_sb; 4429 struct ext4_locality_group *lg = ac->ac_lg; 4430 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 4431 4432 order = fls(pa->pa_free) - 1; 4433 if (order > PREALLOC_TB_SIZE - 1) 4434 /* The max size of hash table is PREALLOC_TB_SIZE */ 4435 order = PREALLOC_TB_SIZE - 1; 4436 /* Add the prealloc space to lg */ 4437 rcu_read_lock(); 4438 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 4439 pa_inode_list) { 4440 spin_lock(&tmp_pa->pa_lock); 4441 if (tmp_pa->pa_deleted) { 4442 spin_unlock(&tmp_pa->pa_lock); 4443 continue; 4444 } 4445 if (!added && pa->pa_free < tmp_pa->pa_free) { 4446 /* Add to the tail of the previous entry */ 4447 list_add_tail_rcu(&pa->pa_inode_list, 4448 &tmp_pa->pa_inode_list); 4449 added = 1; 4450 /* 4451 * we want to count the total 4452 * number of entries in the list 4453 */ 4454 } 4455 spin_unlock(&tmp_pa->pa_lock); 4456 lg_prealloc_count++; 4457 } 4458 if (!added) 4459 list_add_tail_rcu(&pa->pa_inode_list, 4460 &lg->lg_prealloc_list[order]); 4461 rcu_read_unlock(); 4462 4463 /* Now trim the list to be not more than 8 elements */ 4464 if (lg_prealloc_count > 8) { 4465 ext4_mb_discard_lg_preallocations(sb, lg, 4466 order, lg_prealloc_count); 4467 return; 4468 } 4469 return; 4470 } 4471 4472 /* 4473 * releases all resources used in allocation 4474 */ 4475 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4476 { 4477 struct ext4_prealloc_space *pa = ac->ac_pa; 4478 if (pa) { 4479 if (pa->pa_linear) { 4480 /* see comment in ext4_mb_use_group_pa() */ 4481 spin_lock(&pa->pa_lock); 4482 pa->pa_pstart += ac->ac_b_ex.fe_len; 4483 pa->pa_lstart += ac->ac_b_ex.fe_len; 4484 pa->pa_free -= ac->ac_b_ex.fe_len; 4485 pa->pa_len -= ac->ac_b_ex.fe_len; 4486 spin_unlock(&pa->pa_lock); 4487 } 4488 } 4489 if (ac->alloc_semp) 4490 up_read(ac->alloc_semp); 4491 if (pa) { 4492 /* 4493 * We want to add the pa to the right bucket. 4494 * Remove it from the list and while adding 4495 * make sure the list to which we are adding 4496 * doesn't grow big.
We need to release 4497 * alloc_semp before calling ext4_mb_add_n_trim() 4498 */ 4499 if (pa->pa_linear && likely(pa->pa_free)) { 4500 spin_lock(pa->pa_obj_lock); 4501 list_del_rcu(&pa->pa_inode_list); 4502 spin_unlock(pa->pa_obj_lock); 4503 ext4_mb_add_n_trim(ac); 4504 } 4505 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4506 } 4507 if (ac->ac_bitmap_page) 4508 page_cache_release(ac->ac_bitmap_page); 4509 if (ac->ac_buddy_page) 4510 page_cache_release(ac->ac_buddy_page); 4511 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4512 mutex_unlock(&ac->ac_lg->lg_mutex); 4513 ext4_mb_collect_stats(ac); 4514 return 0; 4515 } 4516 4517 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4518 { 4519 ext4_group_t i; 4520 int ret; 4521 int freed = 0; 4522 4523 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", 4524 sb->s_id, needed); 4525 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4526 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4527 freed += ret; 4528 needed -= ret; 4529 } 4530 4531 return freed; 4532 } 4533 4534 /* 4535 * Main entry point into mballoc to allocate blocks 4536 * it tries to use preallocation first, then falls back 4537 * to usual allocation 4538 */ 4539 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4540 struct ext4_allocation_request *ar, int *errp) 4541 { 4542 int freed; 4543 struct ext4_allocation_context *ac = NULL; 4544 struct ext4_sb_info *sbi; 4545 struct super_block *sb; 4546 ext4_fsblk_t block = 0; 4547 unsigned int inquota; 4548 unsigned int reserv_blks = 0; 4549 4550 sb = ar->inode->i_sb; 4551 sbi = EXT4_SB(sb); 4552 4553 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " 4554 "lblk %llu goal %llu lleft %llu lright %llu " 4555 "pleft %llu pright %llu ", 4556 sb->s_id, ar->flags, ar->len, 4557 ar->inode ? 
ar->inode->i_ino : 0, 4558 (unsigned long long) ar->logical, 4559 (unsigned long long) ar->goal, 4560 (unsigned long long) ar->lleft, 4561 (unsigned long long) ar->lright, 4562 (unsigned long long) ar->pleft, 4563 (unsigned long long) ar->pright); 4564 4565 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4566 /* 4567 * With delalloc we already reserved the blocks 4568 */ 4569 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4570 /* let others to free the space */ 4571 yield(); 4572 ar->len = ar->len >> 1; 4573 } 4574 if (!ar->len) { 4575 *errp = -ENOSPC; 4576 return 0; 4577 } 4578 reserv_blks = ar->len; 4579 } 4580 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4581 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4582 ar->len--; 4583 } 4584 if (ar->len == 0) { 4585 *errp = -EDQUOT; 4586 goto out3; 4587 } 4588 inquota = ar->len; 4589 4590 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4591 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4592 4593 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4594 if (!ac) { 4595 ar->len = 0; 4596 *errp = -ENOMEM; 4597 goto out1; 4598 } 4599 4600 *errp = ext4_mb_initialize_context(ac, ar); 4601 if (*errp) { 4602 ar->len = 0; 4603 goto out2; 4604 } 4605 4606 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4607 if (!ext4_mb_use_preallocated(ac)) { 4608 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4609 ext4_mb_normalize_request(ac, ar); 4610 repeat: 4611 /* allocate space in core */ 4612 ext4_mb_regular_allocator(ac); 4613 4614 /* as we've just preallocated more space than 4615 * user requested orinally, we store allocated 4616 * space in a special descriptor */ 4617 if (ac->ac_status == AC_STATUS_FOUND && 4618 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4619 ext4_mb_new_preallocation(ac); 4620 } 4621 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4622 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4623 if (*errp == -EAGAIN) { 4624 /* 4625 * drop the reference that we took 4626 * in ext4_mb_use_best_found 4627 */ 4628 ext4_mb_release_context(ac); 4629 ac->ac_b_ex.fe_group = 0; 4630 ac->ac_b_ex.fe_start = 0; 4631 ac->ac_b_ex.fe_len = 0; 4632 ac->ac_status = AC_STATUS_CONTINUE; 4633 goto repeat; 4634 } else if (*errp) { 4635 ac->ac_b_ex.fe_len = 0; 4636 ar->len = 0; 4637 ext4_mb_show_ac(ac); 4638 } else { 4639 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4640 ar->len = ac->ac_b_ex.fe_len; 4641 } 4642 } else { 4643 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 4644 if (freed) 4645 goto repeat; 4646 *errp = -ENOSPC; 4647 ac->ac_b_ex.fe_len = 0; 4648 ar->len = 0; 4649 ext4_mb_show_ac(ac); 4650 } 4651 4652 ext4_mb_release_context(ac); 4653 4654 out2: 4655 kmem_cache_free(ext4_ac_cachep, ac); 4656 out1: 4657 if (ar->len < inquota) 4658 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4659 out3: 4660 if (!ar->len) { 4661 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4662 /* release all the reserved blocks if non delalloc */ 4663 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4664 reserv_blks); 4665 } 4666 4667 trace_mark(ext4_allocate_blocks, 4668 "dev %s block %llu flags %u len %u ino %lu " 4669 "logical %llu goal %llu lleft %llu lright %llu " 4670 "pleft %llu pright %llu ", 4671 sb->s_id, (unsigned long long) block, 4672 ar->flags, ar->len, ar->inode ? 
ar->inode->i_ino : 0, 4673 (unsigned long long) ar->logical, 4674 (unsigned long long) ar->goal, 4675 (unsigned long long) ar->lleft, 4676 (unsigned long long) ar->lright, 4677 (unsigned long long) ar->pleft, 4678 (unsigned long long) ar->pright); 4679 4680 return block; 4681 } 4682 4683 /* 4684 * We can merge two free data extents only if the physical blocks 4685 * are contiguous, AND the extents were freed by the same transaction, 4686 * AND the blocks are associated with the same group. 4687 */ 4688 static int can_merge(struct ext4_free_data *entry1, 4689 struct ext4_free_data *entry2) 4690 { 4691 if ((entry1->t_tid == entry2->t_tid) && 4692 (entry1->group == entry2->group) && 4693 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4694 return 1; 4695 return 0; 4696 } 4697 4698 static noinline_for_stack int 4699 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4700 struct ext4_free_data *new_entry) 4701 { 4702 ext4_grpblk_t block; 4703 struct ext4_free_data *entry; 4704 struct ext4_group_info *db = e4b->bd_info; 4705 struct super_block *sb = e4b->bd_sb; 4706 struct ext4_sb_info *sbi = EXT4_SB(sb); 4707 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4708 struct rb_node *parent = NULL, *new_node; 4709 4710 BUG_ON(!ext4_handle_valid(handle)); 4711 BUG_ON(e4b->bd_bitmap_page == NULL); 4712 BUG_ON(e4b->bd_buddy_page == NULL); 4713 4714 new_node = &new_entry->node; 4715 block = new_entry->start_blk; 4716 4717 if (!*n) { 4718 /* first free block exent. We need to 4719 protect buddy cache from being freed, 4720 * otherwise we'll refresh it from 4721 * on-disk bitmap and lose not-yet-available 4722 * blocks */ 4723 page_cache_get(e4b->bd_buddy_page); 4724 page_cache_get(e4b->bd_bitmap_page); 4725 } 4726 while (*n) { 4727 parent = *n; 4728 entry = rb_entry(parent, struct ext4_free_data, node); 4729 if (block < entry->start_blk) 4730 n = &(*n)->rb_left; 4731 else if (block >= (entry->start_blk + entry->count)) 4732 n = &(*n)->rb_right; 4733 else { 4734 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4735 "Double free of blocks %d (%d %d)", 4736 block, entry->start_blk, entry->count); 4737 return 0; 4738 } 4739 } 4740 4741 rb_link_node(new_node, parent, n); 4742 rb_insert_color(new_node, &db->bb_free_root); 4743 4744 /* Now try to see the extent can be merged to left and right */ 4745 node = rb_prev(new_node); 4746 if (node) { 4747 entry = rb_entry(node, struct ext4_free_data, node); 4748 if (can_merge(entry, new_entry)) { 4749 new_entry->start_blk = entry->start_blk; 4750 new_entry->count += entry->count; 4751 rb_erase(node, &(db->bb_free_root)); 4752 spin_lock(&sbi->s_md_lock); 4753 list_del(&entry->list); 4754 spin_unlock(&sbi->s_md_lock); 4755 kmem_cache_free(ext4_free_ext_cachep, entry); 4756 } 4757 } 4758 4759 node = rb_next(new_node); 4760 if (node) { 4761 entry = rb_entry(node, struct ext4_free_data, node); 4762 if (can_merge(new_entry, entry)) { 4763 new_entry->count += entry->count; 4764 rb_erase(node, &(db->bb_free_root)); 4765 spin_lock(&sbi->s_md_lock); 4766 list_del(&entry->list); 4767 spin_unlock(&sbi->s_md_lock); 4768 kmem_cache_free(ext4_free_ext_cachep, entry); 4769 } 4770 } 4771 /* Add the extent to transaction's private list */ 4772 spin_lock(&sbi->s_md_lock); 4773 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4774 spin_unlock(&sbi->s_md_lock); 4775 return 0; 4776 } 4777 4778 /* 4779 * Main entry point into mballoc to free blocks 4780 */ 4781 void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4782 unsigned 
long block, unsigned long count, 4783 int metadata, unsigned long *freed) 4784 { 4785 struct buffer_head *bitmap_bh = NULL; 4786 struct super_block *sb = inode->i_sb; 4787 struct ext4_allocation_context *ac = NULL; 4788 struct ext4_group_desc *gdp; 4789 struct ext4_super_block *es; 4790 unsigned int overflow; 4791 ext4_grpblk_t bit; 4792 struct buffer_head *gd_bh; 4793 ext4_group_t block_group; 4794 struct ext4_sb_info *sbi; 4795 struct ext4_buddy e4b; 4796 int err = 0; 4797 int ret; 4798 4799 *freed = 0; 4800 4801 sbi = EXT4_SB(sb); 4802 es = EXT4_SB(sb)->s_es; 4803 if (block < le32_to_cpu(es->s_first_data_block) || 4804 block + count < block || 4805 block + count > ext4_blocks_count(es)) { 4806 ext4_error(sb, __func__, 4807 "Freeing blocks not in datazone - " 4808 "block = %lu, count = %lu", block, count); 4809 goto error_return; 4810 } 4811 4812 ext4_debug("freeing block %lu\n", block); 4813 trace_mark(ext4_free_blocks, 4814 "dev %s block %llu count %lu metadata %d ino %lu", 4815 sb->s_id, (unsigned long long) block, count, metadata, 4816 inode ? inode->i_ino : 0); 4817 4818 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4819 if (ac) { 4820 ac->ac_op = EXT4_MB_HISTORY_FREE; 4821 ac->ac_inode = inode; 4822 ac->ac_sb = sb; 4823 } 4824 4825 do_more: 4826 overflow = 0; 4827 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4828 4829 /* 4830 * Check to see if we are freeing blocks across a group 4831 * boundary. 4832 */ 4833 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4834 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4835 count -= overflow; 4836 } 4837 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4838 if (!bitmap_bh) { 4839 err = -EIO; 4840 goto error_return; 4841 } 4842 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4843 if (!gdp) { 4844 err = -EIO; 4845 goto error_return; 4846 } 4847 4848 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4849 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4850 in_range(block, ext4_inode_table(sb, gdp), 4851 EXT4_SB(sb)->s_itb_per_group) || 4852 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4853 EXT4_SB(sb)->s_itb_per_group)) { 4854 4855 ext4_error(sb, __func__, 4856 "Freeing blocks in system zone - " 4857 "Block = %lu, count = %lu", block, count); 4858 /* err = 0. ext4_std_error should be a no op */ 4859 goto error_return; 4860 } 4861 4862 BUFFER_TRACE(bitmap_bh, "getting write access"); 4863 err = ext4_journal_get_write_access(handle, bitmap_bh); 4864 if (err) 4865 goto error_return; 4866 4867 /* 4868 * We are about to modify some metadata. Call the journal APIs 4869 * to unshare ->b_data if a currently-committing transaction is 4870 * using it 4871 */ 4872 BUFFER_TRACE(gd_bh, "get_write_access"); 4873 err = ext4_journal_get_write_access(handle, gd_bh); 4874 if (err) 4875 goto error_return; 4876 #ifdef AGGRESSIVE_CHECK 4877 { 4878 int i; 4879 for (i = 0; i < count; i++) 4880 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4881 } 4882 #endif 4883 if (ac) { 4884 ac->ac_b_ex.fe_group = block_group; 4885 ac->ac_b_ex.fe_start = bit; 4886 ac->ac_b_ex.fe_len = count; 4887 ext4_mb_store_history(ac); 4888 } 4889 4890 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4891 if (err) 4892 goto error_return; 4893 if (metadata && ext4_handle_valid(handle)) { 4894 struct ext4_free_data *new_entry; 4895 /* 4896 * blocks being freed are metadata. 
these blocks shouldn't 4897 * be used until this transaction is committed 4898 */ 4899 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4900 new_entry->start_blk = bit; 4901 new_entry->group = block_group; 4902 new_entry->count = count; 4903 new_entry->t_tid = handle->h_transaction->t_tid; 4904 ext4_lock_group(sb, block_group); 4905 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4906 bit, count); 4907 ext4_mb_free_metadata(handle, &e4b, new_entry); 4908 ext4_unlock_group(sb, block_group); 4909 } else { 4910 ext4_lock_group(sb, block_group); 4911 /* need to update group_info->bb_free and bitmap 4912 * with group lock held. generate_buddy look at 4913 * them with group lock_held 4914 */ 4915 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4916 bit, count); 4917 mb_free_blocks(inode, &e4b, bit, count); 4918 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4919 ext4_unlock_group(sb, block_group); 4920 } 4921 4922 spin_lock(sb_bgl_lock(sbi, block_group)); 4923 ret = ext4_free_blks_count(sb, gdp) + count; 4924 ext4_free_blks_set(sb, gdp, ret); 4925 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4926 spin_unlock(sb_bgl_lock(sbi, block_group)); 4927 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4928 4929 if (sbi->s_log_groups_per_flex) { 4930 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4931 spin_lock(sb_bgl_lock(sbi, flex_group)); 4932 sbi->s_flex_groups[flex_group].free_blocks += count; 4933 spin_unlock(sb_bgl_lock(sbi, flex_group)); 4934 } 4935 4936 ext4_mb_release_desc(&e4b); 4937 4938 *freed += count; 4939 4940 /* We dirtied the bitmap block */ 4941 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4942 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4943 4944 /* And the group descriptor block */ 4945 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4946 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4947 if (!err) 4948 err = ret; 4949 4950 if (overflow && !err) { 4951 block += count; 4952 count = overflow; 4953 put_bh(bitmap_bh); 4954 goto do_more; 4955 } 4956 sb->s_dirt = 1; 4957 error_return: 4958 brelse(bitmap_bh); 4959 ext4_std_error(sb, err); 4960 if (ac) 4961 kmem_cache_free(ext4_ac_cachep, ac); 4962 return; 4963 } 4964
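/*
 * Editor's note: can_merge()/ext4_mb_free_metadata() above coalesce freed
 * extents only when they were freed in the same transaction, belong to the
 * same group and are physically adjacent.  The block below is a minimal,
 * standalone userspace sketch of that merge rule; struct free_extent and
 * extents_mergeable() are hypothetical stand-ins for ext4_free_data and
 * can_merge(), and the block is kept under #if 0 so it is never compiled
 * as part of this file.
 */
#if 0
#include <stdio.h>

/* Simplified model of a deferred-free extent (cf. struct ext4_free_data). */
struct free_extent {
	unsigned int group;
	unsigned int start_blk;
	unsigned int count;
	unsigned int t_tid;	/* transaction that freed the blocks */
};

/* Same rule as can_merge(): same tid, same group, physically adjacent. */
static int extents_mergeable(const struct free_extent *left,
			     const struct free_extent *right)
{
	return left->t_tid == right->t_tid &&
	       left->group == right->group &&
	       left->start_blk + left->count == right->start_blk;
}

int main(void)
{
	struct free_extent a = { .group = 5, .start_blk = 100, .count = 8,  .t_tid = 42 };
	struct free_extent b = { .group = 5, .start_blk = 108, .count = 16, .t_tid = 42 };

	if (extents_mergeable(&a, &b)) {
		a.count += b.count;	/* coalesce into one 24-block extent */
		printf("merged: group %u, blocks %u..%u\n",
		       a.group, a.start_blk, a.start_blk + a.count - 1);
	}
	return 0;
}
#endif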