/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "mballoc.h"
/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in a few groups
 *
 * TODO v4:
 *   - normalization should take into account whether the file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * An allocation request asks for multiple blocks near the specified
 * goal block.
 *
 * During the initialization phase of the allocator we decide whether to use
 * group preallocation or inode preallocation depending on the size of the
 * file. The size of the file could be the resulting file size we would have
 * after allocation, or the current file size, whichever is larger. If the
 * size is less than sbi->s_mb_stream_request we select group
 * preallocation. The default value of s_mb_stream_request is 16
 * blocks. This can also be tuned via
 * /proc/fs/ext4/<partition>/stream_req. The value is expressed in
 * number of blocks.
 *
 * The main motivation for using group preallocation for small files is to
 * keep small files close together on the disk.
 *
 * In the first stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
 * spaces for this particular inode. An inode prealloc space is
 * described by:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length of this prealloc space
 * pa_free   -> free space available in this prealloc space
 *
 * The inode preallocation space is selected by looking at the _logical_
 * start block. Only if the logical file block falls within the range of a
 * prealloc space do we consume that prealloc space. This makes sure that
 * we get contiguous physical blocks representing the file blocks.
 *
 * The important thing to note about inode prealloc space is that we never
 * modify the values associated with it except for pa_free.
 */
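/*
 * A minimal illustrative sketch (not part of mballoc, kept out of the
 * build): roughly how an inode PA could be matched against a logical
 * block using the fields described above.  The toy_* names and types
 * are hypothetical stand-ins, not the real struct ext4_prealloc_space.
 */
#if 0
struct toy_inode_pa {
	unsigned long long pa_lstart;	/* logical start block */
	unsigned long long pa_pstart;	/* physical start block */
	unsigned int	   pa_len;	/* length of this prealloc space */
	unsigned int	   pa_free;	/* blocks still unused in it */
};

/* return the physical block for @lblock if the PA covers it, 0 otherwise */
static unsigned long long toy_use_inode_pa(struct toy_inode_pa *pa,
					   unsigned long long lblock)
{
	if (lblock < pa->pa_lstart || lblock >= pa->pa_lstart + pa->pa_len)
		return 0;
	if (pa->pa_free == 0)
		return 0;
	pa->pa_free--;			/* only pa_free is ever modified */
	return pa->pa_pstart + (lblock - pa->pa_lstart);
}
#endif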
/*
 * If we are not able to find blocks in the inode prealloc space, and we
 * have the group allocation flag set, then we look at the locality group
 * prealloc space. These are per-CPU prealloc lists represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having per-CPU locality groups is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used by checking whether there is
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via the inode and/or locality group prealloc,
 * then we look at the buddy cache. The buddy cache is represented by
 * ext4_sb_info.s_buddy_cache (struct inode) whose file offsets get mapped
 * to the buddy and bitmap information of the different groups. The buddy
 * information is attached to the buddy cache inode so that we can access
 * it through the page cache. The information regarding each group is
 * loaded via ext4_mb_load_buddy and consists of the block bitmap and the
 * buddy information, stored in the inode as:
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 * one block each for the bitmap and the buddy information. So for each
 * group we take up 2 blocks. A page can contain blocks_per_page
 * (PAGE_CACHE_SIZE / blocksize) blocks, so it can hold information for
 * groups_per_page groups, which is blocks_per_page/2.
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for the requested number of blocks in the buddy cache. If we
 * were able to locate that many free blocks we return with additional
 * information regarding the rest of the contiguous physical blocks
 * available.
 *
 * Before allocating blocks via the buddy cache we normalize the request.
 * This ensures we ask for more blocks than we actually need. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In the case of inode preallocation we follow a set of heuristics
 * based on file size; these can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
 * 512 blocks. This can be tuned via
 * /proc/fs/ext4/<partition>/group_prealloc. The value is expressed in
 * number of blocks. If we have mounted the file system with the
 * -o stripe=<value> option, the group prealloc request is normalized to
 * the stripe value (sbi->s_stripe).
 *
 * The regular allocator (using the buddy cache) supports a few tunables:
 *
 * /proc/fs/ext4/<partition>/min_to_scan
 * /proc/fs/ext4/<partition>/max_to_scan
 * /proc/fs/ext4/<partition>/order2_req
 *
 * The regular allocator uses the buddy scan only if the request length is
 * a power of 2 blocks and the order of the allocation is >=
 * sbi->s_mb_order2_reqs. The value of s_mb_order2_reqs can be tuned via
 * /proc/fs/ext4/<partition>/order2_req. If the request length is equal to
 * the stripe size (sbi->s_stripe), we search for contiguous blocks in
 * stripe-size units. This should result in better allocation on RAID
 * setups. If not, we search in the specific group using the bitmap for
 * the best extents. The tunables min_to_scan and max_to_scan control the
 * behaviour here. min_to_scan indicates how long mballoc __must__ look
 * for the best extent and max_to_scan indicates how long mballoc __can__
 * look for the best extent among the found extents.
 */
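/*
 * A minimal illustrative sketch (not part of mballoc, kept out of the
 * build): the order-2 eligibility test described above -- use the buddy
 * scan only when the request length is an exact power of two and is large
 * enough per order2_req.  It mirrors the fls()-based check done in
 * ext4_mb_regular_allocator() below; toy_* is a hypothetical name.
 */
#if 0
static int toy_use_buddy_scan(unsigned int len, unsigned int order2_req)
{
	unsigned int i = fls(len);	/* position of the highest set bit */

	if (i < order2_req)
		return 0;
	/* a power of two has only its top bit set */
	return (len & (len - 1)) == 0;
}
#endif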
/*
 * Searching for blocks starts with the group specified as the goal value
 * in the allocation context via ac_g_ex. Each group is first checked for
 * whether it can be used for allocation at all; ext4_mb_good_group
 * explains how the groups are checked.
 *
 * Both prealloc spaces are populated as described above. So for the first
 * request we will hit the buddy cache, which causes the prealloc space to
 * get filled. The prealloc space is then used for subsequent requests.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to a specific inode and can be used for this inode only.
 *    it describes part of the inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks the number of blocks left
 *    unused. so, before taking some block from the descriptor, one must
 *    make sure the corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within the descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to a specific locality group which does not translate to a
 *    permanent set of inodes: an inode can join and leave a group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * the relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in the mballoc world means that at any time a block is
 * either free or used in ALL structures. notice: "any time" should not be
 * read literally -- time is discrete and delimited by locks.
 *
 * to keep it simple, we don't use block numbers, instead we count the
 * number of blocks: how many blocks are marked used/free in the on-disk
 * bitmap, the buddy and the PA.
 *
 * all operations can be expressed as:
 *  - init buddy:			buddy = on-disk + PAs
 *  - new PA:				buddy += N; PA = N
 *  - use inode PA:			on-disk += N; PA -= N
 *  - discard inode PA:			buddy -= on-disk - PA; PA = 0
 *  - use locality group PA:		on-disk += N; PA -= N
 *  - discard locality group PA:	buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap
 *        is used in the real operation, because we can't know the actual
 *        used bits from the PA, only from the on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be
 * atomic. given some of them can block, we'd have to use something like
 * semaphores, killing performance on high-end SMP hardware. let's try to
 * relax it using the following knowledge:
 *  1) if the buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 *     bitmap has a bit set and a PA claims the same block, that's OK. IOW,
 *     one can set a bit in the on-disk bitmap if the buddy has the same
 *     bit set and/or a PA covers the corresponding block
 */
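/*
 * A minimal illustrative sketch (not part of mballoc, kept out of the
 * build): the accounting rules above written as plain counters, to make
 * the invariant visible -- blocks used in the in-core buddy are always
 * the blocks used on disk plus the blocks still held by PAs.  The toy_*
 * names are hypothetical.
 */
#if 0
struct toy_counts {
	unsigned int ondisk_used;	/* bits set in the on-disk bitmap */
	unsigned int pa_reserved;	/* blocks still held by PAs */
	unsigned int buddy_used;	/* bits set in the in-core buddy */
};

static void toy_new_pa(struct toy_counts *c, unsigned int n)
{
	c->buddy_used  += n;		/* new PA: buddy += N; PA = N */
	c->pa_reserved += n;
}

static void toy_use_pa(struct toy_counts *c, unsigned int n)
{
	c->ondisk_used += n;		/* use PA: on-disk += N; PA -= N */
	c->pa_reserved -= n;		/* buddy already counts these blocks */
}

static void toy_discard_group_pa(struct toy_counts *c)
{
	/* discard locality group PA: buddy -= PA; PA = 0 */
	c->buddy_used -= c->pa_reserved;
	c->pa_reserved = 0;
}
#endif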
/*
 * Now we're building a concurrency table:
 *  - init buddy vs.
 *  - new PA
 *    blocks for the PA are allocated in the buddy, the buddy must be
 *    referenced until the PA is linked to the allocation group to avoid
 *    concurrent buddy init
 *  - use inode PA
 *    we need to make sure that either the on-disk bitmap or the PA has
 *    up-to-date data. given (3) we care that the PA-=N operation doesn't
 *    interfere with init
 *  - discard inode PA
 *    the simplest way would be to have the buddy initialized by the
 *    discard
 *  - use locality group PA
 *    again, PA-=N must be serialized with init
 *  - discard locality group PA
 *    the simplest way would be to have the buddy initialized by the
 *    discard
 *  - new PA vs.
 *  - use inode PA
 *    i_data_sem serializes them
 *  - discard inode PA
 *    the discard process must wait until the PA isn't used by another
 *    process
 *  - use locality group PA
 *    some mutex should serialize them
 *  - discard locality group PA
 *    the discard process must wait until the PA isn't used by another
 *    process
 *  - use inode PA vs.
 *  - use inode PA
 *    i_data_sem or another mutex should serialize them
 *  - discard inode PA
 *    the discard process must wait until the PA isn't used by another
 *    process
 *  - use locality group PA
 *    nothing wrong here -- they're different PAs covering different blocks
 *  - discard locality group PA
 *    the discard process must wait until the PA isn't used by another
 *    process
 *
 * now we're ready to draw a few conclusions:
 *  - a PA is referenced, and while it is, no discard is possible
 *  - a PA is referenced until its blocks are marked in the on-disk bitmap
 *  - a PA changes only after the on-disk bitmap does
 *  - discard must not compete with init. either init is done before any
 *    discard, or they're serialized somehow
 *  - buddy init as the sum of on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness: no need to modify
 * the buddy in this case, but we should take care of concurrent init.
 */
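/*
 * A minimal illustrative sketch (not part of mballoc, kept out of the
 * build): the "discard must wait until the PA isn't used" rule from the
 * table above, expressed as a plain reference count plus a deleted flag.
 * The busy-wait and the toy_* names are simplifications for illustration
 * only, not the actual mechanism used by the PA code.
 */
#if 0
struct toy_pa {
	atomic_t pa_count;	/* processes currently using this PA */
	int	 pa_deleted;	/* set once a discard has started */
};

static int toy_pa_get(struct toy_pa *pa)
{
	if (pa->pa_deleted)
		return 0;		/* discard in progress, don't use it */
	atomic_inc(&pa->pa_count);
	return 1;
}

static void toy_pa_put(struct toy_pa *pa)
{
	atomic_dec(&pa->pa_count);
}

static void toy_pa_discard(struct toy_pa *pa)
{
	pa->pa_deleted = 1;
	while (atomic_read(&pa->pa_count) != 0)
		cpu_relax();		/* wait until nobody uses the PA */
}
#endif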
/*
 * Logic in few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group		(group)
 *  - object (inode/locality)		(object)
 *  - per-pa lock			(pa)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *    pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *    pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *    pa
 *    group
 *    object
 *
 */

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
	*bit += ((unsigned long) addr & 7UL) << 3;
	addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
	*bit += ((unsigned long) addr & 3UL) << 3;
	addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
	return addr;
}

static inline int mb_test_bit(int bit, void *addr)
{
	/*
	 * ext4_test_bit on architectures like powerpc
	 * needs an unsigned long aligned address
	 */
	addr = mb_correct_addr_and_bit(&bit, addr);
	return ext4_test_bit(bit, addr);
}

static inline void mb_set_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_set_bit(bit, addr);
}

static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_set_bit_atomic(lock, bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_clear_bit(bit, addr);
}

static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
{
	addr = mb_correct_addr_and_bit(&bit, addr);
	ext4_clear_bit_atomic(lock, bit, addr);
}

static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static inline int mb_find_next_bit(void *addr, int max, int start)
{
	int fix = 0, ret, tmpmax;
	addr = mb_correct_addr_and_bit(&fix, addr);
	tmpmax = max + fix;
	start += fix;

	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
	if (ret > max)
		return max;
	return ret;
}

static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
	char *bb;

	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
	BUG_ON(max == NULL);
414 415 if (order > e4b->bd_blkbits + 1) { 416 *max = 0; 417 return NULL; 418 } 419 420 /* at order 0 we see each particular block */ 421 *max = 1 << (e4b->bd_blkbits + 3); 422 if (order == 0) 423 return EXT4_MB_BITMAP(e4b); 424 425 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 426 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 427 428 return bb; 429 } 430 431 #ifdef DOUBLE_CHECK 432 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 433 int first, int count) 434 { 435 int i; 436 struct super_block *sb = e4b->bd_sb; 437 438 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 439 return; 440 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 441 for (i = 0; i < count; i++) { 442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 443 ext4_fsblk_t blocknr; 444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 445 blocknr += first + i; 446 blocknr += 447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 448 449 ext4_error(sb, __func__, "double-free of inode" 450 " %lu's block %llu(bit %u in group %lu)\n", 451 inode ? inode->i_ino : 0, blocknr, 452 first + i, e4b->bd_group); 453 } 454 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 455 } 456 } 457 458 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 459 { 460 int i; 461 462 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 463 return; 464 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 465 for (i = 0; i < count; i++) { 466 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 467 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 468 } 469 } 470 471 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 472 { 473 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 474 unsigned char *b1, *b2; 475 int i; 476 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 477 b2 = (unsigned char *) bitmap; 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 479 if (b1[i] != b2[i]) { 480 printk("corruption in group %lu at byte %u(%u):" 481 " %x in copy != %x on disk/prealloc\n", 482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 483 BUG(); 484 } 485 } 486 } 487 } 488 489 #else 490 static inline void mb_free_blocks_double(struct inode *inode, 491 struct ext4_buddy *e4b, int first, int count) 492 { 493 return; 494 } 495 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 496 int first, int count) 497 { 498 return; 499 } 500 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 501 { 502 return; 503 } 504 #endif 505 506 #ifdef AGGRESSIVE_CHECK 507 508 #define MB_CHECK_ASSERT(assert) \ 509 do { \ 510 if (!(assert)) { \ 511 printk(KERN_EMERG \ 512 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 513 function, file, line, # assert); \ 514 BUG(); \ 515 } \ 516 } while (0) 517 518 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 519 const char *function, int line) 520 { 521 struct super_block *sb = e4b->bd_sb; 522 int order = e4b->bd_blkbits + 1; 523 int max; 524 int max2; 525 int i; 526 int j; 527 int k; 528 int count; 529 struct ext4_group_info *grp; 530 int fragments = 0; 531 int fstart; 532 struct list_head *cur; 533 void *buddy; 534 void *buddy2; 535 536 if (!test_opt(sb, MBALLOC)) 537 return 0; 538 539 { 540 static int mb_check_counter; 541 if (mb_check_counter++ % 100 != 0) 542 return 0; 543 } 544 545 while (order > 1) { 546 buddy = mb_find_buddy(e4b, order, &max); 547 MB_CHECK_ASSERT(buddy); 548 buddy2 = mb_find_buddy(e4b, order - 1, &max2); 549 MB_CHECK_ASSERT(buddy2); 550 MB_CHECK_ASSERT(buddy != buddy2); 
551 MB_CHECK_ASSERT(max * 2 == max2); 552 553 count = 0; 554 for (i = 0; i < max; i++) { 555 556 if (mb_test_bit(i, buddy)) { 557 /* only single bit in buddy2 may be 1 */ 558 if (!mb_test_bit(i << 1, buddy2)) { 559 MB_CHECK_ASSERT( 560 mb_test_bit((i<<1)+1, buddy2)); 561 } else if (!mb_test_bit((i << 1) + 1, buddy2)) { 562 MB_CHECK_ASSERT( 563 mb_test_bit(i << 1, buddy2)); 564 } 565 continue; 566 } 567 568 /* both bits in buddy2 must be 0 */ 569 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 570 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 571 572 for (j = 0; j < (1 << order); j++) { 573 k = (i * (1 << order)) + j; 574 MB_CHECK_ASSERT( 575 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 576 } 577 count++; 578 } 579 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 580 order--; 581 } 582 583 fstart = -1; 584 buddy = mb_find_buddy(e4b, 0, &max); 585 for (i = 0; i < max; i++) { 586 if (!mb_test_bit(i, buddy)) { 587 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 588 if (fstart == -1) { 589 fragments++; 590 fstart = i; 591 } 592 continue; 593 } 594 fstart = -1; 595 /* check used bits only */ 596 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 597 buddy2 = mb_find_buddy(e4b, j, &max2); 598 k = i >> j; 599 MB_CHECK_ASSERT(k < max2); 600 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 601 } 602 } 603 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 604 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 605 606 grp = ext4_get_group_info(sb, e4b->bd_group); 607 buddy = mb_find_buddy(e4b, 0, &max); 608 list_for_each(cur, &grp->bb_prealloc_list) { 609 ext4_group_t groupnr; 610 struct ext4_prealloc_space *pa; 611 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 612 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 613 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 614 for (i = 0; i < pa->pa_len; i++) 615 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 616 } 617 return 0; 618 } 619 #undef MB_CHECK_ASSERT 620 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 621 __FILE__, __func__, __LINE__) 622 #else 623 #define mb_check_buddy(e4b) 624 #endif 625 626 /* FIXME!! 
need more doc */ 627 static void ext4_mb_mark_free_simple(struct super_block *sb, 628 void *buddy, unsigned first, int len, 629 struct ext4_group_info *grp) 630 { 631 struct ext4_sb_info *sbi = EXT4_SB(sb); 632 unsigned short min; 633 unsigned short max; 634 unsigned short chunk; 635 unsigned short border; 636 637 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 638 639 border = 2 << sb->s_blocksize_bits; 640 641 while (len > 0) { 642 /* find how many blocks can be covered since this position */ 643 max = ffs(first | border) - 1; 644 645 /* find how many blocks of power 2 we need to mark */ 646 min = fls(len) - 1; 647 648 if (max < min) 649 min = max; 650 chunk = 1 << min; 651 652 /* mark multiblock chunks only */ 653 grp->bb_counters[min]++; 654 if (min > 0) 655 mb_clear_bit(first >> min, 656 buddy + sbi->s_mb_offsets[min]); 657 658 len -= chunk; 659 first += chunk; 660 } 661 } 662 663 static void ext4_mb_generate_buddy(struct super_block *sb, 664 void *buddy, void *bitmap, ext4_group_t group) 665 { 666 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 667 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 668 unsigned short i = 0; 669 unsigned short first; 670 unsigned short len; 671 unsigned free = 0; 672 unsigned fragments = 0; 673 unsigned long long period = get_cycles(); 674 675 /* initialize buddy from bitmap which is aggregation 676 * of on-disk bitmap and preallocations */ 677 i = mb_find_next_zero_bit(bitmap, max, 0); 678 grp->bb_first_free = i; 679 while (i < max) { 680 fragments++; 681 first = i; 682 i = mb_find_next_bit(bitmap, max, i); 683 len = i - first; 684 free += len; 685 if (len > 1) 686 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 687 else 688 grp->bb_counters[0]++; 689 if (i < max) 690 i = mb_find_next_zero_bit(bitmap, max, i); 691 } 692 grp->bb_fragments = fragments; 693 694 if (free != grp->bb_free) { 695 ext4_error(sb, __func__, 696 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 697 group, free, grp->bb_free); 698 /* 699 * If we intent to continue, we consider group descritor 700 * corrupt and update bb_free using bitmap value 701 */ 702 grp->bb_free = free; 703 } 704 705 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 706 707 period = get_cycles() - period; 708 spin_lock(&EXT4_SB(sb)->s_bal_lock); 709 EXT4_SB(sb)->s_mb_buddies_generated++; 710 EXT4_SB(sb)->s_mb_generation_time += period; 711 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 712 } 713 714 /* The buddy information is attached the buddy cache inode 715 * for convenience. The information regarding each group 716 * is loaded via ext4_mb_load_buddy. The information involve 717 * block bitmap and buddy information. The information are 718 * stored in the inode as 719 * 720 * { page } 721 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 722 * 723 * 724 * one block each for bitmap and buddy information. 725 * So for each group we take up 2 blocks. A page can 726 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 
727 * So it can have information regarding groups_per_page which 728 * is blocks_per_page/2 729 */ 730 731 static int ext4_mb_init_cache(struct page *page, char *incore) 732 { 733 int blocksize; 734 int blocks_per_page; 735 int groups_per_page; 736 int err = 0; 737 int i; 738 ext4_group_t first_group; 739 int first_block; 740 struct super_block *sb; 741 struct buffer_head *bhs; 742 struct buffer_head **bh; 743 struct inode *inode; 744 char *data; 745 char *bitmap; 746 747 mb_debug("init page %lu\n", page->index); 748 749 inode = page->mapping->host; 750 sb = inode->i_sb; 751 blocksize = 1 << inode->i_blkbits; 752 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 753 754 groups_per_page = blocks_per_page >> 1; 755 if (groups_per_page == 0) 756 groups_per_page = 1; 757 758 /* allocate buffer_heads to read bitmaps */ 759 if (groups_per_page > 1) { 760 err = -ENOMEM; 761 i = sizeof(struct buffer_head *) * groups_per_page; 762 bh = kzalloc(i, GFP_NOFS); 763 if (bh == NULL) 764 goto out; 765 } else 766 bh = &bhs; 767 768 first_group = page->index * blocks_per_page / 2; 769 770 /* read all groups the page covers into the cache */ 771 for (i = 0; i < groups_per_page; i++) { 772 struct ext4_group_desc *desc; 773 774 if (first_group + i >= EXT4_SB(sb)->s_groups_count) 775 break; 776 777 err = -EIO; 778 desc = ext4_get_group_desc(sb, first_group + i, NULL); 779 if (desc == NULL) 780 goto out; 781 782 err = -ENOMEM; 783 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); 784 if (bh[i] == NULL) 785 goto out; 786 787 if (bh_uptodate_or_lock(bh[i])) 788 continue; 789 790 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 ext4_init_block_bitmap(sb, bh[i], 792 first_group + i, desc); 793 set_buffer_uptodate(bh[i]); 794 unlock_buffer(bh[i]); 795 continue; 796 } 797 get_bh(bh[i]); 798 bh[i]->b_end_io = end_buffer_read_sync; 799 submit_bh(READ, bh[i]); 800 mb_debug("read bitmap for group %lu\n", first_group + i); 801 } 802 803 /* wait for I/O completion */ 804 for (i = 0; i < groups_per_page && bh[i]; i++) 805 wait_on_buffer(bh[i]); 806 807 err = -EIO; 808 for (i = 0; i < groups_per_page && bh[i]; i++) 809 if (!buffer_uptodate(bh[i])) 810 goto out; 811 812 err = 0; 813 first_block = page->index * blocks_per_page; 814 for (i = 0; i < blocks_per_page; i++) { 815 int group; 816 struct ext4_group_info *grinfo; 817 818 group = (first_block + i) >> 1; 819 if (group >= EXT4_SB(sb)->s_groups_count) 820 break; 821 822 /* 823 * data carry information regarding this 824 * particular group in the format specified 825 * above 826 * 827 */ 828 data = page_address(page) + (i * blocksize); 829 bitmap = bh[group - first_group]->b_data; 830 831 /* 832 * We place the buddy block and bitmap block 833 * close together 834 */ 835 if ((first_block + i) & 1) { 836 /* this is block of buddy */ 837 BUG_ON(incore == NULL); 838 mb_debug("put buddy for group %u in page %lu/%x\n", 839 group, page->index, i * blocksize); 840 memset(data, 0xff, blocksize); 841 grinfo = ext4_get_group_info(sb, group); 842 grinfo->bb_fragments = 0; 843 memset(grinfo->bb_counters, 0, 844 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 845 /* 846 * incore got set to the group block bitmap below 847 */ 848 ext4_mb_generate_buddy(sb, data, incore, group); 849 incore = NULL; 850 } else { 851 /* this is block of bitmap */ 852 BUG_ON(incore != NULL); 853 mb_debug("put bitmap for group %u in page %lu/%x\n", 854 group, page->index, i * blocksize); 855 856 /* see comments in ext4_mb_put_pa() */ 857 ext4_lock_group(sb, group); 858 memcpy(data, bitmap, 
blocksize); 859 860 /* mark all preallocated blks used in in-core bitmap */ 861 ext4_mb_generate_from_pa(sb, data, group); 862 ext4_unlock_group(sb, group); 863 864 /* set incore so that the buddy information can be 865 * generated using this 866 */ 867 incore = data; 868 } 869 } 870 SetPageUptodate(page); 871 872 out: 873 if (bh) { 874 for (i = 0; i < groups_per_page && bh[i]; i++) 875 brelse(bh[i]); 876 if (bh != &bhs) 877 kfree(bh); 878 } 879 return err; 880 } 881 882 static noinline_for_stack int 883 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 884 struct ext4_buddy *e4b) 885 { 886 struct ext4_sb_info *sbi = EXT4_SB(sb); 887 struct inode *inode = sbi->s_buddy_cache; 888 int blocks_per_page; 889 int block; 890 int pnum; 891 int poff; 892 struct page *page; 893 int ret; 894 895 mb_debug("load group %lu\n", group); 896 897 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 898 899 e4b->bd_blkbits = sb->s_blocksize_bits; 900 e4b->bd_info = ext4_get_group_info(sb, group); 901 e4b->bd_sb = sb; 902 e4b->bd_group = group; 903 e4b->bd_buddy_page = NULL; 904 e4b->bd_bitmap_page = NULL; 905 906 /* 907 * the buddy cache inode stores the block bitmap 908 * and buddy information in consecutive blocks. 909 * So for each group we need two blocks. 910 */ 911 block = group * 2; 912 pnum = block / blocks_per_page; 913 poff = block % blocks_per_page; 914 915 /* we could use find_or_create_page(), but it locks page 916 * what we'd like to avoid in fast path ... */ 917 page = find_get_page(inode->i_mapping, pnum); 918 if (page == NULL || !PageUptodate(page)) { 919 if (page) 920 page_cache_release(page); 921 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 922 if (page) { 923 BUG_ON(page->mapping != inode->i_mapping); 924 if (!PageUptodate(page)) { 925 ret = ext4_mb_init_cache(page, NULL); 926 if (ret) { 927 unlock_page(page); 928 goto err; 929 } 930 mb_cmp_bitmaps(e4b, page_address(page) + 931 (poff * sb->s_blocksize)); 932 } 933 unlock_page(page); 934 } 935 } 936 if (page == NULL || !PageUptodate(page)) { 937 ret = -EIO; 938 goto err; 939 } 940 e4b->bd_bitmap_page = page; 941 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 942 mark_page_accessed(page); 943 944 block++; 945 pnum = block / blocks_per_page; 946 poff = block % blocks_per_page; 947 948 page = find_get_page(inode->i_mapping, pnum); 949 if (page == NULL || !PageUptodate(page)) { 950 if (page) 951 page_cache_release(page); 952 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 953 if (page) { 954 BUG_ON(page->mapping != inode->i_mapping); 955 if (!PageUptodate(page)) { 956 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 957 if (ret) { 958 unlock_page(page); 959 goto err; 960 } 961 } 962 unlock_page(page); 963 } 964 } 965 if (page == NULL || !PageUptodate(page)) { 966 ret = -EIO; 967 goto err; 968 } 969 e4b->bd_buddy_page = page; 970 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 971 mark_page_accessed(page); 972 973 BUG_ON(e4b->bd_bitmap_page == NULL); 974 BUG_ON(e4b->bd_buddy_page == NULL); 975 976 return 0; 977 978 err: 979 if (e4b->bd_bitmap_page) 980 page_cache_release(e4b->bd_bitmap_page); 981 if (e4b->bd_buddy_page) 982 page_cache_release(e4b->bd_buddy_page); 983 e4b->bd_buddy = NULL; 984 e4b->bd_bitmap = NULL; 985 return ret; 986 } 987 988 static void ext4_mb_release_desc(struct ext4_buddy *e4b) 989 { 990 if (e4b->bd_bitmap_page) 991 page_cache_release(e4b->bd_bitmap_page); 992 if (e4b->bd_buddy_page) 993 page_cache_release(e4b->bd_buddy_page); 994 } 995 996 997 
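/*
 * A minimal illustrative sketch (not part of mballoc, kept out of the
 * build): how a group number maps to a page and an offset inside the
 * buddy cache, following the two-blocks-per-group layout used by
 * ext4_mb_load_buddy() above (bitmap block first, buddy block next).
 * The toy_* name is hypothetical.
 */
#if 0
static void toy_buddy_cache_pos(unsigned long group, int blocks_per_page,
				int *bitmap_pnum, int *bitmap_poff,
				int *buddy_pnum, int *buddy_poff)
{
	int block = group * 2;			/* bitmap block of the group */

	*bitmap_pnum = block / blocks_per_page;
	*bitmap_poff = block % blocks_per_page;

	block++;				/* the buddy block follows */
	*buddy_pnum = block / blocks_per_page;
	*buddy_poff = block % blocks_per_page;
}
#endif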
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 998 { 999 int order = 1; 1000 void *bb; 1001 1002 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1003 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1004 1005 bb = EXT4_MB_BUDDY(e4b); 1006 while (order <= e4b->bd_blkbits + 1) { 1007 block = block >> 1; 1008 if (!mb_test_bit(block, bb)) { 1009 /* this block is part of buddy of order 'order' */ 1010 return order; 1011 } 1012 bb += 1 << (e4b->bd_blkbits - order); 1013 order++; 1014 } 1015 return 0; 1016 } 1017 1018 static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) 1019 { 1020 __u32 *addr; 1021 1022 len = cur + len; 1023 while (cur < len) { 1024 if ((cur & 31) == 0 && (len - cur) >= 32) { 1025 /* fast path: clear whole word at once */ 1026 addr = bm + (cur >> 3); 1027 *addr = 0; 1028 cur += 32; 1029 continue; 1030 } 1031 mb_clear_bit_atomic(lock, cur, bm); 1032 cur++; 1033 } 1034 } 1035 1036 static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) 1037 { 1038 __u32 *addr; 1039 1040 len = cur + len; 1041 while (cur < len) { 1042 if ((cur & 31) == 0 && (len - cur) >= 32) { 1043 /* fast path: set whole word at once */ 1044 addr = bm + (cur >> 3); 1045 *addr = 0xffffffff; 1046 cur += 32; 1047 continue; 1048 } 1049 mb_set_bit_atomic(lock, cur, bm); 1050 cur++; 1051 } 1052 } 1053 1054 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1055 int first, int count) 1056 { 1057 int block = 0; 1058 int max = 0; 1059 int order; 1060 void *buddy; 1061 void *buddy2; 1062 struct super_block *sb = e4b->bd_sb; 1063 1064 BUG_ON(first + count > (sb->s_blocksize << 3)); 1065 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); 1066 mb_check_buddy(e4b); 1067 mb_free_blocks_double(inode, e4b, first, count); 1068 1069 e4b->bd_info->bb_free += count; 1070 if (first < e4b->bd_info->bb_first_free) 1071 e4b->bd_info->bb_first_free = first; 1072 1073 /* let's maintain fragments counter */ 1074 if (first != 0) 1075 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1076 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1077 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1078 if (block && max) 1079 e4b->bd_info->bb_fragments--; 1080 else if (!block && !max) 1081 e4b->bd_info->bb_fragments++; 1082 1083 /* let's maintain buddy itself */ 1084 while (count-- > 0) { 1085 block = first++; 1086 order = 0; 1087 1088 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1089 ext4_fsblk_t blocknr; 1090 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1091 blocknr += block; 1092 blocknr += 1093 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1094 ext4_unlock_group(sb, e4b->bd_group); 1095 ext4_error(sb, __func__, "double-free of inode" 1096 " %lu's block %llu(bit %u in group %lu)\n", 1097 inode ? 
inode->i_ino : 0, blocknr, block, 1098 e4b->bd_group); 1099 ext4_lock_group(sb, e4b->bd_group); 1100 } 1101 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1102 e4b->bd_info->bb_counters[order]++; 1103 1104 /* start of the buddy */ 1105 buddy = mb_find_buddy(e4b, order, &max); 1106 1107 do { 1108 block &= ~1UL; 1109 if (mb_test_bit(block, buddy) || 1110 mb_test_bit(block + 1, buddy)) 1111 break; 1112 1113 /* both the buddies are free, try to coalesce them */ 1114 buddy2 = mb_find_buddy(e4b, order + 1, &max); 1115 1116 if (!buddy2) 1117 break; 1118 1119 if (order > 0) { 1120 /* for special purposes, we don't set 1121 * free bits in bitmap */ 1122 mb_set_bit(block, buddy); 1123 mb_set_bit(block + 1, buddy); 1124 } 1125 e4b->bd_info->bb_counters[order]--; 1126 e4b->bd_info->bb_counters[order]--; 1127 1128 block = block >> 1; 1129 order++; 1130 e4b->bd_info->bb_counters[order]++; 1131 1132 mb_clear_bit(block, buddy2); 1133 buddy = buddy2; 1134 } while (1); 1135 } 1136 mb_check_buddy(e4b); 1137 } 1138 1139 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1140 int needed, struct ext4_free_extent *ex) 1141 { 1142 int next = block; 1143 int max; 1144 int ord; 1145 void *buddy; 1146 1147 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1148 BUG_ON(ex == NULL); 1149 1150 buddy = mb_find_buddy(e4b, order, &max); 1151 BUG_ON(buddy == NULL); 1152 BUG_ON(block >= max); 1153 if (mb_test_bit(block, buddy)) { 1154 ex->fe_len = 0; 1155 ex->fe_start = 0; 1156 ex->fe_group = 0; 1157 return 0; 1158 } 1159 1160 /* FIXME dorp order completely ? */ 1161 if (likely(order == 0)) { 1162 /* find actual order */ 1163 order = mb_find_order_for_block(e4b, block); 1164 block = block >> order; 1165 } 1166 1167 ex->fe_len = 1 << order; 1168 ex->fe_start = block << order; 1169 ex->fe_group = e4b->bd_group; 1170 1171 /* calc difference from given start */ 1172 next = next - ex->fe_start; 1173 ex->fe_len -= next; 1174 ex->fe_start += next; 1175 1176 while (needed > ex->fe_len && 1177 (buddy = mb_find_buddy(e4b, order, &max))) { 1178 1179 if (block + 1 >= max) 1180 break; 1181 1182 next = (block + 1) * (1 << order); 1183 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1184 break; 1185 1186 ord = mb_find_order_for_block(e4b, next); 1187 1188 order = ord; 1189 block = next >> order; 1190 ex->fe_len += 1 << order; 1191 } 1192 1193 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); 1194 return ex->fe_len; 1195 } 1196 1197 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1198 { 1199 int ord; 1200 int mlen = 0; 1201 int max = 0; 1202 int cur; 1203 int start = ex->fe_start; 1204 int len = ex->fe_len; 1205 unsigned ret = 0; 1206 int len0 = len; 1207 void *buddy; 1208 1209 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1210 BUG_ON(e4b->bd_group != ex->fe_group); 1211 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); 1212 mb_check_buddy(e4b); 1213 mb_mark_used_double(e4b, start, len); 1214 1215 e4b->bd_info->bb_free -= len; 1216 if (e4b->bd_info->bb_first_free == start) 1217 e4b->bd_info->bb_first_free += len; 1218 1219 /* let's maintain fragments counter */ 1220 if (start != 0) 1221 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1222 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1223 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1224 if (mlen && max) 1225 e4b->bd_info->bb_fragments++; 1226 else if (!mlen && !max) 1227 e4b->bd_info->bb_fragments--; 1228 1229 /* let's maintain buddy itself */ 1230 while (len) { 1231 ord = 
mb_find_order_for_block(e4b, start); 1232 1233 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1234 /* the whole chunk may be allocated at once! */ 1235 mlen = 1 << ord; 1236 buddy = mb_find_buddy(e4b, ord, &max); 1237 BUG_ON((start >> ord) >= max); 1238 mb_set_bit(start >> ord, buddy); 1239 e4b->bd_info->bb_counters[ord]--; 1240 start += mlen; 1241 len -= mlen; 1242 BUG_ON(len < 0); 1243 continue; 1244 } 1245 1246 /* store for history */ 1247 if (ret == 0) 1248 ret = len | (ord << 16); 1249 1250 /* we have to split large buddy */ 1251 BUG_ON(ord <= 0); 1252 buddy = mb_find_buddy(e4b, ord, &max); 1253 mb_set_bit(start >> ord, buddy); 1254 e4b->bd_info->bb_counters[ord]--; 1255 1256 ord--; 1257 cur = (start >> ord) & ~1U; 1258 buddy = mb_find_buddy(e4b, ord, &max); 1259 mb_clear_bit(cur, buddy); 1260 mb_clear_bit(cur + 1, buddy); 1261 e4b->bd_info->bb_counters[ord]++; 1262 e4b->bd_info->bb_counters[ord]++; 1263 } 1264 1265 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), 1266 EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1267 mb_check_buddy(e4b); 1268 1269 return ret; 1270 } 1271 1272 /* 1273 * Must be called under group lock! 1274 */ 1275 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1276 struct ext4_buddy *e4b) 1277 { 1278 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1279 int ret; 1280 1281 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1282 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1283 1284 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1285 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1286 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1287 1288 /* preallocation can change ac_b_ex, thus we store actually 1289 * allocated blocks for history */ 1290 ac->ac_f_ex = ac->ac_b_ex; 1291 1292 ac->ac_status = AC_STATUS_FOUND; 1293 ac->ac_tail = ret & 0xffff; 1294 ac->ac_buddy = ret >> 16; 1295 1296 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1297 /*FIXME!! Why ? 
*/ 1298 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1299 get_page(ac->ac_bitmap_page); 1300 ac->ac_buddy_page = e4b->bd_buddy_page; 1301 get_page(ac->ac_buddy_page); 1302 1303 /* store last allocated for subsequent stream allocation */ 1304 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1305 spin_lock(&sbi->s_md_lock); 1306 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1307 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1308 spin_unlock(&sbi->s_md_lock); 1309 } 1310 } 1311 1312 /* 1313 * regular allocator, for general purposes allocation 1314 */ 1315 1316 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 1317 struct ext4_buddy *e4b, 1318 int finish_group) 1319 { 1320 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1321 struct ext4_free_extent *bex = &ac->ac_b_ex; 1322 struct ext4_free_extent *gex = &ac->ac_g_ex; 1323 struct ext4_free_extent ex; 1324 int max; 1325 1326 /* 1327 * We don't want to scan for a whole year 1328 */ 1329 if (ac->ac_found > sbi->s_mb_max_to_scan && 1330 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1331 ac->ac_status = AC_STATUS_BREAK; 1332 return; 1333 } 1334 1335 /* 1336 * Haven't found good chunk so far, let's continue 1337 */ 1338 if (bex->fe_len < gex->fe_len) 1339 return; 1340 1341 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 1342 && bex->fe_group == e4b->bd_group) { 1343 /* recheck chunk's availability - we don't know 1344 * when it was found (within this lock-unlock 1345 * period or not) */ 1346 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1347 if (max >= gex->fe_len) { 1348 ext4_mb_use_best_found(ac, e4b); 1349 return; 1350 } 1351 } 1352 } 1353 1354 /* 1355 * The routine checks whether found extent is good enough. If it is, 1356 * then the extent gets marked used and flag is set to the context 1357 * to stop scanning. Otherwise, the extent is compared with the 1358 * previous found extent and if new one is better, then it's stored 1359 * in the context. Later, the best found extent will be used, if 1360 * mballoc can't find good enough extent. 1361 * 1362 * FIXME: real allocation policy is to be designed yet! 
1363 */ 1364 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 1365 struct ext4_free_extent *ex, 1366 struct ext4_buddy *e4b) 1367 { 1368 struct ext4_free_extent *bex = &ac->ac_b_ex; 1369 struct ext4_free_extent *gex = &ac->ac_g_ex; 1370 1371 BUG_ON(ex->fe_len <= 0); 1372 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1373 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1374 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1375 1376 ac->ac_found++; 1377 1378 /* 1379 * The special case - take what you catch first 1380 */ 1381 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1382 *bex = *ex; 1383 ext4_mb_use_best_found(ac, e4b); 1384 return; 1385 } 1386 1387 /* 1388 * Let's check whether the chuck is good enough 1389 */ 1390 if (ex->fe_len == gex->fe_len) { 1391 *bex = *ex; 1392 ext4_mb_use_best_found(ac, e4b); 1393 return; 1394 } 1395 1396 /* 1397 * If this is first found extent, just store it in the context 1398 */ 1399 if (bex->fe_len == 0) { 1400 *bex = *ex; 1401 return; 1402 } 1403 1404 /* 1405 * If new found extent is better, store it in the context 1406 */ 1407 if (bex->fe_len < gex->fe_len) { 1408 /* if the request isn't satisfied, any found extent 1409 * larger than previous best one is better */ 1410 if (ex->fe_len > bex->fe_len) 1411 *bex = *ex; 1412 } else if (ex->fe_len > gex->fe_len) { 1413 /* if the request is satisfied, then we try to find 1414 * an extent that still satisfy the request, but is 1415 * smaller than previous one */ 1416 if (ex->fe_len < bex->fe_len) 1417 *bex = *ex; 1418 } 1419 1420 ext4_mb_check_limits(ac, e4b, 0); 1421 } 1422 1423 static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1424 struct ext4_buddy *e4b) 1425 { 1426 struct ext4_free_extent ex = ac->ac_b_ex; 1427 ext4_group_t group = ex.fe_group; 1428 int max; 1429 int err; 1430 1431 BUG_ON(ex.fe_len <= 0); 1432 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1433 if (err) 1434 return err; 1435 1436 ext4_lock_group(ac->ac_sb, group); 1437 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1438 1439 if (max > 0) { 1440 ac->ac_b_ex = ex; 1441 ext4_mb_use_best_found(ac, e4b); 1442 } 1443 1444 ext4_unlock_group(ac->ac_sb, group); 1445 ext4_mb_release_desc(e4b); 1446 1447 return 0; 1448 } 1449 1450 static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1451 struct ext4_buddy *e4b) 1452 { 1453 ext4_group_t group = ac->ac_g_ex.fe_group; 1454 int max; 1455 int err; 1456 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1457 struct ext4_super_block *es = sbi->s_es; 1458 struct ext4_free_extent ex; 1459 1460 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1461 return 0; 1462 1463 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1464 if (err) 1465 return err; 1466 1467 ext4_lock_group(ac->ac_sb, group); 1468 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1469 ac->ac_g_ex.fe_len, &ex); 1470 1471 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1472 ext4_fsblk_t start; 1473 1474 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1475 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1476 /* use do_div to get remainder (would be 64-bit modulo) */ 1477 if (do_div(start, sbi->s_stripe) == 0) { 1478 ac->ac_found++; 1479 ac->ac_b_ex = ex; 1480 ext4_mb_use_best_found(ac, e4b); 1481 } 1482 } else if (max >= ac->ac_g_ex.fe_len) { 1483 BUG_ON(ex.fe_len <= 0); 1484 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1485 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1486 ac->ac_found++; 1487 ac->ac_b_ex = ex; 1488 
ext4_mb_use_best_found(ac, e4b); 1489 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 1490 /* Sometimes, caller may want to merge even small 1491 * number of blocks to an existing extent */ 1492 BUG_ON(ex.fe_len <= 0); 1493 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1494 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1495 ac->ac_found++; 1496 ac->ac_b_ex = ex; 1497 ext4_mb_use_best_found(ac, e4b); 1498 } 1499 ext4_unlock_group(ac->ac_sb, group); 1500 ext4_mb_release_desc(e4b); 1501 1502 return 0; 1503 } 1504 1505 /* 1506 * The routine scans buddy structures (not bitmap!) from given order 1507 * to max order and tries to find big enough chunk to satisfy the req 1508 */ 1509 static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1510 struct ext4_buddy *e4b) 1511 { 1512 struct super_block *sb = ac->ac_sb; 1513 struct ext4_group_info *grp = e4b->bd_info; 1514 void *buddy; 1515 int i; 1516 int k; 1517 int max; 1518 1519 BUG_ON(ac->ac_2order <= 0); 1520 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1521 if (grp->bb_counters[i] == 0) 1522 continue; 1523 1524 buddy = mb_find_buddy(e4b, i, &max); 1525 BUG_ON(buddy == NULL); 1526 1527 k = mb_find_next_zero_bit(buddy, max, 0); 1528 BUG_ON(k >= max); 1529 1530 ac->ac_found++; 1531 1532 ac->ac_b_ex.fe_len = 1 << i; 1533 ac->ac_b_ex.fe_start = k << i; 1534 ac->ac_b_ex.fe_group = e4b->bd_group; 1535 1536 ext4_mb_use_best_found(ac, e4b); 1537 1538 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 1539 1540 if (EXT4_SB(sb)->s_mb_stats) 1541 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 1542 1543 break; 1544 } 1545 } 1546 1547 /* 1548 * The routine scans the group and measures all found extents. 1549 * In order to optimize scanning, caller must pass number of 1550 * free blocks in the group, so the routine can know upper limit. 1551 */ 1552 static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1553 struct ext4_buddy *e4b) 1554 { 1555 struct super_block *sb = ac->ac_sb; 1556 void *bitmap = EXT4_MB_BITMAP(e4b); 1557 struct ext4_free_extent ex; 1558 int i; 1559 int free; 1560 1561 free = e4b->bd_info->bb_free; 1562 BUG_ON(free <= 0); 1563 1564 i = e4b->bd_info->bb_first_free; 1565 1566 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1567 i = mb_find_next_zero_bit(bitmap, 1568 EXT4_BLOCKS_PER_GROUP(sb), i); 1569 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1570 /* 1571 * IF we have corrupt bitmap, we won't find any 1572 * free blocks even though group info says we 1573 * we have free blocks 1574 */ 1575 ext4_error(sb, __func__, "%d free blocks as per " 1576 "group info. But bitmap says 0\n", 1577 free); 1578 break; 1579 } 1580 1581 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1582 BUG_ON(ex.fe_len <= 0); 1583 if (free < ex.fe_len) { 1584 ext4_error(sb, __func__, "%d free blocks as per " 1585 "group info. But got %d blocks\n", 1586 free, ex.fe_len); 1587 /* 1588 * The number of free blocks differs. This mostly 1589 * indicate that the bitmap is corrupt. So exit 1590 * without claiming the space. 
1591 */ 1592 break; 1593 } 1594 1595 ext4_mb_measure_extent(ac, &ex, e4b); 1596 1597 i += ex.fe_len; 1598 free -= ex.fe_len; 1599 } 1600 1601 ext4_mb_check_limits(ac, e4b, 1); 1602 } 1603 1604 /* 1605 * This is a special case for storages like raid5 1606 * we try to find stripe-aligned chunks for stripe-size requests 1607 * XXX should do so at least for multiples of stripe size as well 1608 */ 1609 static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1610 struct ext4_buddy *e4b) 1611 { 1612 struct super_block *sb = ac->ac_sb; 1613 struct ext4_sb_info *sbi = EXT4_SB(sb); 1614 void *bitmap = EXT4_MB_BITMAP(e4b); 1615 struct ext4_free_extent ex; 1616 ext4_fsblk_t first_group_block; 1617 ext4_fsblk_t a; 1618 ext4_grpblk_t i; 1619 int max; 1620 1621 BUG_ON(sbi->s_stripe == 0); 1622 1623 /* find first stripe-aligned block in group */ 1624 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1625 + le32_to_cpu(sbi->s_es->s_first_data_block); 1626 a = first_group_block + sbi->s_stripe - 1; 1627 do_div(a, sbi->s_stripe); 1628 i = (a * sbi->s_stripe) - first_group_block; 1629 1630 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1631 if (!mb_test_bit(i, bitmap)) { 1632 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1633 if (max >= sbi->s_stripe) { 1634 ac->ac_found++; 1635 ac->ac_b_ex = ex; 1636 ext4_mb_use_best_found(ac, e4b); 1637 break; 1638 } 1639 } 1640 i += sbi->s_stripe; 1641 } 1642 } 1643 1644 static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1645 ext4_group_t group, int cr) 1646 { 1647 unsigned free, fragments; 1648 unsigned i, bits; 1649 struct ext4_group_desc *desc; 1650 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1651 1652 BUG_ON(cr < 0 || cr >= 4); 1653 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1654 1655 free = grp->bb_free; 1656 fragments = grp->bb_fragments; 1657 if (free == 0) 1658 return 0; 1659 if (fragments == 0) 1660 return 0; 1661 1662 switch (cr) { 1663 case 0: 1664 BUG_ON(ac->ac_2order == 0); 1665 /* If this group is uninitialized, skip it initially */ 1666 desc = ext4_get_group_desc(ac->ac_sb, group, NULL); 1667 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1668 return 0; 1669 1670 bits = ac->ac_sb->s_blocksize_bits + 1; 1671 for (i = ac->ac_2order; i <= bits; i++) 1672 if (grp->bb_counters[i] > 0) 1673 return 1; 1674 break; 1675 case 1: 1676 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1677 return 1; 1678 break; 1679 case 2: 1680 if (free >= ac->ac_g_ex.fe_len) 1681 return 1; 1682 break; 1683 case 3: 1684 return 1; 1685 default: 1686 BUG(); 1687 } 1688 1689 return 0; 1690 } 1691 1692 static noinline_for_stack int 1693 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1694 { 1695 ext4_group_t group; 1696 ext4_group_t i; 1697 int cr; 1698 int err = 0; 1699 int bsbits; 1700 struct ext4_sb_info *sbi; 1701 struct super_block *sb; 1702 struct ext4_buddy e4b; 1703 loff_t size, isize; 1704 1705 sb = ac->ac_sb; 1706 sbi = EXT4_SB(sb); 1707 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1708 1709 /* first, try the goal */ 1710 err = ext4_mb_find_by_goal(ac, &e4b); 1711 if (err || ac->ac_status == AC_STATUS_FOUND) 1712 goto out; 1713 1714 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 1715 goto out; 1716 1717 /* 1718 * ac->ac2_order is set only if the fe_len is a power of 2 1719 * if ac2_order is set we also set criteria to 0 so that we 1720 * try exact allocation using buddy. 
1721 */ 1722 i = fls(ac->ac_g_ex.fe_len); 1723 ac->ac_2order = 0; 1724 /* 1725 * We search using buddy data only if the order of the request 1726 * is greater than equal to the sbi_s_mb_order2_reqs 1727 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1728 */ 1729 if (i >= sbi->s_mb_order2_reqs) { 1730 /* 1731 * This should tell if fe_len is exactly power of 2 1732 */ 1733 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 1734 ac->ac_2order = i - 1; 1735 } 1736 1737 bsbits = ac->ac_sb->s_blocksize_bits; 1738 /* if stream allocation is enabled, use global goal */ 1739 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 1740 isize = i_size_read(ac->ac_inode) >> bsbits; 1741 if (size < isize) 1742 size = isize; 1743 1744 if (size < sbi->s_mb_stream_request && 1745 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 1746 /* TBD: may be hot point */ 1747 spin_lock(&sbi->s_md_lock); 1748 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 1749 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1750 spin_unlock(&sbi->s_md_lock); 1751 } 1752 /* Let's just scan groups to find more-less suitable blocks */ 1753 cr = ac->ac_2order ? 0 : 1; 1754 /* 1755 * cr == 0 try to get exact allocation, 1756 * cr == 3 try to get anything 1757 */ 1758 repeat: 1759 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 1760 ac->ac_criteria = cr; 1761 /* 1762 * searching for the right group start 1763 * from the goal value specified 1764 */ 1765 group = ac->ac_g_ex.fe_group; 1766 1767 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 1768 struct ext4_group_info *grp; 1769 struct ext4_group_desc *desc; 1770 1771 if (group == EXT4_SB(sb)->s_groups_count) 1772 group = 0; 1773 1774 /* quick check to skip empty groups */ 1775 grp = ext4_get_group_info(ac->ac_sb, group); 1776 if (grp->bb_free == 0) 1777 continue; 1778 1779 /* 1780 * if the group is already init we check whether it is 1781 * a good group and if not we don't load the buddy 1782 */ 1783 if (EXT4_MB_GRP_NEED_INIT(grp)) { 1784 /* 1785 * we need full data about the group 1786 * to make a good selection 1787 */ 1788 err = ext4_mb_load_buddy(sb, group, &e4b); 1789 if (err) 1790 goto out; 1791 ext4_mb_release_desc(&e4b); 1792 } 1793 1794 /* 1795 * If the particular group doesn't satisfy our 1796 * criteria we continue with the next group 1797 */ 1798 if (!ext4_mb_good_group(ac, group, cr)) 1799 continue; 1800 1801 err = ext4_mb_load_buddy(sb, group, &e4b); 1802 if (err) 1803 goto out; 1804 1805 ext4_lock_group(sb, group); 1806 if (!ext4_mb_good_group(ac, group, cr)) { 1807 /* someone did allocation from this group */ 1808 ext4_unlock_group(sb, group); 1809 ext4_mb_release_desc(&e4b); 1810 continue; 1811 } 1812 1813 ac->ac_groups_scanned++; 1814 desc = ext4_get_group_desc(sb, group, NULL); 1815 if (cr == 0 || (desc->bg_flags & 1816 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && 1817 ac->ac_2order != 0)) 1818 ext4_mb_simple_scan_group(ac, &e4b); 1819 else if (cr == 1 && 1820 ac->ac_g_ex.fe_len == sbi->s_stripe) 1821 ext4_mb_scan_aligned(ac, &e4b); 1822 else 1823 ext4_mb_complex_scan_group(ac, &e4b); 1824 1825 ext4_unlock_group(sb, group); 1826 ext4_mb_release_desc(&e4b); 1827 1828 if (ac->ac_status != AC_STATUS_CONTINUE) 1829 break; 1830 } 1831 } 1832 1833 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 1834 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1835 /* 1836 * We've been searching too long. 
Let's try to allocate 1837 * the best chunk we've found so far 1838 */ 1839 1840 ext4_mb_try_best_found(ac, &e4b); 1841 if (ac->ac_status != AC_STATUS_FOUND) { 1842 /* 1843 * Someone more lucky has already allocated it. 1844 * The only thing we can do is just take first 1845 * found block(s) 1846 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 1847 */ 1848 ac->ac_b_ex.fe_group = 0; 1849 ac->ac_b_ex.fe_start = 0; 1850 ac->ac_b_ex.fe_len = 0; 1851 ac->ac_status = AC_STATUS_CONTINUE; 1852 ac->ac_flags |= EXT4_MB_HINT_FIRST; 1853 cr = 3; 1854 atomic_inc(&sbi->s_mb_lost_chunks); 1855 goto repeat; 1856 } 1857 } 1858 out: 1859 return err; 1860 } 1861 1862 #ifdef EXT4_MB_HISTORY 1863 struct ext4_mb_proc_session { 1864 struct ext4_mb_history *history; 1865 struct super_block *sb; 1866 int start; 1867 int max; 1868 }; 1869 1870 static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, 1871 struct ext4_mb_history *hs, 1872 int first) 1873 { 1874 if (hs == s->history + s->max) 1875 hs = s->history; 1876 if (!first && hs == s->history + s->start) 1877 return NULL; 1878 while (hs->orig.fe_len == 0) { 1879 hs++; 1880 if (hs == s->history + s->max) 1881 hs = s->history; 1882 if (hs == s->history + s->start) 1883 return NULL; 1884 } 1885 return hs; 1886 } 1887 1888 static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) 1889 { 1890 struct ext4_mb_proc_session *s = seq->private; 1891 struct ext4_mb_history *hs; 1892 int l = *pos; 1893 1894 if (l == 0) 1895 return SEQ_START_TOKEN; 1896 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); 1897 if (!hs) 1898 return NULL; 1899 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); 1900 return hs; 1901 } 1902 1903 static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, 1904 loff_t *pos) 1905 { 1906 struct ext4_mb_proc_session *s = seq->private; 1907 struct ext4_mb_history *hs = v; 1908 1909 ++*pos; 1910 if (v == SEQ_START_TOKEN) 1911 return ext4_mb_history_skip_empty(s, s->history + s->start, 1); 1912 else 1913 return ext4_mb_history_skip_empty(s, ++hs, 0); 1914 } 1915 1916 static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) 1917 { 1918 char buf[25], buf2[25], buf3[25], *fmt; 1919 struct ext4_mb_history *hs = v; 1920 1921 if (v == SEQ_START_TOKEN) { 1922 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 1923 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 1924 "pid", "inode", "original", "goal", "result", "found", 1925 "grps", "cr", "flags", "merge", "tail", "broken"); 1926 return 0; 1927 } 1928 1929 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 1930 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 1931 "%-5u %-5s %-5u %-6u\n"; 1932 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 1933 hs->result.fe_start, hs->result.fe_len, 1934 hs->result.fe_logical); 1935 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 1936 hs->orig.fe_start, hs->orig.fe_len, 1937 hs->orig.fe_logical); 1938 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 1939 hs->goal.fe_start, hs->goal.fe_len, 1940 hs->goal.fe_logical); 1941 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 1942 hs->found, hs->groups, hs->cr, hs->flags, 1943 hs->merged ? "M" : "", hs->tail, 1944 hs->buddy ? 
1 << hs->buddy : 0); 1945 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 1946 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 1947 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 1948 hs->result.fe_start, hs->result.fe_len, 1949 hs->result.fe_logical); 1950 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 1951 hs->orig.fe_start, hs->orig.fe_len, 1952 hs->orig.fe_logical); 1953 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 1954 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 1955 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 1956 hs->result.fe_start, hs->result.fe_len); 1957 seq_printf(seq, "%-5u %-8u %-23s discard\n", 1958 hs->pid, hs->ino, buf2); 1959 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 1960 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 1961 hs->result.fe_start, hs->result.fe_len); 1962 seq_printf(seq, "%-5u %-8u %-23s free\n", 1963 hs->pid, hs->ino, buf2); 1964 } 1965 return 0; 1966 } 1967 1968 static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) 1969 { 1970 } 1971 1972 static struct seq_operations ext4_mb_seq_history_ops = { 1973 .start = ext4_mb_seq_history_start, 1974 .next = ext4_mb_seq_history_next, 1975 .stop = ext4_mb_seq_history_stop, 1976 .show = ext4_mb_seq_history_show, 1977 }; 1978 1979 static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) 1980 { 1981 struct super_block *sb = PDE(inode)->data; 1982 struct ext4_sb_info *sbi = EXT4_SB(sb); 1983 struct ext4_mb_proc_session *s; 1984 int rc; 1985 int size; 1986 1987 if (unlikely(sbi->s_mb_history == NULL)) 1988 return -ENOMEM; 1989 s = kmalloc(sizeof(*s), GFP_KERNEL); 1990 if (s == NULL) 1991 return -ENOMEM; 1992 s->sb = sb; 1993 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; 1994 s->history = kmalloc(size, GFP_KERNEL); 1995 if (s->history == NULL) { 1996 kfree(s); 1997 return -ENOMEM; 1998 } 1999 2000 spin_lock(&sbi->s_mb_history_lock); 2001 memcpy(s->history, sbi->s_mb_history, size); 2002 s->max = sbi->s_mb_history_max; 2003 s->start = sbi->s_mb_history_cur % s->max; 2004 spin_unlock(&sbi->s_mb_history_lock); 2005 2006 rc = seq_open(file, &ext4_mb_seq_history_ops); 2007 if (rc == 0) { 2008 struct seq_file *m = (struct seq_file *)file->private_data; 2009 m->private = s; 2010 } else { 2011 kfree(s->history); 2012 kfree(s); 2013 } 2014 return rc; 2015 2016 } 2017 2018 static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) 2019 { 2020 struct seq_file *seq = (struct seq_file *)file->private_data; 2021 struct ext4_mb_proc_session *s = seq->private; 2022 kfree(s->history); 2023 kfree(s); 2024 return seq_release(inode, file); 2025 } 2026 2027 static ssize_t ext4_mb_seq_history_write(struct file *file, 2028 const char __user *buffer, 2029 size_t count, loff_t *ppos) 2030 { 2031 struct seq_file *seq = (struct seq_file *)file->private_data; 2032 struct ext4_mb_proc_session *s = seq->private; 2033 struct super_block *sb = s->sb; 2034 char str[32]; 2035 int value; 2036 2037 if (count >= sizeof(str)) { 2038 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", 2039 "mb_history", (int)sizeof(str)); 2040 return -EOVERFLOW; 2041 } 2042 2043 if (copy_from_user(str, buffer, count)) 2044 return -EFAULT; 2045 2046 value = simple_strtol(str, NULL, 0); 2047 if (value < 0) 2048 return -ERANGE; 2049 EXT4_SB(sb)->s_mb_history_filter = value; 2050 2051 return count; 2052 } 2053 2054 static struct file_operations ext4_mb_seq_history_fops = { 2055 .owner = THIS_MODULE, 2056 .open = ext4_mb_seq_history_open, 2057 .read = seq_read, 2058 .write = 
ext4_mb_seq_history_write, 2059 .llseek = seq_lseek, 2060 .release = ext4_mb_seq_history_release, 2061 }; 2062 2063 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2064 { 2065 struct super_block *sb = seq->private; 2066 struct ext4_sb_info *sbi = EXT4_SB(sb); 2067 ext4_group_t group; 2068 2069 if (*pos < 0 || *pos >= sbi->s_groups_count) 2070 return NULL; 2071 2072 group = *pos + 1; 2073 return (void *) group; 2074 } 2075 2076 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2077 { 2078 struct super_block *sb = seq->private; 2079 struct ext4_sb_info *sbi = EXT4_SB(sb); 2080 ext4_group_t group; 2081 2082 ++*pos; 2083 if (*pos < 0 || *pos >= sbi->s_groups_count) 2084 return NULL; 2085 group = *pos + 1; 2086 return (void *) group;; 2087 } 2088 2089 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2090 { 2091 struct super_block *sb = seq->private; 2092 long group = (long) v; 2093 int i; 2094 int err; 2095 struct ext4_buddy e4b; 2096 struct sg { 2097 struct ext4_group_info info; 2098 unsigned short counters[16]; 2099 } sg; 2100 2101 group--; 2102 if (group == 0) 2103 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2104 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2105 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2106 "group", "free", "frags", "first", 2107 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2108 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2109 2110 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2111 sizeof(struct ext4_group_info); 2112 err = ext4_mb_load_buddy(sb, group, &e4b); 2113 if (err) { 2114 seq_printf(seq, "#%-5lu: I/O error\n", group); 2115 return 0; 2116 } 2117 ext4_lock_group(sb, group); 2118 memcpy(&sg, ext4_get_group_info(sb, group), i); 2119 ext4_unlock_group(sb, group); 2120 ext4_mb_release_desc(&e4b); 2121 2122 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2123 sg.info.bb_fragments, sg.info.bb_first_free); 2124 for (i = 0; i <= 13; i++) 2125 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
2126 sg.info.bb_counters[i] : 0); 2127 seq_printf(seq, " ]\n"); 2128 2129 return 0; 2130 } 2131 2132 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2133 { 2134 } 2135 2136 static struct seq_operations ext4_mb_seq_groups_ops = { 2137 .start = ext4_mb_seq_groups_start, 2138 .next = ext4_mb_seq_groups_next, 2139 .stop = ext4_mb_seq_groups_stop, 2140 .show = ext4_mb_seq_groups_show, 2141 }; 2142 2143 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2144 { 2145 struct super_block *sb = PDE(inode)->data; 2146 int rc; 2147 2148 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2149 if (rc == 0) { 2150 struct seq_file *m = (struct seq_file *)file->private_data; 2151 m->private = sb; 2152 } 2153 return rc; 2154 2155 } 2156 2157 static struct file_operations ext4_mb_seq_groups_fops = { 2158 .owner = THIS_MODULE, 2159 .open = ext4_mb_seq_groups_open, 2160 .read = seq_read, 2161 .llseek = seq_lseek, 2162 .release = seq_release, 2163 }; 2164 2165 static void ext4_mb_history_release(struct super_block *sb) 2166 { 2167 struct ext4_sb_info *sbi = EXT4_SB(sb); 2168 2169 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2170 remove_proc_entry("mb_history", sbi->s_mb_proc); 2171 2172 kfree(sbi->s_mb_history); 2173 } 2174 2175 static void ext4_mb_history_init(struct super_block *sb) 2176 { 2177 struct ext4_sb_info *sbi = EXT4_SB(sb); 2178 int i; 2179 2180 if (sbi->s_mb_proc != NULL) { 2181 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2182 &ext4_mb_seq_history_fops, sb); 2183 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2184 &ext4_mb_seq_groups_fops, sb); 2185 } 2186 2187 sbi->s_mb_history_max = 1000; 2188 sbi->s_mb_history_cur = 0; 2189 spin_lock_init(&sbi->s_mb_history_lock); 2190 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2191 sbi->s_mb_history = kzalloc(i, GFP_KERNEL); 2192 /* if we can't allocate history, then we simple won't use it */ 2193 } 2194 2195 static noinline_for_stack void 2196 ext4_mb_store_history(struct ext4_allocation_context *ac) 2197 { 2198 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2199 struct ext4_mb_history h; 2200 2201 if (unlikely(sbi->s_mb_history == NULL)) 2202 return; 2203 2204 if (!(ac->ac_op & sbi->s_mb_history_filter)) 2205 return; 2206 2207 h.op = ac->ac_op; 2208 h.pid = current->pid; 2209 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; 2210 h.orig = ac->ac_o_ex; 2211 h.result = ac->ac_b_ex; 2212 h.flags = ac->ac_flags; 2213 h.found = ac->ac_found; 2214 h.groups = ac->ac_groups_scanned; 2215 h.cr = ac->ac_criteria; 2216 h.tail = ac->ac_tail; 2217 h.buddy = ac->ac_buddy; 2218 h.merged = 0; 2219 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { 2220 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 2221 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 2222 h.merged = 1; 2223 h.goal = ac->ac_g_ex; 2224 h.result = ac->ac_f_ex; 2225 } 2226 2227 spin_lock(&sbi->s_mb_history_lock); 2228 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); 2229 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) 2230 sbi->s_mb_history_cur = 0; 2231 spin_unlock(&sbi->s_mb_history_lock); 2232 } 2233 2234 #else 2235 #define ext4_mb_history_release(sb) 2236 #define ext4_mb_history_init(sb) 2237 #endif 2238 2239 2240 /* Create and initialize ext4_group_info data for the given group. 
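 * The group_info structures live in a two-level table; as a sketch of the
 * lookup done below:
 *   grp = sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]
 *                          [group & (EXT4_DESC_PER_BLOCK(sb) - 1)];
 * A new block of pointers therefore only has to be allocated when 'group'
 * is the first group described by a descriptor block.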
*/ 2241 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2242 struct ext4_group_desc *desc) 2243 { 2244 int i, len; 2245 int metalen = 0; 2246 struct ext4_sb_info *sbi = EXT4_SB(sb); 2247 struct ext4_group_info **meta_group_info; 2248 2249 /* 2250 * First check if this group is the first of a reserved block. 2251 * If it's true, we have to allocate a new table of pointers 2252 * to ext4_group_info structures 2253 */ 2254 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2255 metalen = sizeof(*meta_group_info) << 2256 EXT4_DESC_PER_BLOCK_BITS(sb); 2257 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2258 if (meta_group_info == NULL) { 2259 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2260 "buddy group\n"); 2261 goto exit_meta_group_info; 2262 } 2263 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2264 meta_group_info; 2265 } 2266 2267 /* 2268 * calculate needed size. if change bb_counters size, 2269 * don't forget about ext4_mb_generate_buddy() 2270 */ 2271 len = offsetof(typeof(**meta_group_info), 2272 bb_counters[sb->s_blocksize_bits + 2]); 2273 2274 meta_group_info = 2275 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2276 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2277 2278 meta_group_info[i] = kzalloc(len, GFP_KERNEL); 2279 if (meta_group_info[i] == NULL) { 2280 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2281 goto exit_group_info; 2282 } 2283 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2284 &(meta_group_info[i]->bb_state)); 2285 2286 /* 2287 * initialize bb_free to be able to skip 2288 * empty groups without initialization 2289 */ 2290 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2291 meta_group_info[i]->bb_free = 2292 ext4_free_blocks_after_init(sb, group, desc); 2293 } else { 2294 meta_group_info[i]->bb_free = 2295 le16_to_cpu(desc->bg_free_blocks_count); 2296 } 2297 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2299 2300 #ifdef DOUBLE_CHECK 2301 { 2302 struct buffer_head *bh; 2303 meta_group_info[i]->bb_bitmap = 2304 kmalloc(sb->s_blocksize, GFP_KERNEL); 2305 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2306 bh = ext4_read_block_bitmap(sb, group); 2307 BUG_ON(bh == NULL); 2308 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2309 sb->s_blocksize); 2310 put_bh(bh); 2311 } 2312 #endif 2313 2314 return 0; 2315 2316 exit_group_info: 2317 /* If a meta_group_info table has been allocated, release it now */ 2318 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2319 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2320 exit_meta_group_info: 2321 return -ENOMEM; 2322 } /* ext4_mb_add_groupinfo */ 2323 2324 /* 2325 * Add a group to the existing groups. 
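 * (besides creating the ext4_group_info via ext4_mb_add_groupinfo(), it
 * invalidates any cached buddy-cache pages for the new group: group G's
 * buddy data sits at buddy-cache block 2*G and its bitmap at 2*G + 1, so
 * the pages (2*G)/blocks_per_page and (2*G + 1)/blocks_per_page are marked
 * not uptodate below)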
2326 * This function is used for online resize 2327 */ 2328 int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, 2329 struct ext4_group_desc *desc) 2330 { 2331 struct ext4_sb_info *sbi = EXT4_SB(sb); 2332 struct inode *inode = sbi->s_buddy_cache; 2333 int blocks_per_page; 2334 int block; 2335 int pnum; 2336 struct page *page; 2337 int err; 2338 2339 /* Add group based on group descriptor*/ 2340 err = ext4_mb_add_groupinfo(sb, group, desc); 2341 if (err) 2342 return err; 2343 2344 /* 2345 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap 2346 * datas) are set not up to date so that they will be re-initilaized 2347 * during the next call to ext4_mb_load_buddy 2348 */ 2349 2350 /* Set buddy page as not up to date */ 2351 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 2352 block = group * 2; 2353 pnum = block / blocks_per_page; 2354 page = find_get_page(inode->i_mapping, pnum); 2355 if (page != NULL) { 2356 ClearPageUptodate(page); 2357 page_cache_release(page); 2358 } 2359 2360 /* Set bitmap page as not up to date */ 2361 block++; 2362 pnum = block / blocks_per_page; 2363 page = find_get_page(inode->i_mapping, pnum); 2364 if (page != NULL) { 2365 ClearPageUptodate(page); 2366 page_cache_release(page); 2367 } 2368 2369 return 0; 2370 } 2371 2372 /* 2373 * Update an existing group. 2374 * This function is used for online resize 2375 */ 2376 void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) 2377 { 2378 grp->bb_free += add; 2379 } 2380 2381 static int ext4_mb_init_backend(struct super_block *sb) 2382 { 2383 ext4_group_t i; 2384 int metalen; 2385 struct ext4_sb_info *sbi = EXT4_SB(sb); 2386 struct ext4_super_block *es = sbi->s_es; 2387 int num_meta_group_infos; 2388 int num_meta_group_infos_max; 2389 int array_size; 2390 struct ext4_group_info **meta_group_info; 2391 struct ext4_group_desc *desc; 2392 2393 /* This is the number of blocks used by GDT */ 2394 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 2395 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2396 2397 /* 2398 * This is the total number of blocks used by GDT including 2399 * the number of reserved blocks for GDT. 2400 * The s_group_info array is allocated with this value 2401 * to allow a clean online resize without a complex 2402 * manipulation of pointer. 2403 * The drawback is the unused memory when no resize 2404 * occurs but it's very low in terms of pages 2405 * (see comments below) 2406 * Need to handle this properly when META_BG resizing is allowed 2407 */ 2408 num_meta_group_infos_max = num_meta_group_infos + 2409 le16_to_cpu(es->s_reserved_gdt_blocks); 2410 2411 /* 2412 * array_size is the size of s_group_info array. We round it 2413 * to the next power of two because this approximation is done 2414 * internally by kmalloc so we can have some more memory 2415 * for free here (e.g. may be used for META_BG resize). 2416 */ 2417 array_size = 1; 2418 while (array_size < sizeof(*sbi->s_group_info) * 2419 num_meta_group_infos_max) 2420 array_size = array_size << 1; 2421 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2422 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2423 * So a two level scheme suffices for now. 
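 * Worked example (illustrative, assuming 4KB blocks and 32-byte group
 * descriptors): an 8TB filesystem has 65536 groups and 128 descriptors per
 * block, so 512 meta_group_info pointers (4096 bytes with 64-bit pointers)
 * are needed; the loop above rounds array_size up to a power of two,
 * which is 4096 in this case.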
*/ 2424 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2425 if (sbi->s_group_info == NULL) { 2426 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2427 return -ENOMEM; 2428 } 2429 sbi->s_buddy_cache = new_inode(sb); 2430 if (sbi->s_buddy_cache == NULL) { 2431 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2432 goto err_freesgi; 2433 } 2434 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2435 2436 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); 2437 for (i = 0; i < num_meta_group_infos; i++) { 2438 if ((i + 1) == num_meta_group_infos) 2439 metalen = sizeof(*meta_group_info) * 2440 (sbi->s_groups_count - 2441 (i << EXT4_DESC_PER_BLOCK_BITS(sb))); 2442 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2443 if (meta_group_info == NULL) { 2444 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2445 "buddy group\n"); 2446 goto err_freemeta; 2447 } 2448 sbi->s_group_info[i] = meta_group_info; 2449 } 2450 2451 for (i = 0; i < sbi->s_groups_count; i++) { 2452 desc = ext4_get_group_desc(sb, i, NULL); 2453 if (desc == NULL) { 2454 printk(KERN_ERR 2455 "EXT4-fs: can't read descriptor %lu\n", i); 2456 goto err_freebuddy; 2457 } 2458 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2459 goto err_freebuddy; 2460 } 2461 2462 return 0; 2463 2464 err_freebuddy: 2465 while (i-- > 0) 2466 kfree(ext4_get_group_info(sb, i)); 2467 i = num_meta_group_infos; 2468 err_freemeta: 2469 while (i-- > 0) 2470 kfree(sbi->s_group_info[i]); 2471 iput(sbi->s_buddy_cache); 2472 err_freesgi: 2473 kfree(sbi->s_group_info); 2474 return -ENOMEM; 2475 } 2476 2477 int ext4_mb_init(struct super_block *sb, int needs_recovery) 2478 { 2479 struct ext4_sb_info *sbi = EXT4_SB(sb); 2480 unsigned i; 2481 unsigned offset; 2482 unsigned max; 2483 int ret; 2484 2485 if (!test_opt(sb, MBALLOC)) 2486 return 0; 2487 2488 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2489 2490 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2491 if (sbi->s_mb_offsets == NULL) { 2492 clear_opt(sbi->s_mount_opt, MBALLOC); 2493 return -ENOMEM; 2494 } 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2496 if (sbi->s_mb_maxs == NULL) { 2497 clear_opt(sbi->s_mount_opt, MBALLOC); 2498 kfree(sbi->s_mb_maxs); 2499 return -ENOMEM; 2500 } 2501 2502 /* order 0 is regular bitmap */ 2503 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2504 sbi->s_mb_offsets[0] = 0; 2505 2506 i = 1; 2507 offset = 0; 2508 max = sb->s_blocksize << 2; 2509 do { 2510 sbi->s_mb_offsets[i] = offset; 2511 sbi->s_mb_maxs[i] = max; 2512 offset += 1 << (sb->s_blocksize_bits - i); 2513 max = max >> 1; 2514 i++; 2515 } while (i <= sb->s_blocksize_bits + 1); 2516 2517 /* init file for buddy data */ 2518 ret = ext4_mb_init_backend(sb); 2519 if (ret != 0) { 2520 clear_opt(sbi->s_mount_opt, MBALLOC); 2521 kfree(sbi->s_mb_offsets); 2522 kfree(sbi->s_mb_maxs); 2523 return ret; 2524 } 2525 2526 spin_lock_init(&sbi->s_md_lock); 2527 INIT_LIST_HEAD(&sbi->s_active_transaction); 2528 INIT_LIST_HEAD(&sbi->s_closed_transaction); 2529 INIT_LIST_HEAD(&sbi->s_committed_transaction); 2530 spin_lock_init(&sbi->s_bal_lock); 2531 2532 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2533 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2534 sbi->s_mb_stats = MB_DEFAULT_STATS; 2535 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2536 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2537 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2538 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2539 2540 i = sizeof(struct ext4_locality_group) * NR_CPUS; 2541 sbi->s_locality_groups = 
kmalloc(i, GFP_KERNEL); 2542 if (sbi->s_locality_groups == NULL) { 2543 clear_opt(sbi->s_mount_opt, MBALLOC); 2544 kfree(sbi->s_mb_offsets); 2545 kfree(sbi->s_mb_maxs); 2546 return -ENOMEM; 2547 } 2548 for (i = 0; i < NR_CPUS; i++) { 2549 struct ext4_locality_group *lg; 2550 lg = &sbi->s_locality_groups[i]; 2551 mutex_init(&lg->lg_mutex); 2552 INIT_LIST_HEAD(&lg->lg_prealloc_list); 2553 spin_lock_init(&lg->lg_prealloc_lock); 2554 } 2555 2556 ext4_mb_init_per_dev_proc(sb); 2557 ext4_mb_history_init(sb); 2558 2559 printk("EXT4-fs: mballoc enabled\n"); 2560 return 0; 2561 } 2562 2563 /* need to called with ext4 group lock (ext4_lock_group) */ 2564 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2565 { 2566 struct ext4_prealloc_space *pa; 2567 struct list_head *cur, *tmp; 2568 int count = 0; 2569 2570 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2571 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2572 list_del(&pa->pa_group_list); 2573 count++; 2574 kfree(pa); 2575 } 2576 if (count) 2577 mb_debug("mballoc: %u PAs left\n", count); 2578 2579 } 2580 2581 int ext4_mb_release(struct super_block *sb) 2582 { 2583 ext4_group_t i; 2584 int num_meta_group_infos; 2585 struct ext4_group_info *grinfo; 2586 struct ext4_sb_info *sbi = EXT4_SB(sb); 2587 2588 if (!test_opt(sb, MBALLOC)) 2589 return 0; 2590 2591 /* release freed, non-committed blocks */ 2592 spin_lock(&sbi->s_md_lock); 2593 list_splice_init(&sbi->s_closed_transaction, 2594 &sbi->s_committed_transaction); 2595 list_splice_init(&sbi->s_active_transaction, 2596 &sbi->s_committed_transaction); 2597 spin_unlock(&sbi->s_md_lock); 2598 ext4_mb_free_committed_blocks(sb); 2599 2600 if (sbi->s_group_info) { 2601 for (i = 0; i < sbi->s_groups_count; i++) { 2602 grinfo = ext4_get_group_info(sb, i); 2603 #ifdef DOUBLE_CHECK 2604 kfree(grinfo->bb_bitmap); 2605 #endif 2606 ext4_lock_group(sb, i); 2607 ext4_mb_cleanup_pa(grinfo); 2608 ext4_unlock_group(sb, i); 2609 kfree(grinfo); 2610 } 2611 num_meta_group_infos = (sbi->s_groups_count + 2612 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2613 EXT4_DESC_PER_BLOCK_BITS(sb); 2614 for (i = 0; i < num_meta_group_infos; i++) 2615 kfree(sbi->s_group_info[i]); 2616 kfree(sbi->s_group_info); 2617 } 2618 kfree(sbi->s_mb_offsets); 2619 kfree(sbi->s_mb_maxs); 2620 if (sbi->s_buddy_cache) 2621 iput(sbi->s_buddy_cache); 2622 if (sbi->s_mb_stats) { 2623 printk(KERN_INFO 2624 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2625 atomic_read(&sbi->s_bal_allocated), 2626 atomic_read(&sbi->s_bal_reqs), 2627 atomic_read(&sbi->s_bal_success)); 2628 printk(KERN_INFO 2629 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2630 "%u 2^N hits, %u breaks, %u lost\n", 2631 atomic_read(&sbi->s_bal_ex_scanned), 2632 atomic_read(&sbi->s_bal_goals), 2633 atomic_read(&sbi->s_bal_2orders), 2634 atomic_read(&sbi->s_bal_breaks), 2635 atomic_read(&sbi->s_mb_lost_chunks)); 2636 printk(KERN_INFO 2637 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2638 sbi->s_mb_buddies_generated++, 2639 sbi->s_mb_generation_time); 2640 printk(KERN_INFO 2641 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2642 atomic_read(&sbi->s_mb_preallocated), 2643 atomic_read(&sbi->s_mb_discarded)); 2644 } 2645 2646 kfree(sbi->s_locality_groups); 2647 2648 ext4_mb_history_release(sb); 2649 ext4_mb_destroy_per_dev_proc(sb); 2650 2651 return 0; 2652 } 2653 2654 static noinline_for_stack void 2655 ext4_mb_free_committed_blocks(struct super_block *sb) 2656 { 2657 struct ext4_sb_info *sbi = EXT4_SB(sb); 2658 int err; 2659 int i; 2660 int 
count = 0; 2661 int count2 = 0; 2662 struct ext4_free_metadata *md; 2663 struct ext4_buddy e4b; 2664 2665 if (list_empty(&sbi->s_committed_transaction)) 2666 return; 2667 2668 /* there is committed blocks to be freed yet */ 2669 do { 2670 /* get next array of blocks */ 2671 md = NULL; 2672 spin_lock(&sbi->s_md_lock); 2673 if (!list_empty(&sbi->s_committed_transaction)) { 2674 md = list_entry(sbi->s_committed_transaction.next, 2675 struct ext4_free_metadata, list); 2676 list_del(&md->list); 2677 } 2678 spin_unlock(&sbi->s_md_lock); 2679 2680 if (md == NULL) 2681 break; 2682 2683 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2684 md->num, md->group, md); 2685 2686 err = ext4_mb_load_buddy(sb, md->group, &e4b); 2687 /* we expect to find existing buddy because it's pinned */ 2688 BUG_ON(err != 0); 2689 2690 /* there are blocks to put in buddy to make them really free */ 2691 count += md->num; 2692 count2++; 2693 ext4_lock_group(sb, md->group); 2694 for (i = 0; i < md->num; i++) { 2695 mb_debug(" %u", md->blocks[i]); 2696 mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2697 } 2698 mb_debug("\n"); 2699 ext4_unlock_group(sb, md->group); 2700 2701 /* balance refcounts from ext4_mb_free_metadata() */ 2702 page_cache_release(e4b.bd_buddy_page); 2703 page_cache_release(e4b.bd_bitmap_page); 2704 2705 kfree(md); 2706 ext4_mb_release_desc(&e4b); 2707 2708 } while (md); 2709 2710 mb_debug("freed %u blocks in %u structures\n", count, count2); 2711 } 2712 2713 #define EXT4_MB_STATS_NAME "stats" 2714 #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" 2715 #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" 2716 #define EXT4_MB_ORDER2_REQ "order2_req" 2717 #define EXT4_MB_STREAM_REQ "stream_req" 2718 #define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2719 2720 2721 2722 #define MB_PROC_FOPS(name) \ 2723 static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ 2724 { \ 2725 struct ext4_sb_info *sbi = m->private; \ 2726 \ 2727 seq_printf(m, "%ld\n", sbi->s_mb_##name); \ 2728 return 0; \ 2729 } \ 2730 \ 2731 static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ 2732 { \ 2733 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ 2734 } \ 2735 \ 2736 static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ 2737 const char __user *buf, size_t cnt, loff_t *ppos) \ 2738 { \ 2739 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ 2740 char str[32]; \ 2741 long value; \ 2742 if (cnt >= sizeof(str)) \ 2743 return -EINVAL; \ 2744 if (copy_from_user(str, buf, cnt)) \ 2745 return -EFAULT; \ 2746 value = simple_strtol(str, NULL, 0); \ 2747 if (value <= 0) \ 2748 return -ERANGE; \ 2749 sbi->s_mb_##name = value; \ 2750 return cnt; \ 2751 } \ 2752 \ 2753 static const struct file_operations ext4_mb_##name##_proc_fops = { \ 2754 .owner = THIS_MODULE, \ 2755 .open = ext4_mb_##name##_proc_open, \ 2756 .read = seq_read, \ 2757 .llseek = seq_lseek, \ 2758 .release = single_release, \ 2759 .write = ext4_mb_##name##_proc_write, \ 2760 }; 2761 2762 MB_PROC_FOPS(stats); 2763 MB_PROC_FOPS(max_to_scan); 2764 MB_PROC_FOPS(min_to_scan); 2765 MB_PROC_FOPS(order2_reqs); 2766 MB_PROC_FOPS(stream_request); 2767 MB_PROC_FOPS(group_prealloc); 2768 2769 #define MB_PROC_HANDLER(name, var) \ 2770 do { \ 2771 proc = proc_create_data(name, mode, sbi->s_mb_proc, \ 2772 &ext4_mb_##var##_proc_fops, sbi); \ 2773 if (proc == NULL) { \ 2774 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ 2775 goto err_out; \ 2776 } \ 2777 } while (0) 2778 2779 static int 
ext4_mb_init_per_dev_proc(struct super_block *sb) 2780 { 2781 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2782 struct ext4_sb_info *sbi = EXT4_SB(sb); 2783 struct proc_dir_entry *proc; 2784 char devname[64]; 2785 2786 if (proc_root_ext4 == NULL) { 2787 sbi->s_mb_proc = NULL; 2788 return -EINVAL; 2789 } 2790 bdevname(sb->s_bdev, devname); 2791 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); 2792 2793 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); 2794 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); 2795 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); 2796 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); 2797 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); 2798 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); 2799 2800 return 0; 2801 2802 err_out: 2803 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2804 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2805 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2806 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2807 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2808 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2809 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); 2810 remove_proc_entry(devname, proc_root_ext4); 2811 sbi->s_mb_proc = NULL; 2812 2813 return -ENOMEM; 2814 } 2815 2816 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2817 { 2818 struct ext4_sb_info *sbi = EXT4_SB(sb); 2819 char devname[64]; 2820 2821 if (sbi->s_mb_proc == NULL) 2822 return -EINVAL; 2823 2824 bdevname(sb->s_bdev, devname); 2825 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2826 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2827 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2828 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2829 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2830 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); 2831 remove_proc_entry(devname, proc_root_ext4); 2832 2833 return 0; 2834 } 2835 2836 int __init init_ext4_mballoc(void) 2837 { 2838 ext4_pspace_cachep = 2839 kmem_cache_create("ext4_prealloc_space", 2840 sizeof(struct ext4_prealloc_space), 2841 0, SLAB_RECLAIM_ACCOUNT, NULL); 2842 if (ext4_pspace_cachep == NULL) 2843 return -ENOMEM; 2844 2845 ext4_ac_cachep = 2846 kmem_cache_create("ext4_alloc_context", 2847 sizeof(struct ext4_allocation_context), 2848 0, SLAB_RECLAIM_ACCOUNT, NULL); 2849 if (ext4_ac_cachep == NULL) { 2850 kmem_cache_destroy(ext4_pspace_cachep); 2851 return -ENOMEM; 2852 } 2853 #ifdef CONFIG_PROC_FS 2854 proc_root_ext4 = proc_mkdir("fs/ext4", NULL); 2855 if (proc_root_ext4 == NULL) 2856 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); 2857 #endif 2858 return 0; 2859 } 2860 2861 void exit_ext4_mballoc(void) 2862 { 2863 /* XXX: synchronize_rcu(); */ 2864 kmem_cache_destroy(ext4_pspace_cachep); 2865 kmem_cache_destroy(ext4_ac_cachep); 2866 #ifdef CONFIG_PROC_FS 2867 remove_proc_entry("fs/ext4", NULL); 2868 #endif 2869 } 2870 2871 2872 /* 2873 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps 2874 * Returns 0 if success or error code 2875 */ 2876 static noinline_for_stack int 2877 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2878 handle_t *handle) 2879 { 2880 struct buffer_head *bitmap_bh = NULL; 2881 struct ext4_super_block *es; 2882 struct ext4_group_desc *gdp; 2883 struct buffer_head *gdp_bh; 2884 struct ext4_sb_info *sbi; 2885 struct super_block *sb; 2886 ext4_fsblk_t block; 2887 int err, len; 2888 
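	/*
	 * Sketch of what follows: the chosen extent ac->ac_b_ex is turned into
	 * an absolute block number,
	 *   block = fe_group * EXT4_BLOCKS_PER_GROUP(sb) + fe_start
	 *           + le32_to_cpu(es->s_first_data_block);
	 * for example (illustrative), with 32768 blocks per group, group 5,
	 * offset 100 and s_first_data_block == 0 this is block 163940. The
	 * range is checked against the group's own metadata, set in the
	 * on-disk bitmap, and the free-block counters in the group descriptor
	 * are updated under sb_bgl_lock().
	 */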
2889 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 2890 BUG_ON(ac->ac_b_ex.fe_len <= 0); 2891 2892 sb = ac->ac_sb; 2893 sbi = EXT4_SB(sb); 2894 es = sbi->s_es; 2895 2896 2897 err = -EIO; 2898 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2899 if (!bitmap_bh) 2900 goto out_err; 2901 2902 err = ext4_journal_get_write_access(handle, bitmap_bh); 2903 if (err) 2904 goto out_err; 2905 2906 err = -EIO; 2907 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 2908 if (!gdp) 2909 goto out_err; 2910 2911 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 2912 gdp->bg_free_blocks_count); 2913 2914 err = ext4_journal_get_write_access(handle, gdp_bh); 2915 if (err) 2916 goto out_err; 2917 2918 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2919 + ac->ac_b_ex.fe_start 2920 + le32_to_cpu(es->s_first_data_block); 2921 2922 len = ac->ac_b_ex.fe_len; 2923 if (in_range(ext4_block_bitmap(sb, gdp), block, len) || 2924 in_range(ext4_inode_bitmap(sb, gdp), block, len) || 2925 in_range(block, ext4_inode_table(sb, gdp), 2926 EXT4_SB(sb)->s_itb_per_group) || 2927 in_range(block + len - 1, ext4_inode_table(sb, gdp), 2928 EXT4_SB(sb)->s_itb_per_group)) { 2929 ext4_error(sb, __func__, 2930 "Allocating block in system zone - block = %llu", 2931 block); 2932 /* File system mounted not to panic on error 2933 * Fix the bitmap and repeat the block allocation 2934 * We leak some of the blocks here. 2935 */ 2936 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 2937 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2938 ac->ac_b_ex.fe_len); 2939 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2940 if (!err) 2941 err = -EAGAIN; 2942 goto out_err; 2943 } 2944 #ifdef AGGRESSIVE_CHECK 2945 { 2946 int i; 2947 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 2948 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 2949 bitmap_bh->b_data)); 2950 } 2951 } 2952 #endif 2953 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, 2954 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); 2955 2956 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2957 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2958 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2959 gdp->bg_free_blocks_count = 2960 cpu_to_le16(ext4_free_blocks_after_init(sb, 2961 ac->ac_b_ex.fe_group, 2962 gdp)); 2963 } 2964 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2965 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2966 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2967 2968 /* 2969 * free blocks account has already be reduced/reserved 2970 * at write_begin() time for delayed allocation 2971 * do not double accounting 2972 */ 2973 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2974 percpu_counter_sub(&sbi->s_freeblocks_counter, 2975 ac->ac_b_ex.fe_len); 2976 2977 if (sbi->s_log_groups_per_flex) { 2978 ext4_group_t flex_group = ext4_flex_group(sbi, 2979 ac->ac_b_ex.fe_group); 2980 spin_lock(sb_bgl_lock(sbi, flex_group)); 2981 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; 2982 spin_unlock(sb_bgl_lock(sbi, flex_group)); 2983 } 2984 2985 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2986 if (err) 2987 goto out_err; 2988 err = ext4_journal_dirty_metadata(handle, gdp_bh); 2989 2990 out_err: 2991 sb->s_dirt = 1; 2992 brelse(bitmap_bh); 2993 return err; 2994 } 2995 2996 /* 2997 * here we normalize request for locality group 2998 * Group request are normalized to s_strip size if we set the same via mount 2999 * option. 
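 * For example (illustrative): mounting the filesystem with stripe=16 sets
 * sbi->s_stripe to 16, so every locality-group request below is normalized
 * to 16 blocks.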
If not, we set it to s_mb_group_prealloc, which can be configured via 3000 * /proc/fs/ext4/<partition>/group_prealloc 3001 * 3002 * XXX: should we try to preallocate more than the group has now? 3003 */ 3004 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 3005 { 3006 struct super_block *sb = ac->ac_sb; 3007 struct ext4_locality_group *lg = ac->ac_lg; 3008 3009 BUG_ON(lg == NULL); 3010 if (EXT4_SB(sb)->s_stripe) 3011 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3012 else 3013 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3014 mb_debug("#%u: goal %u blocks for locality group\n", 3015 current->pid, ac->ac_g_ex.fe_len); 3016 } 3017 3018 /* 3019 * Normalization means making the request better in terms of 3020 * size and alignment 3021 */ 3022 static noinline_for_stack void 3023 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 3024 struct ext4_allocation_request *ar) 3025 { 3026 int bsbits, max; 3027 ext4_lblk_t end; 3028 loff_t size, orig_size, start_off; 3029 ext4_lblk_t start, orig_start; 3030 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3031 struct ext4_prealloc_space *pa; 3032 3033 /* only normalize data requests; metadata requests 3034 do not need preallocation */ 3035 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3036 return; 3037 3038 /* sometimes the caller may want exactly the requested blocks */ 3039 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3040 return; 3041 3042 /* the caller may indicate that preallocation isn't 3043 * required (it's a tail, for example) */ 3044 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 3045 return; 3046 3047 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 3048 ext4_mb_normalize_group_request(ac); 3049 return; 3050 } 3051 3052 bsbits = ac->ac_sb->s_blocksize_bits; 3053 3054 /* first, let's learn the actual file size 3055 * assuming the current request is allocated */ 3056 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3057 size = size << bsbits; 3058 if (size < i_size_read(ac->ac_inode)) 3059 size = i_size_read(ac->ac_inode); 3060 3061 /* max size of free chunks */ 3062 max = 2 << bsbits; 3063 3064 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 3065 (req <= (size) || max <= (chunk_size)) 3066 3067 /* first, try to predict the file size */ 3068 /* XXX: should this table be tunable?
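 * (illustrative: a write that brings the file to 150KB is normalized to a
 * 256KB request; one that brings it to about 3MB falls into the 2MB bucket
 * below and also gets its start aligned to a 2MB boundary via start_off)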
*/ 3069 start_off = 0; 3070 if (size <= 16 * 1024) { 3071 size = 16 * 1024; 3072 } else if (size <= 32 * 1024) { 3073 size = 32 * 1024; 3074 } else if (size <= 64 * 1024) { 3075 size = 64 * 1024; 3076 } else if (size <= 128 * 1024) { 3077 size = 128 * 1024; 3078 } else if (size <= 256 * 1024) { 3079 size = 256 * 1024; 3080 } else if (size <= 512 * 1024) { 3081 size = 512 * 1024; 3082 } else if (size <= 1024 * 1024) { 3083 size = 1024 * 1024; 3084 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 3085 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3086 (21 - bsbits)) << 21; 3087 size = 2 * 1024 * 1024; 3088 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 3089 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3090 (22 - bsbits)) << 22; 3091 size = 4 * 1024 * 1024; 3092 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 3093 (8<<20)>>bsbits, max, 8 * 1024)) { 3094 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 3095 (23 - bsbits)) << 23; 3096 size = 8 * 1024 * 1024; 3097 } else { 3098 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3099 size = ac->ac_o_ex.fe_len << bsbits; 3100 } 3101 orig_size = size = size >> bsbits; 3102 orig_start = start = start_off >> bsbits; 3103 3104 /* don't cover already allocated blocks in selected range */ 3105 if (ar->pleft && start <= ar->lleft) { 3106 size -= ar->lleft + 1 - start; 3107 start = ar->lleft + 1; 3108 } 3109 if (ar->pright && start + size - 1 >= ar->lright) 3110 size -= start + size - ar->lright; 3111 3112 end = start + size; 3113 3114 /* check we don't cross already preallocated blocks */ 3115 rcu_read_lock(); 3116 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3117 unsigned long pa_end; 3118 3119 if (pa->pa_deleted) 3120 continue; 3121 spin_lock(&pa->pa_lock); 3122 if (pa->pa_deleted) { 3123 spin_unlock(&pa->pa_lock); 3124 continue; 3125 } 3126 3127 pa_end = pa->pa_lstart + pa->pa_len; 3128 3129 /* PA must not overlap original request */ 3130 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3131 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3132 3133 /* skip PA normalized request doesn't overlap with */ 3134 if (pa->pa_lstart >= end) { 3135 spin_unlock(&pa->pa_lock); 3136 continue; 3137 } 3138 if (pa_end <= start) { 3139 spin_unlock(&pa->pa_lock); 3140 continue; 3141 } 3142 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3143 3144 if (pa_end <= ac->ac_o_ex.fe_logical) { 3145 BUG_ON(pa_end < start); 3146 start = pa_end; 3147 } 3148 3149 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3150 BUG_ON(pa->pa_lstart > end); 3151 end = pa->pa_lstart; 3152 } 3153 spin_unlock(&pa->pa_lock); 3154 } 3155 rcu_read_unlock(); 3156 size = end - start; 3157 3158 /* XXX: extra loop to check we really don't overlap preallocations */ 3159 rcu_read_lock(); 3160 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3161 unsigned long pa_end; 3162 spin_lock(&pa->pa_lock); 3163 if (pa->pa_deleted == 0) { 3164 pa_end = pa->pa_lstart + pa->pa_len; 3165 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3166 } 3167 spin_unlock(&pa->pa_lock); 3168 } 3169 rcu_read_unlock(); 3170 3171 if (start + size <= ac->ac_o_ex.fe_logical && 3172 start > ac->ac_o_ex.fe_logical) { 3173 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3174 (unsigned long) start, (unsigned long) size, 3175 (unsigned long) ac->ac_o_ex.fe_logical); 3176 } 3177 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3178 start > ac->ac_o_ex.fe_logical); 3179 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3180 3181 /* now prepare goal 
request */ 3182 3183 /* XXX: is it better to align blocks WRT to logical 3184 * placement or satisfy big request as is */ 3185 ac->ac_g_ex.fe_logical = start; 3186 ac->ac_g_ex.fe_len = size; 3187 3188 /* define goal start in order to merge */ 3189 if (ar->pright && (ar->lright == (start + size))) { 3190 /* merge to the right */ 3191 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3192 &ac->ac_f_ex.fe_group, 3193 &ac->ac_f_ex.fe_start); 3194 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3195 } 3196 if (ar->pleft && (ar->lleft + 1 == start)) { 3197 /* merge to the left */ 3198 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3199 &ac->ac_f_ex.fe_group, 3200 &ac->ac_f_ex.fe_start); 3201 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3202 } 3203 3204 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3205 (unsigned) orig_size, (unsigned) start); 3206 } 3207 3208 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3209 { 3210 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3211 3212 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3213 atomic_inc(&sbi->s_bal_reqs); 3214 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3215 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3216 atomic_inc(&sbi->s_bal_success); 3217 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3218 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3219 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3220 atomic_inc(&sbi->s_bal_goals); 3221 if (ac->ac_found > sbi->s_mb_max_to_scan) 3222 atomic_inc(&sbi->s_bal_breaks); 3223 } 3224 3225 ext4_mb_store_history(ac); 3226 } 3227 3228 /* 3229 * use blocks preallocated to inode 3230 */ 3231 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3232 struct ext4_prealloc_space *pa) 3233 { 3234 ext4_fsblk_t start; 3235 ext4_fsblk_t end; 3236 int len; 3237 3238 /* found preallocated blocks, use them */ 3239 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3240 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3241 len = end - start; 3242 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3243 &ac->ac_b_ex.fe_start); 3244 ac->ac_b_ex.fe_len = len; 3245 ac->ac_status = AC_STATUS_FOUND; 3246 ac->ac_pa = pa; 3247 3248 BUG_ON(start < pa->pa_pstart); 3249 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3250 BUG_ON(pa->pa_free < len); 3251 pa->pa_free -= len; 3252 3253 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3254 } 3255 3256 /* 3257 * use blocks preallocated to locality group 3258 */ 3259 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3260 struct ext4_prealloc_space *pa) 3261 { 3262 unsigned int len = ac->ac_o_ex.fe_len; 3263 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3264 &ac->ac_b_ex.fe_group, 3265 &ac->ac_b_ex.fe_start); 3266 ac->ac_b_ex.fe_len = len; 3267 ac->ac_status = AC_STATUS_FOUND; 3268 ac->ac_pa = pa; 3269 3270 /* we don't correct pa_pstart or pa_plen here to avoid 3271 * possible race when the group is being loaded concurrently 3272 * instead we correct pa later, after blocks are marked 3273 * in on-disk bitmap -- see ext4_mb_release_context() 3274 * Other CPUs are prevented from allocating from this pa by lg_mutex 3275 */ 3276 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3277 } 3278 3279 /* 3280 * search goal blocks in preallocated space 3281 */ 3282 static noinline_for_stack int 3283 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3284 { 3285 struct ext4_inode_info *ei = 
EXT4_I(ac->ac_inode); 3286 struct ext4_locality_group *lg; 3287 struct ext4_prealloc_space *pa; 3288 3289 /* only data can be preallocated */ 3290 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3291 return 0; 3292 3293 /* first, try per-file preallocation */ 3294 rcu_read_lock(); 3295 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3296 3297 /* all fields in this condition don't change, 3298 * so we can skip locking for them */ 3299 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3300 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3301 continue; 3302 3303 /* found preallocated blocks, use them */ 3304 spin_lock(&pa->pa_lock); 3305 if (pa->pa_deleted == 0 && pa->pa_free) { 3306 atomic_inc(&pa->pa_count); 3307 ext4_mb_use_inode_pa(ac, pa); 3308 spin_unlock(&pa->pa_lock); 3309 ac->ac_criteria = 10; 3310 rcu_read_unlock(); 3311 return 1; 3312 } 3313 spin_unlock(&pa->pa_lock); 3314 } 3315 rcu_read_unlock(); 3316 3317 /* can we use group allocation? */ 3318 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3319 return 0; 3320 3321 /* inode may have no locality group for some reason */ 3322 lg = ac->ac_lg; 3323 if (lg == NULL) 3324 return 0; 3325 3326 rcu_read_lock(); 3327 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { 3328 spin_lock(&pa->pa_lock); 3329 if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { 3330 atomic_inc(&pa->pa_count); 3331 ext4_mb_use_group_pa(ac, pa); 3332 spin_unlock(&pa->pa_lock); 3333 ac->ac_criteria = 20; 3334 rcu_read_unlock(); 3335 return 1; 3336 } 3337 spin_unlock(&pa->pa_lock); 3338 } 3339 rcu_read_unlock(); 3340 3341 return 0; 3342 } 3343 3344 /* 3345 * the function goes through all preallocation in this group and marks them 3346 * used in in-core bitmap. buddy must be generated from this bitmap 3347 * Need to be called with ext4 group lock (ext4_lock_group) 3348 */ 3349 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3350 ext4_group_t group) 3351 { 3352 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3353 struct ext4_prealloc_space *pa; 3354 struct list_head *cur; 3355 ext4_group_t groupnr; 3356 ext4_grpblk_t start; 3357 int preallocated = 0; 3358 int count = 0; 3359 int len; 3360 3361 /* all form of preallocation discards first load group, 3362 * so the only competing code is preallocation use. 
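 * (illustrative effect: a PA of 64 blocks at group offset 1000 causes bits
 * 1000..1063 to be set in the in-core bitmap passed in, even though those
 * blocks are still free in the on-disk bitmap)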
3363 * we don't need any locking here 3364 * notice we do NOT ignore preallocations with pa_deleted 3365 * otherwise we could leave used blocks available for 3366 * allocation in buddy when concurrent ext4_mb_put_pa() 3367 * is dropping preallocation 3368 */ 3369 list_for_each(cur, &grp->bb_prealloc_list) { 3370 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3371 spin_lock(&pa->pa_lock); 3372 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3373 &groupnr, &start); 3374 len = pa->pa_len; 3375 spin_unlock(&pa->pa_lock); 3376 if (unlikely(len == 0)) 3377 continue; 3378 BUG_ON(groupnr != group); 3379 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), 3380 bitmap, start, len); 3381 preallocated += len; 3382 count++; 3383 } 3384 mb_debug("prellocated %u for group %lu\n", preallocated, group); 3385 } 3386 3387 static void ext4_mb_pa_callback(struct rcu_head *head) 3388 { 3389 struct ext4_prealloc_space *pa; 3390 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3391 kmem_cache_free(ext4_pspace_cachep, pa); 3392 } 3393 3394 /* 3395 * drops a reference to preallocated space descriptor 3396 * if this was the last reference and the space is consumed 3397 */ 3398 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3399 struct super_block *sb, struct ext4_prealloc_space *pa) 3400 { 3401 unsigned long grp; 3402 3403 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3404 return; 3405 3406 /* in this short window concurrent discard can set pa_deleted */ 3407 spin_lock(&pa->pa_lock); 3408 if (pa->pa_deleted == 1) { 3409 spin_unlock(&pa->pa_lock); 3410 return; 3411 } 3412 3413 pa->pa_deleted = 1; 3414 spin_unlock(&pa->pa_lock); 3415 3416 /* -1 is to protect from crossing allocation group */ 3417 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); 3418 3419 /* 3420 * possible race: 3421 * 3422 * P1 (buddy init) P2 (regular allocation) 3423 * find block B in PA 3424 * copy on-disk bitmap to buddy 3425 * mark B in on-disk bitmap 3426 * drop PA from group 3427 * mark all PAs in buddy 3428 * 3429 * thus, P1 initializes buddy with B available. to prevent this 3430 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3431 * against that pair 3432 */ 3433 ext4_lock_group(sb, grp); 3434 list_del(&pa->pa_group_list); 3435 ext4_unlock_group(sb, grp); 3436 3437 spin_lock(pa->pa_obj_lock); 3438 list_del_rcu(&pa->pa_inode_list); 3439 spin_unlock(pa->pa_obj_lock); 3440 3441 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3442 } 3443 3444 /* 3445 * creates new preallocated space for given inode 3446 */ 3447 static noinline_for_stack int 3448 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3449 { 3450 struct super_block *sb = ac->ac_sb; 3451 struct ext4_prealloc_space *pa; 3452 struct ext4_group_info *grp; 3453 struct ext4_inode_info *ei; 3454 3455 /* preallocate only when found space is larger then requested */ 3456 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3457 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3458 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3459 3460 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3461 if (pa == NULL) 3462 return -ENOMEM; 3463 3464 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3465 int winl; 3466 int wins; 3467 int win; 3468 int offs; 3469 3470 /* we can't allocate as much as normalizer wants. 
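 * (worked example, purely illustrative: the original request was 1 block
 * at logical 13, the normalized goal was 16 blocks at logical 0, but only
 * 8 blocks were found; then winl = 13, wins = 7, offs = 13 % 8 = 5, so
 * win = 5 and fe_logical becomes 13 - 5 = 8, and the preallocation still
 * covers logical block 13)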
3471 * so, found space must get proper lstart 3472 * to cover original request */ 3473 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3474 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3475 3476 /* we're limited by original request in that 3477 * logical block must be covered any way 3478 * winl is window we can move our chunk within */ 3479 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3480 3481 /* also, we should cover whole original request */ 3482 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3483 3484 /* the smallest one defines real window */ 3485 win = min(winl, wins); 3486 3487 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3488 if (offs && offs < win) 3489 win = offs; 3490 3491 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3492 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3493 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3494 } 3495 3496 /* preallocation can change ac_b_ex, thus we store actually 3497 * allocated blocks for history */ 3498 ac->ac_f_ex = ac->ac_b_ex; 3499 3500 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3501 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3502 pa->pa_len = ac->ac_b_ex.fe_len; 3503 pa->pa_free = pa->pa_len; 3504 atomic_set(&pa->pa_count, 1); 3505 spin_lock_init(&pa->pa_lock); 3506 pa->pa_deleted = 0; 3507 pa->pa_linear = 0; 3508 3509 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3510 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3511 3512 ext4_mb_use_inode_pa(ac, pa); 3513 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3514 3515 ei = EXT4_I(ac->ac_inode); 3516 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3517 3518 pa->pa_obj_lock = &ei->i_prealloc_lock; 3519 pa->pa_inode = ac->ac_inode; 3520 3521 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3522 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3523 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3524 3525 spin_lock(pa->pa_obj_lock); 3526 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3527 spin_unlock(pa->pa_obj_lock); 3528 3529 return 0; 3530 } 3531 3532 /* 3533 * creates new preallocated space for locality group inodes belongs to 3534 */ 3535 static noinline_for_stack int 3536 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3537 { 3538 struct super_block *sb = ac->ac_sb; 3539 struct ext4_locality_group *lg; 3540 struct ext4_prealloc_space *pa; 3541 struct ext4_group_info *grp; 3542 3543 /* preallocate only when found space is larger then requested */ 3544 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3545 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3546 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3547 3548 BUG_ON(ext4_pspace_cachep == NULL); 3549 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3550 if (pa == NULL) 3551 return -ENOMEM; 3552 3553 /* preallocation can change ac_b_ex, thus we store actually 3554 * allocated blocks for history */ 3555 ac->ac_f_ex = ac->ac_b_ex; 3556 3557 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3558 pa->pa_lstart = pa->pa_pstart; 3559 pa->pa_len = ac->ac_b_ex.fe_len; 3560 pa->pa_free = pa->pa_len; 3561 atomic_set(&pa->pa_count, 1); 3562 spin_lock_init(&pa->pa_lock); 3563 pa->pa_deleted = 0; 3564 pa->pa_linear = 1; 3565 3566 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3567 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3568 3569 ext4_mb_use_group_pa(ac, pa); 3570 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3571 3572 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3573 lg = ac->ac_lg; 3574 BUG_ON(lg == NULL); 3575 3576 pa->pa_obj_lock 
= &lg->lg_prealloc_lock; 3577 pa->pa_inode = NULL; 3578 3579 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3580 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3581 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3582 3583 spin_lock(pa->pa_obj_lock); 3584 list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); 3585 spin_unlock(pa->pa_obj_lock); 3586 3587 return 0; 3588 } 3589 3590 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3591 { 3592 int err; 3593 3594 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3595 err = ext4_mb_new_group_pa(ac); 3596 else 3597 err = ext4_mb_new_inode_pa(ac); 3598 return err; 3599 } 3600 3601 /* 3602 * finds all unused blocks in on-disk bitmap, frees them in 3603 * in-core bitmap and buddy. 3604 * @pa must be unlinked from inode and group lists, so that 3605 * nobody else can find/use it. 3606 * the caller MUST hold group/inode locks. 3607 * TODO: optimize the case when there are no in-core structures yet 3608 */ 3609 static noinline_for_stack int 3610 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3611 struct ext4_prealloc_space *pa, 3612 struct ext4_allocation_context *ac) 3613 { 3614 struct super_block *sb = e4b->bd_sb; 3615 struct ext4_sb_info *sbi = EXT4_SB(sb); 3616 unsigned long end; 3617 unsigned long next; 3618 ext4_group_t group; 3619 ext4_grpblk_t bit; 3620 sector_t start; 3621 int err = 0; 3622 int free = 0; 3623 3624 BUG_ON(pa->pa_deleted == 0); 3625 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3626 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3627 end = bit + pa->pa_len; 3628 3629 if (ac) { 3630 ac->ac_sb = sb; 3631 ac->ac_inode = pa->pa_inode; 3632 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3633 } 3634 3635 while (bit < end) { 3636 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3637 if (bit >= end) 3638 break; 3639 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3640 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3641 le32_to_cpu(sbi->s_es->s_first_data_block); 3642 mb_debug(" free preallocated %u/%u in group %u\n", 3643 (unsigned) start, (unsigned) next - bit, 3644 (unsigned) group); 3645 free += next - bit; 3646 3647 if (ac) { 3648 ac->ac_b_ex.fe_group = group; 3649 ac->ac_b_ex.fe_start = bit; 3650 ac->ac_b_ex.fe_len = next - bit; 3651 ac->ac_b_ex.fe_logical = 0; 3652 ext4_mb_store_history(ac); 3653 } 3654 3655 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3656 bit = next + 1; 3657 } 3658 if (free != pa->pa_free) { 3659 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3660 pa, (unsigned long) pa->pa_lstart, 3661 (unsigned long) pa->pa_pstart, 3662 (unsigned long) pa->pa_len); 3663 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3664 free, pa->pa_free); 3665 /* 3666 * pa is already deleted so we use the value obtained 3667 * from the bitmap and continue. 
3668 */ 3669 } 3670 atomic_add(free, &sbi->s_mb_discarded); 3671 3672 return err; 3673 } 3674 3675 static noinline_for_stack int 3676 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3677 struct ext4_prealloc_space *pa, 3678 struct ext4_allocation_context *ac) 3679 { 3680 struct super_block *sb = e4b->bd_sb; 3681 ext4_group_t group; 3682 ext4_grpblk_t bit; 3683 3684 if (ac) 3685 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3686 3687 BUG_ON(pa->pa_deleted == 0); 3688 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3689 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3690 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3691 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3692 3693 if (ac) { 3694 ac->ac_sb = sb; 3695 ac->ac_inode = NULL; 3696 ac->ac_b_ex.fe_group = group; 3697 ac->ac_b_ex.fe_start = bit; 3698 ac->ac_b_ex.fe_len = pa->pa_len; 3699 ac->ac_b_ex.fe_logical = 0; 3700 ext4_mb_store_history(ac); 3701 } 3702 3703 return 0; 3704 } 3705 3706 /* 3707 * releases all preallocations in given group 3708 * 3709 * first, we need to decide discard policy: 3710 * - when do we discard 3711 * 1) ENOSPC 3712 * - how many do we discard 3713 * 1) how many requested 3714 */ 3715 static noinline_for_stack int 3716 ext4_mb_discard_group_preallocations(struct super_block *sb, 3717 ext4_group_t group, int needed) 3718 { 3719 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3720 struct buffer_head *bitmap_bh = NULL; 3721 struct ext4_prealloc_space *pa, *tmp; 3722 struct ext4_allocation_context *ac; 3723 struct list_head list; 3724 struct ext4_buddy e4b; 3725 int err; 3726 int busy = 0; 3727 int free = 0; 3728 3729 mb_debug("discard preallocation for group %lu\n", group); 3730 3731 if (list_empty(&grp->bb_prealloc_list)) 3732 return 0; 3733 3734 bitmap_bh = ext4_read_block_bitmap(sb, group); 3735 if (bitmap_bh == NULL) { 3736 /* error handling here */ 3737 ext4_mb_release_desc(&e4b); 3738 BUG_ON(bitmap_bh == NULL); 3739 } 3740 3741 err = ext4_mb_load_buddy(sb, group, &e4b); 3742 BUG_ON(err != 0); /* error handling here */ 3743 3744 if (needed == 0) 3745 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3746 3747 grp = ext4_get_group_info(sb, group); 3748 INIT_LIST_HEAD(&list); 3749 3750 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3751 repeat: 3752 ext4_lock_group(sb, group); 3753 list_for_each_entry_safe(pa, tmp, 3754 &grp->bb_prealloc_list, pa_group_list) { 3755 spin_lock(&pa->pa_lock); 3756 if (atomic_read(&pa->pa_count)) { 3757 spin_unlock(&pa->pa_lock); 3758 busy = 1; 3759 continue; 3760 } 3761 if (pa->pa_deleted) { 3762 spin_unlock(&pa->pa_lock); 3763 continue; 3764 } 3765 3766 /* seems this one can be freed ... */ 3767 pa->pa_deleted = 1; 3768 3769 /* we can trust pa_free ... */ 3770 free += pa->pa_free; 3771 3772 spin_unlock(&pa->pa_lock); 3773 3774 list_del(&pa->pa_group_list); 3775 list_add(&pa->u.pa_tmp_list, &list); 3776 } 3777 3778 /* if we still need more blocks and some PAs were used, try again */ 3779 if (free < needed && busy) { 3780 busy = 0; 3781 ext4_unlock_group(sb, group); 3782 /* 3783 * Yield the CPU here so that we don't get soft lockup 3784 * in non preempt case. 3785 */ 3786 yield(); 3787 goto repeat; 3788 } 3789 3790 /* found anything to free? 
*/ 3791 if (list_empty(&list)) { 3792 BUG_ON(free != 0); 3793 goto out; 3794 } 3795 3796 /* now free all selected PAs */ 3797 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3798 3799 /* remove from object (inode or locality group) */ 3800 spin_lock(pa->pa_obj_lock); 3801 list_del_rcu(&pa->pa_inode_list); 3802 spin_unlock(pa->pa_obj_lock); 3803 3804 if (pa->pa_linear) 3805 ext4_mb_release_group_pa(&e4b, pa, ac); 3806 else 3807 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3808 3809 list_del(&pa->u.pa_tmp_list); 3810 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3811 } 3812 3813 out: 3814 ext4_unlock_group(sb, group); 3815 if (ac) 3816 kmem_cache_free(ext4_ac_cachep, ac); 3817 ext4_mb_release_desc(&e4b); 3818 put_bh(bitmap_bh); 3819 return free; 3820 } 3821 3822 /*
3823 * releases all unused preallocated blocks for the given inode 3824 * 3825 * It's important to discard preallocations under i_data_sem. 3826 * We don't want another block to be served from the prealloc 3827 * space while we are discarding the inode prealloc space. 3828 * 3829 * FIXME!! Make sure it is valid at all the call sites 3830 */ 3831 void ext4_mb_discard_inode_preallocations(struct inode *inode) 3832 { 3833 struct ext4_inode_info *ei = EXT4_I(inode); 3834 struct super_block *sb = inode->i_sb; 3835 struct buffer_head *bitmap_bh = NULL; 3836 struct ext4_prealloc_space *pa, *tmp; 3837 struct ext4_allocation_context *ac; 3838 ext4_group_t group = 0; 3839 struct list_head list; 3840 struct ext4_buddy e4b; 3841 int err; 3842 3843 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3844 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3845 return; 3846 } 3847 3848 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3849 3850 INIT_LIST_HEAD(&list); 3851 3852 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3853 repeat: 3854 /* first, collect all pa's in the inode */ 3855 spin_lock(&ei->i_prealloc_lock); 3856 while (!list_empty(&ei->i_prealloc_list)) { 3857 pa = list_entry(ei->i_prealloc_list.next, 3858 struct ext4_prealloc_space, pa_inode_list); 3859 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 3860 spin_lock(&pa->pa_lock); 3861 if (atomic_read(&pa->pa_count)) { 3862 /* this shouldn't happen often - nobody should 3863 * use preallocation while we're discarding it */ 3864 spin_unlock(&pa->pa_lock); 3865 spin_unlock(&ei->i_prealloc_lock); 3866 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 3867 WARN_ON(1); 3868 schedule_timeout_uninterruptible(HZ); 3869 goto repeat; 3870 3871 } 3872 if (pa->pa_deleted == 0) { 3873 pa->pa_deleted = 1; 3874 spin_unlock(&pa->pa_lock); 3875 list_del_rcu(&pa->pa_inode_list); 3876 list_add(&pa->u.pa_tmp_list, &list); 3877 continue; 3878 } 3879 3880 /* someone is deleting pa right now */ 3881 spin_unlock(&pa->pa_lock); 3882 spin_unlock(&ei->i_prealloc_lock); 3883 3884 /* we have to wait here because pa_deleted 3885 * does not mean the pa is already unlinked from 3886 * the list;
since we might have been called from 3887 * ->clear_inode(), the inode could get freed while a 3888 * concurrent thread that is still unlinking the pa 3889 * from the inode's list accesses it, i.e. it would 3890 * touch already-freed memory */ 3891 3892 /* XXX: if this happens too often, we can 3893 * add a flag to force wait only in case 3894 * of ->clear_inode(), but not in case of 3895 * regular truncate */ 3896 schedule_timeout_uninterruptible(HZ); 3897 goto repeat; 3898 } 3899 spin_unlock(&ei->i_prealloc_lock); 3900 3901 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3902 BUG_ON(pa->pa_linear != 0); 3903 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 3904 3905 err = ext4_mb_load_buddy(sb, group, &e4b); 3906 BUG_ON(err != 0); /* error handling here */ 3907 3908 bitmap_bh = ext4_read_block_bitmap(sb, group); 3909 if (bitmap_bh == NULL) { 3910 /* error handling here */ 3911 ext4_mb_release_desc(&e4b); 3912 BUG_ON(bitmap_bh == NULL); 3913 } 3914 3915 ext4_lock_group(sb, group); 3916 list_del(&pa->pa_group_list); 3917 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3918 ext4_unlock_group(sb, group); 3919 3920 ext4_mb_release_desc(&e4b); 3921 put_bh(bitmap_bh); 3922 3923 list_del(&pa->u.pa_tmp_list); 3924 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3925 } 3926 if (ac) 3927 kmem_cache_free(ext4_ac_cachep, ac); 3928 } 3929 3930 /*
3931 * finds all preallocated spaces and returns the blocks being freed to them; 3932 * if a preallocated space becomes full again (no block from it is in use) 3933 * then the function frees that space in the buddy 3934 * XXX: at the moment, truncate (which is the only way to free blocks) 3935 * discards all preallocations 3936 */ 3937 static void ext4_mb_return_to_preallocation(struct inode *inode, 3938 struct ext4_buddy *e4b, 3939 sector_t block, int count) 3940 { 3941 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 3942 } 3943 #ifdef MB_DEBUG 3944 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3945 { 3946 struct super_block *sb = ac->ac_sb; 3947 ext4_group_t i; 3948 3949 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3950 " Allocation context details:\n"); 3951 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3952 ac->ac_status, ac->ac_flags); 3953 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 3954 "best %lu/%lu/%lu@%lu cr %d\n", 3955 (unsigned long)ac->ac_o_ex.fe_group, 3956 (unsigned long)ac->ac_o_ex.fe_start, 3957 (unsigned long)ac->ac_o_ex.fe_len, 3958 (unsigned long)ac->ac_o_ex.fe_logical, 3959 (unsigned long)ac->ac_g_ex.fe_group, 3960 (unsigned long)ac->ac_g_ex.fe_start, 3961 (unsigned long)ac->ac_g_ex.fe_len, 3962 (unsigned long)ac->ac_g_ex.fe_logical, 3963 (unsigned long)ac->ac_b_ex.fe_group, 3964 (unsigned long)ac->ac_b_ex.fe_start, 3965 (unsigned long)ac->ac_b_ex.fe_len, 3966 (unsigned long)ac->ac_b_ex.fe_logical, 3967 (int)ac->ac_criteria); 3968 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 3969 ac->ac_found); 3970 printk(KERN_ERR "EXT4-fs: groups: \n"); 3971 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 3972 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3973 struct ext4_prealloc_space *pa; 3974 ext4_grpblk_t start; 3975 struct list_head *cur; 3976 ext4_lock_group(sb, i); 3977 list_for_each(cur, &grp->bb_prealloc_list) { 3978 pa = list_entry(cur, struct ext4_prealloc_space, 3979 pa_group_list); 3980 spin_lock(&pa->pa_lock); 3981 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3982 NULL, &start); 3983 spin_unlock(&pa->pa_lock); 3984 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 3985 start,
pa->pa_len); 3986 } 3987 ext4_unlock_group(sb, i); 3988 3989 if (grp->bb_free == 0) 3990 continue; 3991 printk(KERN_ERR "%lu: %d/%d \n", 3992 i, grp->bb_free, grp->bb_fragments); 3993 } 3994 printk(KERN_ERR "\n"); 3995 } 3996 #else 3997 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3998 { 3999 return; 4000 } 4001 #endif 4002 4003 /*
4004 * We use locality group preallocation for small files. The size of the 4005 * file is taken to be the current size or the resulting size after 4006 * allocation, whichever is larger. 4007 * 4008 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4009 */ 4010 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4011 { 4012 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4013 int bsbits = ac->ac_sb->s_blocksize_bits; 4014 loff_t size, isize; 4015 4016 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4017 return; 4018 4019 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4020 isize = i_size_read(ac->ac_inode) >> bsbits; 4021 size = max(size, isize); 4022 4023 /* don't use group allocation for large files */ 4024 if (size >= sbi->s_mb_stream_request) 4025 return; 4026 4027 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4028 return; 4029 4030 BUG_ON(ac->ac_lg != NULL); 4031 /* 4032 * locality group prealloc space is per-CPU. The reason for having 4033 * a per-CPU locality group is to reduce the contention between block 4034 * requests coming from multiple CPUs. 4035 */ 4036 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4037 put_cpu(); 4038 4039 /* we're going to use group allocation */ 4040 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4041 4042 /* serialize all allocations in the group */ 4043 mutex_lock(&ac->ac_lg->lg_mutex); 4044 } 4045
4046 static noinline_for_stack int 4047 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 4048 struct ext4_allocation_request *ar) 4049 { 4050 struct super_block *sb = ar->inode->i_sb; 4051 struct ext4_sb_info *sbi = EXT4_SB(sb); 4052 struct ext4_super_block *es = sbi->s_es; 4053 ext4_group_t group; 4054 unsigned long len; 4055 unsigned long goal; 4056 ext4_grpblk_t block; 4057 4058 /* we can't allocate > group size */ 4059 len = ar->len; 4060 4061 /* just a dirty hack to filter too big requests */ 4062 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4063 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4064 4065 /* start searching from the goal */ 4066 goal = ar->goal; 4067 if (goal < le32_to_cpu(es->s_first_data_block) || 4068 goal >= ext4_blocks_count(es)) 4069 goal = le32_to_cpu(es->s_first_data_block); 4070 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4071 4072 /* set up allocation goals */ 4073 ac->ac_b_ex.fe_logical = ar->logical; 4074 ac->ac_b_ex.fe_group = 0; 4075 ac->ac_b_ex.fe_start = 0; 4076 ac->ac_b_ex.fe_len = 0; 4077 ac->ac_status = AC_STATUS_CONTINUE; 4078 ac->ac_groups_scanned = 0; 4079 ac->ac_ex_scanned = 0; 4080 ac->ac_found = 0; 4081 ac->ac_sb = sb; 4082 ac->ac_inode = ar->inode; 4083 ac->ac_o_ex.fe_logical = ar->logical; 4084 ac->ac_o_ex.fe_group = group; 4085 ac->ac_o_ex.fe_start = block; 4086 ac->ac_o_ex.fe_len = len; 4087 ac->ac_g_ex.fe_logical = ar->logical; 4088 ac->ac_g_ex.fe_group = group; 4089 ac->ac_g_ex.fe_start = block; 4090 ac->ac_g_ex.fe_len = len; 4091 ac->ac_f_ex.fe_len = 0; 4092 ac->ac_flags = ar->flags; 4093 ac->ac_2order = 0; 4094 ac->ac_criteria = 0; 4095 ac->ac_pa = NULL; 4096 ac->ac_bitmap_page = NULL; 4097 ac->ac_buddy_page = NULL; 4098 ac->ac_lg = NULL; 4099 4100 /* we have to define the context: whether we will work with a file or
4101 * locality group. this is a policy, actually */ 4102 ext4_mb_group_or_file(ac); 4103 4104 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4105 "left: %u/%u, right %u/%u to %swritable\n", 4106 (unsigned) ar->len, (unsigned) ar->logical, 4107 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4108 (unsigned) ar->lleft, (unsigned) ar->pleft, 4109 (unsigned) ar->lright, (unsigned) ar->pright, 4110 atomic_read(&ar->inode->i_writecount) ? "" : "non-"); 4111 return 0; 4112 4113 } 4114 4115 /* 4116 * release all resource we used in allocation 4117 */ 4118 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4119 { 4120 if (ac->ac_pa) { 4121 if (ac->ac_pa->pa_linear) { 4122 /* see comment in ext4_mb_use_group_pa() */ 4123 spin_lock(&ac->ac_pa->pa_lock); 4124 ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; 4125 ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; 4126 ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; 4127 ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; 4128 spin_unlock(&ac->ac_pa->pa_lock); 4129 } 4130 ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); 4131 } 4132 if (ac->ac_bitmap_page) 4133 page_cache_release(ac->ac_bitmap_page); 4134 if (ac->ac_buddy_page) 4135 page_cache_release(ac->ac_buddy_page); 4136 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4137 mutex_unlock(&ac->ac_lg->lg_mutex); 4138 ext4_mb_collect_stats(ac); 4139 return 0; 4140 } 4141 4142 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4143 { 4144 ext4_group_t i; 4145 int ret; 4146 int freed = 0; 4147 4148 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4149 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4150 freed += ret; 4151 needed -= ret; 4152 } 4153 4154 return freed; 4155 } 4156 4157 /* 4158 * Main entry point into mballoc to allocate blocks 4159 * it tries to use preallocation first, then falls back 4160 * to usual allocation 4161 */ 4162 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4163 struct ext4_allocation_request *ar, int *errp) 4164 { 4165 struct ext4_allocation_context *ac = NULL; 4166 struct ext4_sb_info *sbi; 4167 struct super_block *sb; 4168 ext4_fsblk_t block = 0; 4169 int freed; 4170 int inquota; 4171 4172 sb = ar->inode->i_sb; 4173 sbi = EXT4_SB(sb); 4174 4175 if (!test_opt(sb, MBALLOC)) { 4176 block = ext4_old_new_blocks(handle, ar->inode, ar->goal, 4177 &(ar->len), errp); 4178 return block; 4179 } 4180 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4181 /* 4182 * With delalloc we already reserved the blocks 4183 */ 4184 ar->len = ext4_has_free_blocks(sbi, ar->len); 4185 } 4186 4187 if (ar->len == 0) { 4188 *errp = -ENOSPC; 4189 return 0; 4190 } 4191 4192 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4193 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4194 ar->len--; 4195 } 4196 if (ar->len == 0) { 4197 *errp = -EDQUOT; 4198 return 0; 4199 } 4200 inquota = ar->len; 4201 4202 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4203 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4204 4205 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4206 if (!ac) { 4207 ar->len = 0; 4208 *errp = -ENOMEM; 4209 goto out1; 4210 } 4211 4212 ext4_mb_poll_new_transaction(sb, handle); 4213 4214 *errp = ext4_mb_initialize_context(ac, ar); 4215 if (*errp) { 4216 ar->len = 0; 4217 goto out2; 4218 } 4219 4220 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4221 if (!ext4_mb_use_preallocated(ac)) { 4222 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4223 ext4_mb_normalize_request(ac, ar); 4224 repeat: 4225 /* allocate space in core */ 4226 ext4_mb_regular_allocator(ac); 4227 4228 /* 
as we've just preallocated more space than 4229 * the user originally requested, we store the allocated 4230 * space in a special descriptor */ 4231 if (ac->ac_status == AC_STATUS_FOUND && 4232 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4233 ext4_mb_new_preallocation(ac); 4234 } 4235 4236 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4237 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4238 if (*errp == -EAGAIN) { 4239 ac->ac_b_ex.fe_group = 0; 4240 ac->ac_b_ex.fe_start = 0; 4241 ac->ac_b_ex.fe_len = 0; 4242 ac->ac_status = AC_STATUS_CONTINUE; 4243 goto repeat; 4244 } else if (*errp) { 4245 ac->ac_b_ex.fe_len = 0; 4246 ar->len = 0; 4247 ext4_mb_show_ac(ac); 4248 } else { 4249 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4250 ar->len = ac->ac_b_ex.fe_len; 4251 } 4252 } else { 4253 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 4254 if (freed) 4255 goto repeat; 4256 *errp = -ENOSPC; 4257 ac->ac_b_ex.fe_len = 0; 4258 ar->len = 0; 4259 ext4_mb_show_ac(ac); 4260 } 4261 4262 ext4_mb_release_context(ac); 4263 4264 out2: 4265 kmem_cache_free(ext4_ac_cachep, ac); 4266 out1: 4267 if (ar->len < inquota) 4268 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4269 4270 return block; 4271 } 4272 static void ext4_mb_poll_new_transaction(struct super_block *sb, 4273 handle_t *handle) 4274 { 4275 struct ext4_sb_info *sbi = EXT4_SB(sb); 4276 4277 if (sbi->s_last_transaction == handle->h_transaction->t_tid) 4278 return; 4279
4280 /* a new transaction has started: time to close the last one and free 4281 * the blocks of the committed transaction. we know that only one 4282 * transaction can be active at a time, so the previous transaction may 4283 * still be being logged, while the transaction before the previous one 4284 * is known to be already logged. this means that we may now free blocks 4285 * that were freed in all transactions before the previous one.
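 *
 * In other words: when transaction N+1 is first seen here, blocks freed
 * while N was running move from s_active_transaction to
 * s_closed_transaction, blocks freed while N-1 was running move from
 * s_closed_transaction to s_committed_transaction, and it is the latter
 * that ext4_mb_free_committed_blocks() below actually releases, because
 * N-1 is known to be committed by now.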
*/ 4286 4287 spin_lock(&sbi->s_md_lock); 4288 if (sbi->s_last_transaction != handle->h_transaction->t_tid) { 4289 mb_debug("new transaction %lu, old %lu\n", 4290 (unsigned long) handle->h_transaction->t_tid, 4291 (unsigned long) sbi->s_last_transaction); 4292 list_splice_init(&sbi->s_closed_transaction, 4293 &sbi->s_committed_transaction); 4294 list_splice_init(&sbi->s_active_transaction, 4295 &sbi->s_closed_transaction); 4296 sbi->s_last_transaction = handle->h_transaction->t_tid; 4297 } 4298 spin_unlock(&sbi->s_md_lock); 4299 4300 ext4_mb_free_committed_blocks(sb); 4301 } 4302 4303 static noinline_for_stack int 4304 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4305 ext4_group_t group, ext4_grpblk_t block, int count) 4306 { 4307 struct ext4_group_info *db = e4b->bd_info; 4308 struct super_block *sb = e4b->bd_sb; 4309 struct ext4_sb_info *sbi = EXT4_SB(sb); 4310 struct ext4_free_metadata *md; 4311 int i; 4312 4313 BUG_ON(e4b->bd_bitmap_page == NULL); 4314 BUG_ON(e4b->bd_buddy_page == NULL); 4315 4316 ext4_lock_group(sb, group); 4317 for (i = 0; i < count; i++) { 4318 md = db->bb_md_cur; 4319 if (md && db->bb_tid != handle->h_transaction->t_tid) { 4320 db->bb_md_cur = NULL; 4321 md = NULL; 4322 } 4323 4324 if (md == NULL) { 4325 ext4_unlock_group(sb, group); 4326 md = kmalloc(sizeof(*md), GFP_NOFS); 4327 if (md == NULL) 4328 return -ENOMEM; 4329 md->num = 0; 4330 md->group = group; 4331 4332 ext4_lock_group(sb, group); 4333 if (db->bb_md_cur == NULL) { 4334 spin_lock(&sbi->s_md_lock); 4335 list_add(&md->list, &sbi->s_active_transaction); 4336 spin_unlock(&sbi->s_md_lock); 4337 /* protect buddy cache from being freed, 4338 * otherwise we'll refresh it from 4339 * on-disk bitmap and lose not-yet-available 4340 * blocks */ 4341 page_cache_get(e4b->bd_buddy_page); 4342 page_cache_get(e4b->bd_bitmap_page); 4343 db->bb_md_cur = md; 4344 db->bb_tid = handle->h_transaction->t_tid; 4345 mb_debug("new md 0x%p for group %lu\n", 4346 md, md->group); 4347 } else { 4348 kfree(md); 4349 md = db->bb_md_cur; 4350 } 4351 } 4352 4353 BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); 4354 md->blocks[md->num] = block + i; 4355 md->num++; 4356 if (md->num == EXT4_BB_MAX_BLOCKS) { 4357 /* no more space, put full container on a sb's list */ 4358 db->bb_md_cur = NULL; 4359 } 4360 } 4361 ext4_unlock_group(sb, group); 4362 return 0; 4363 } 4364 4365 /* 4366 * Main entry point into mballoc to free blocks 4367 */ 4368 void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4369 unsigned long block, unsigned long count, 4370 int metadata, unsigned long *freed) 4371 { 4372 struct buffer_head *bitmap_bh = NULL; 4373 struct super_block *sb = inode->i_sb; 4374 struct ext4_allocation_context *ac = NULL; 4375 struct ext4_group_desc *gdp; 4376 struct ext4_super_block *es; 4377 unsigned long overflow; 4378 ext4_grpblk_t bit; 4379 struct buffer_head *gd_bh; 4380 ext4_group_t block_group; 4381 struct ext4_sb_info *sbi; 4382 struct ext4_buddy e4b; 4383 int err = 0; 4384 int ret; 4385 4386 *freed = 0; 4387 4388 ext4_mb_poll_new_transaction(sb, handle); 4389 4390 sbi = EXT4_SB(sb); 4391 es = EXT4_SB(sb)->s_es; 4392 if (block < le32_to_cpu(es->s_first_data_block) || 4393 block + count < block || 4394 block + count > ext4_blocks_count(es)) { 4395 ext4_error(sb, __func__, 4396 "Freeing blocks not in datazone - " 4397 "block = %lu, count = %lu", block, count); 4398 goto error_return; 4399 } 4400 4401 ext4_debug("freeing block %lu\n", block); 4402 4403 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4404 if (ac) { 4405 
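/* the context is only used to record this free in the mballoc
 * history; if the allocation above failed we simply skip that step */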
ac->ac_op = EXT4_MB_HISTORY_FREE; 4406 ac->ac_inode = inode; 4407 ac->ac_sb = sb; 4408 } 4409 4410 do_more: 4411 overflow = 0; 4412 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4413 4414 /* 4415 * Check to see if we are freeing blocks across a group 4416 * boundary. 4417 */ 4418 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4419 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4420 count -= overflow; 4421 } 4422 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4423 if (!bitmap_bh) 4424 goto error_return; 4425 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4426 if (!gdp) 4427 goto error_return; 4428 4429 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4430 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4431 in_range(block, ext4_inode_table(sb, gdp), 4432 EXT4_SB(sb)->s_itb_per_group) || 4433 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4434 EXT4_SB(sb)->s_itb_per_group)) { 4435 4436 ext4_error(sb, __func__, 4437 "Freeing blocks in system zone - " 4438 "Block = %lu, count = %lu", block, count); 4439 /* err = 0. ext4_std_error should be a no op */ 4440 goto error_return; 4441 } 4442 4443 BUFFER_TRACE(bitmap_bh, "getting write access"); 4444 err = ext4_journal_get_write_access(handle, bitmap_bh); 4445 if (err) 4446 goto error_return; 4447 4448 /* 4449 * We are about to modify some metadata. Call the journal APIs 4450 * to unshare ->b_data if a currently-committing transaction is 4451 * using it 4452 */ 4453 BUFFER_TRACE(gd_bh, "get_write_access"); 4454 err = ext4_journal_get_write_access(handle, gd_bh); 4455 if (err) 4456 goto error_return; 4457 4458 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4459 if (err) 4460 goto error_return; 4461 4462 #ifdef AGGRESSIVE_CHECK 4463 { 4464 int i; 4465 for (i = 0; i < count; i++) 4466 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4467 } 4468 #endif 4469 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, 4470 bit, count); 4471 4472 /* We dirtied the bitmap block */ 4473 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4474 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 4475 4476 if (ac) { 4477 ac->ac_b_ex.fe_group = block_group; 4478 ac->ac_b_ex.fe_start = bit; 4479 ac->ac_b_ex.fe_len = count; 4480 ext4_mb_store_history(ac); 4481 } 4482 4483 if (metadata) { 4484 /* blocks being freed are metadata. 
these blocks shouldn't 4485 * be used until this transaction is committed */ 4486 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4487 } else { 4488 ext4_lock_group(sb, block_group); 4489 mb_free_blocks(inode, &e4b, bit, count); 4490 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4491 ext4_unlock_group(sb, block_group); 4492 } 4493 4494 spin_lock(sb_bgl_lock(sbi, block_group)); 4495 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4496 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4497 spin_unlock(sb_bgl_lock(sbi, block_group)); 4498 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4499 4500 if (sbi->s_log_groups_per_flex) { 4501 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4502 spin_lock(sb_bgl_lock(sbi, flex_group)); 4503 sbi->s_flex_groups[flex_group].free_blocks += count; 4504 spin_unlock(sb_bgl_lock(sbi, flex_group)); 4505 } 4506 4507 ext4_mb_release_desc(&e4b); 4508 4509 *freed += count; 4510 4511 /* And the group descriptor block */ 4512 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4513 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4514 if (!err) 4515 err = ret; 4516 4517 if (overflow && !err) { 4518 block += count; 4519 count = overflow; 4520 put_bh(bitmap_bh); 4521 goto do_more; 4522 } 4523 sb->s_dirt = 1; 4524 error_return: 4525 brelse(bitmap_bh); 4526 ext4_std_error(sb, err); 4527 if (ac) 4528 kmem_cache_free(ext4_ac_cachep, ac); 4529 return; 4530 } 4531
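/*
 * Illustrative sketch (not taken from a real call site): how a caller is
 * expected to drive ext4_mb_new_blocks() above. Only fields that
 * ext4_mb_initialize_context() reads from the request are filled in;
 * 'iblock', 'goal_blk' and 'needed' are placeholder variables of the
 * caller.
 *
 *	struct ext4_allocation_request ar;
 *	ext4_fsblk_t newblock;
 *	int err;
 *
 *	ar.inode = inode;			(file asking for blocks)
 *	ar.logical = iblock;			(logical block in the file)
 *	ar.goal = goal_blk;			(preferred physical block)
 *	ar.lleft = ar.lright = 0;		(logical neighbours, if known)
 *	ar.pleft = ar.pright = 0;		(physical neighbours, if known)
 *	ar.len = needed;			(number of blocks wanted)
 *	ar.flags = EXT4_MB_HINT_DATA;		(file data, not metadata)
 *
 *	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 *	if (newblock == 0)
 *		the allocation failed and err holds the reason, e.g.
 *		-ENOSPC, -EDQUOT or -ENOMEM;
 *	else
 *		ar.len now holds how many blocks were actually allocated,
 *		starting at physical block 'newblock'.
 */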